Merge pull request #6119 from notoraptor/get-rid-of-get-op-params

Get rid of get_op_params()

Merge pull request #6119 from notoraptor/get-rid-of-get-op-params
f9c8d096 · Pascal Lamblin · GitHub · 2c1c8549 · b63f0dee · f9c8d096
--- a/doc/library/gpuarray/op.txt
+++ b/doc/library/gpuarray/op.txt
@@ -22,9 +22,6 @@ Blas Op
 .. automodule:: theano.gpuarray.blas
    :members:

-.. automodule:: theano.gpuarray.nerv
-    :members:
-
 Elemwise Op
 ===========


--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -1388,11 +1388,10 @@ class COp(Op):
                raise ValueError("No valid section marker was found in file "
                                 "%s" % func_files[i])

-    def get_op_params(self):
+    def __get_op_params(self):
        """
        Returns a list of (name, value) pairs that will be turned into
-        macros for use within the op code. This is intended to allow
-        an op's properties to influence the generated C code.
+        macros for use within the op code.

        The names must be strings that are not a C keyword and the
        values must be strings of literal C representations.
@@ -1412,6 +1411,10 @@ class COp(Op):
            params = [('PARAMS_TYPE', wrapper.name)]
            for i in range(wrapper.length):
                try:
+                    # NB (reminder): These macros are currently used only in ParamsType example test
+                    # (`theano/gof/tests/test_quadratic_function.c`), to demonstrate how we can
+                    # access params dtypes when dtypes may change (e.g. if based on theano.config.floatX).
+                    # But in practice, params types generally have fixed types per op.
                    params.append(('DTYPE_PARAM_' + wrapper.fields[i], wrapper.types[i].c_element_type()))
                except utils.MethodNotDefined:
                    pass
@@ -1506,7 +1509,7 @@ class COp(Op):
                                                "str##_%s" % name))
        undef_macros.append(undef_template % "APPLY_SPECIFIC")

-        for n, v in self.get_op_params():
+        for n, v in self.__get_op_params():
            define_macros.append(define_template % (n, v))
            undef_macros.append(undef_template % (n,))


--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                   GpuArraySharedVariable, gpuarray_shared_constructor,
                   reg_context, get_context, ContextNotDefined)
 from .basic_ops import as_gpuarray_variable
-from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction, rng_mrg
+from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg


 def transfer(x, target):

--- a/theano/gpuarray/blockgemv.c
+++ b/theano/gpuarray/blockgemv.c
@@ -4,19 +4,19 @@ int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
                              PyArrayObject *outputIdx,
                              PyGpuArrayObject **_out,
-                              PyGpuContextObject *ctx) {
+                              PARAMS_TYPE* params) {
  PyGpuArrayObject *out = *_out;
-#ifdef INPLACE
-  Py_XDECREF(out);
-  out = o;
-  Py_INCREF(out);
-#else
-  out = theano_try_copy(out, o);
-  if (out == NULL) {
-    // Error already set
-    return -1;
+  if (params->inplace) {
+    Py_XDECREF(out);
+    out = o;
+    Py_INCREF(out);
+  } else {
+    out = theano_try_copy(out, o);
+    if (out == NULL) {
+      // Error already set
+      return -1;
+    }
  }
-#endif

  gpudata **W_list = NULL;
  gpudata **inp_list = NULL;
@@ -26,7 +26,7 @@ int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
  size_t *offOut = NULL;
  int err;

-  err = gpublas_setup(ctx->ctx);
+  err = gpublas_setup(params->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;

--- a/theano/gpuarray/blockger.c
+++ b/theano/gpuarray/blockger.c
@@ -4,7 +4,7 @@ int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
                             PyGpuArrayObject *y, PyArrayObject *xIdx,
                             PyArrayObject *yIdx, PyArrayObject *alpha,
                             PyGpuArrayObject **_out,
-                             PyGpuContextObject *ctx) {
+                             PARAMS_TYPE* params) {
  PyGpuArrayObject *out = *_out;
  gpudata **o_list = NULL;
  gpudata **x_list = NULL;
@@ -14,21 +14,21 @@ int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
  size_t *offY = NULL;
  int err;

-  err = gpublas_setup(ctx->ctx);
+  err = gpublas_setup(params->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }

-#ifdef INPLACE
-  Py_XDECREF(out);
-  out = o;
-  Py_INCREF(out);
-#else
-  out = theano_try_copy(out, o);
-  if (out == NULL)
-    return -1;
-#endif
+  if (params->inplace) {
+    Py_XDECREF(out);
+    out = o;
+    Py_INCREF(out);
+  } else {
+    out = theano_try_copy(out, o);
+    if (out == NULL)
+      return -1;
+  }
  size_t maxi = PyGpuArray_DIMS(x)[1];
  size_t maxj = PyGpuArray_DIMS(y)[1];
  size_t maxb = PyGpuArray_DIMS(x)[0];

--- a/theano/gpuarray/blocksparse.py
+++ b/theano/gpuarray/blocksparse.py
@@ -4,8 +4,9 @@ import os

 import numpy as np
 from theano import Apply, tensor
-from theano.gof import COp
+from theano.gof import COp, ParamsType
 from theano.tensor import discrete_dtypes, as_tensor_variable
+from theano.scalar import bool as bool_t

 from theano.gradient import grad_undefined

@@ -25,7 +26,8 @@ class GpuSparseBlockGemv(COp):
    function for a stable interface.
    """
    __props__ = ('inplace',)
-    params_type = gpu_context_type
+    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)
+    # NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False.

    def __init__(self, inplace=False):
        COp.__init__(self, "blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
@@ -34,13 +36,7 @@ class GpuSparseBlockGemv(COp):
            self.destroy_map = {0: [0]}

    def get_params(self, node):
-        return node.inputs[0].type.context
-
-    def get_op_params(self):
-        if self.inplace:
-            return [('INPLACE', '1')]
-        else:
-            return []
+        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_header_dirs(self):
        return [os.path.dirname(__file__)]
@@ -102,7 +98,7 @@ class GpuSparseBlockOuter(COp):
    of GpuSparseBlockGemv. The gradient is not implemented.
    """
    __props__ = ('inplace',)
-    params_type = gpu_context_type
+    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        COp.__init__(self, ["blockger.c"], "APPLY_SPECIFIC(blockger)")
@@ -111,13 +107,7 @@ class GpuSparseBlockOuter(COp):
            self.destroy_map = {0: [0]}

    def get_params(self, node):
-        return node.inputs[0].type.context
-
-    def get_op_params(self):
-        if self.inplace:
-            return [('INPLACE', '1')]
-        else:
-            return []
+        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        ctx = infer_context_name(o, x, y)

--- a/theano/gpuarray/gemm16.c
+++ b/theano/gpuarray/gemm16.c
-#section init_code_struct
-
-/* Why do we need this? */
-size_t dim = 2048 * 32;
-rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, PARAMS,
-                       Py_None);
-if (rand_buf == NULL) {
-  FAIL;
-}
-
-#section support_code_struct
-
-PyGpuArrayObject *rand_buf;
-
-int gemm16(PyGpuArrayObject *C, float alpha,
-           PyGpuArrayObject *A, PyGpuArrayObject *B,
-           float beta, PyGpuArrayObject **out,
-           PyGpuContextObject *c) {
-  PyGpuArrayObject *_A = NULL;
-  PyGpuArrayObject *_B = NULL;
-  GpuKernel *gk;
-  char *prand, *pA, *pB, *pout;
-  void *params[13];
-  size_t grid[2];
-  size_t threads[2];
-  int res = 0;
-  int flags = 0;
-  int lda, ldb, ldc, n, m, k;
-  int n128, n64;
-  int size = 0;
-  int vec = 0;
-  static unsigned int nprocs = 0;
-  char opA, opB;
-
-  if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY) &&
-      GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
-    /*
-     * The nervana kernels do not cover the case where both inputs are
-     * trans so we need to copy one of them.  We choose the smallest
-     * one.
-     */
-    if (PyGpuArray_DIM(A, 0) * PyGpuArray_DIM(A, 1) <
-        PyGpuArray_DIM(B, 0) * PyGpuArray_DIM(B, 1)) {
-      _A = pygpu_copy(A, GA_C_ORDER);
-      if (_A == NULL) {
-        res = 1;
-        goto cleanup;
-      }
-      /*
-       * This is not an extra reference on _A so don't add an INCREF.
-       * Also, we don't lose the ref on A since our caller will deal
-       * with it.
-       */
-      A = _A;
-    } else {
-      _B = pygpu_copy(B, GA_C_ORDER);
-      if (_B == NULL) {
-        res = 1;
-        goto cleanup;
-      }
-      /*
-       * This is not an extra reference on _B so don't add an INCREF
-       * Also, we don't lose the ref on B since our caller will deal
-       * with it.
-       */
-      B = _B;
-    }
-  }
-  if (GEMM16_INPLACE && GpuArray_CHKFLAGS(&C->ga, GA_CARRAY)) {
-    Py_XDECREF(*out);
-    *out = C;
-    Py_INCREF(*out);
-  } else {
-    *out = theano_try_copy(*out, C);
-    if (*out == NULL) {
-      res = 1;
-      goto cleanup;
-    }
-  }
-
-  if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY)) {
-    opA = 't';
-    lda = PyGpuArray_STRIDE(A, 1);
-  } else {
-    opA = 'n';
-    lda = PyGpuArray_STRIDE(A, 0);
-  }
-
-  if (GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
-    opB = 't';
-    ldb = PyGpuArray_STRIDE(B, 1);
-  } else {
-    opB = 'n';
-    ldb = PyGpuArray_STRIDE(B, 0);
-  }
-
-  ldc = PyGpuArray_STRIDE(*out, 0);
-
-  /* lda and friend are in number of elements, not bytes */
-  lda /= 2;
-  ldb /= 2;
-  ldc /= 2;
-
-  m = PyGpuArray_DIM(*out, 0);
-  n = PyGpuArray_DIM(*out, 1);
-  k = PyGpuArray_DIM(B, 0);
-
-  /* Tuning code adapted from the python version */
-  grid[0] = (m + 127) / 128;
-
-  if (opA == 'n' && opB == 't')
-    size = 128;
-  else {
-    if (n < 384-16) {
-      n128 = n % 128;
-      if (n128 < 112) {
-        if (48 < n128 && n128 <= 64) {
-          n64 = n / 64;
-          if (nprocs == 0)
-            if (gpucontext_property(A->context->ctx,
-                                    GA_CTX_PROP_NUMPROCS, &nprocs)) {
-              nprocs = 0;
-              res = 1;
-              goto cleanup;
-            }
-          n64 *= (grid[0] / nprocs);
-          if (n64 > 1 || (opA == 't' && opB == 'n'))
-            size = 64;
-          else
-            size = 32;
-        } else {
-          size = 32;
-        }
-      } else {
-        size = 128;
-      }
-    } else {
-      size = 128;
-    }
-  }
-
-  grid[1] = (n + (size-1)) / size;
-  if (size == 128)
-    threads[0] = 256;
-  else
-    threads[0] = 128;
-  threads[1] = 1;
-
-  if ((opA == 't' && opB == 'n' && m % 8 == 0 && n % 8 == 0) ||
-      (opA == 'n' && opB == 'n' && k % 16 == 0 && n % 8 == 0) ||
-      (opA == 'n' && opB == 't' && k % 16 == 0))
-    vec = 1;
-
-  switch (size) {
-  case 128:
-    if (opA == 'n' && opB == 'n') {
-      if (vec)
-        gk = &k_nn_vec_128x128;
-      else
-        gk = &k_nn_128x128;
-    } else if (opA == 'n' && opB == 't') {
-      if (vec)
-        gk = &k_nt_vec_128x128;
-      else
-        gk = &k_nt_128x128;
-    } else if (opA == 't' && opB == 'n') {
-      if (vec)
-        gk = &k_tn_vec_128x128;
-      else
-        gk = &k_tn_128x128;
-    }
-    break;
-  case 64:
-    if (opA == 'n' && opB == 'n') {
-      if (vec)
-        gk = &k_nn_vec_128x64;
-      else
-        gk = &k_nn_128x64;
-    } else if (opA == 't' && opB == 'n') {
-      if (vec)
-        gk = &k_tn_vec_128x64;
-      else
-        gk = &k_tn_128x64;
-    }
-    break;
-  case 32:
-    if (opA == 'n' && opB == 'n') {
-      if (vec)
-        gk = &k_nn_vec_128x32;
-      else
-        gk = &k_nn_128x32;
-    } else if (opA == 't' && opB == 'n') {
-      if (vec)
-        gk = &k_tn_vec_128x32;
-      else
-        gk = &k_tn_128x32;
-    }
-    break;
-  default:
-    PyErr_SetString(PyExc_RuntimeError, "error selecting kernel");
-    res = 1;
-    goto cleanup;
-  }
-
-  prand = *((char **)rand_buf->ga.data);
-  prand += rand_buf->ga.offset;
-  pA = *((char **)A->ga.data);
-  pA += A->ga.offset;
-  pB = *((char **)B->ga.data);
-  pB += B->ga.offset;
-  pout = *((char **)(*out)->ga.data);
-  pout += (*out)->ga.offset;
-  params[0] = &prand;
-  params[1] = &pA;
-  params[2] = &pB;
-  params[3] = &pout;
-  params[4] = &lda;
-  params[5] = &ldb;
-  params[6] = &ldc;
-  params[7] = &m;
-  params[8] = &n;
-  params[9] = &k;
-  params[10] = &alpha;
-  params[11] = &beta;
-  params[12] = &flags;
-
-  if (GpuKernel_call(gk, 2, grid, threads, 0, params) != GA_NO_ERROR) {
-    PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call");
-    res = 1;
-  }
-
-cleanup:
-  Py_XDECREF(_A);
-  Py_XDECREF(_B);
-  return res;
-}
--- a/theano/gpuarray/linalg.py
+++ b/theano/gpuarray/linalg.py
@@ -9,7 +9,8 @@ from numpy.linalg.linalg import LinAlgError

 import theano
 from theano import Op, config, tensor
-from theano.gof import COp
+from theano.scalar import bool as bool_t
+from theano.gof import COp, ParamsType
 from theano.gpuarray import GpuArrayType

 from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name
@@ -350,9 +351,19 @@ def gpu_cholesky(A, lower=True):

 class GpuMagmaSVD(COp):
    """Computes the svd of a matrix :math:`A` using magma library.
+
+    .. warning::
+
+        Because of implementation constraints, this Op returns outputs
+        in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
+        to get them in expected order ``U, S, VT``.
+
    """
    __props__ = ('full_matrices', 'compute_uv')
-    params_type = gpu_context_type
+    _cop_num_inputs = 1
+    _cop_num_outputs = 3
+    check_input = False
+    params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
@@ -385,25 +396,28 @@ class GpuMagmaSVD(COp):
        assert A.dtype == 'float32'
        if self.compute_uv:
            return theano.Apply(self, [A],
-                                [A.type(),
-                                GpuArrayType(A.dtype, broadcastable=[False],
-                                             context_name=ctx_name)(),
-                                A.type()])
+                                # return S, U, VT
+                                [GpuArrayType(A.dtype, broadcastable=[False],
+                                              context_name=ctx_name)(),
+                                 A.type(),
+                                 A.type()])
        else:
            return theano.Apply(self, [A],
+                                # return only S
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)()])

-    def get_params(self, node):
-        return node.inputs[0].type.context
-
-    def get_op_params(self):
-        params = []
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        # Check node to prevent eventual errors with old pickled nodes.
        if self.compute_uv:
-            params.append(('COMPUTE_UV', '1'))
-        if self.full_matrices:
-            params.append(('FULL_MATRICES', '1'))
-        return params
+            A, B, C = node.outputs
+            # We expect order: S (vector), U (matrix), VT (matrix)
+            assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, \
+                "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) " \
+                "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."
+
+    def get_params(self, node):
+        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        x_shape, = shapes
@@ -413,7 +427,7 @@ class GpuMagmaSVD(COp):
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
-            return [u_shape, s_shape, vt_shape]
+            return [s_shape, u_shape, vt_shape]
        else:
            return [s_shape]

@@ -438,14 +452,19 @@ def gpu_svd(a, full_matrices=1, compute_uv=1):
    U, V,  D : matrices

    """
-    return GpuMagmaSVD(full_matrices, compute_uv)(a)
+    out = GpuMagmaSVD(full_matrices, compute_uv)(a)
+    if compute_uv:
+        S, U, VT = out
+        out = [U, S, VT]
+    return out


 class GpuMagmaMatrixInverse(COp):
    """Computes the inverse of a matrix :math:`A` using magma library.
    """
    __props__ = ('inplace', )
-    params_type = gpu_context_type
+    check_input = False
+    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        COp.__init__(self, ['magma_inv.c'], 'APPLY_SPECIFIC(magma_inv)')
@@ -483,13 +502,7 @@ class GpuMagmaMatrixInverse(COp):
        return theano.Apply(self, [x], [x.type()])

    def get_params(self, node):
-        return node.inputs[0].type.context
-
-    def get_op_params(self):
-        if self.inplace:
-            return [('INPLACE', '1')]
-        else:
-            return []
+        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        return shapes

--- a/theano/gpuarray/magma_inv.c
+++ b/theano/gpuarray/magma_inv.c
@@ -5,7 +5,7 @@ setup_ext_cuda();
 #section support_code_struct

 int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
-                              PyGpuContextObject *c) {
+                              PARAMS_TYPE* params) {
  const size_t *dims;
  magma_int_t N, ldwork, info;
  magma_int_t *piv = NULL;
@@ -19,7 +19,7 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
  }

  // This is early to match the exit() in the fail label.
-  cuda_enter(c->ctx);
+  cuda_enter(params->context->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
@@ -38,25 +38,25 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
                    "GpuMagmaMatrixInverse: matrix is not square");
    goto fail;
  }
-#ifdef INPLACE
-  Py_XDECREF(*A_inv);
-  *A_inv = A;
-  Py_INCREF(*A_inv);
-#else
-  *A_inv = theano_try_copy(*A_inv, A);
-  if (*A_inv == NULL) {
-    PyErr_SetString(
-        PyExc_RuntimeError,
-        "GpuMagmaMatrixInverse: failed to allocate memory for the output");
-    goto fail;
+  if (params->inplace) {
+    Py_XDECREF(*A_inv);
+    *A_inv = A;
+    Py_INCREF(*A_inv);
+  } else {
+    *A_inv = theano_try_copy(*A_inv, A);
+    if (*A_inv == NULL) {
+      PyErr_SetString(
+          PyExc_RuntimeError,
+          "GpuMagmaMatrixInverse: failed to allocate memory for the output");
+      goto fail;
+    }
  }
-#endif
  // magma matrix inverse

  N = dims[0];

  ldwork = N * magma_get_sgetri_nb(N);
-  dwork = gpudata_alloc(c->ctx, ldwork * sizeof(float), NULL, 0, NULL);
+  dwork = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (dwork == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaMatrixInverse: failed to allocate working memory");
@@ -94,6 +94,6 @@ fail:
  if (dwork != NULL)
    gpudata_release(dwork);
  magma_finalize();
-  cuda_exit(c->ctx);
+  cuda_exit(params->context->ctx);
  return res;
 }
--- a/theano/gpuarray/magma_svd.c
+++ b/theano/gpuarray/magma_svd.c
@@ -5,14 +5,11 @@ setup_ext_cuda();
 #section support_code_struct

 int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
-#ifdef COMPUTE_UV
-                              PyGpuArrayObject **U,
-#endif
                              PyGpuArrayObject **S,
-#ifdef COMPUTE_UV
-                              PyGpuArrayObject **VT,
-#endif
-                              PyGpuContextObject *c) {
+                              PyGpuArrayObject **U, // may be NULL
+                              PyGpuArrayObject **VT, // may be NULL
+                              PARAMS_TYPE* params) {
+  bool compute_uv = (U != NULL);
  magma_int_t *iwork = NULL, iunused[1];
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobz;
@@ -29,7 +26,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
  }

  // This is early to match the exit() in the fail label.
-  cuda_enter(c->ctx);
+  cuda_enter(params->context->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
@@ -63,32 +60,32 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
    goto fail;
  }

-#ifdef COMPUTE_UV
-#ifdef FULL_MATRICES
-  jobz = MagmaAllVec;
-#else
-  jobz = MagmaSomeVec;
-#endif
-  M_U  = (jobz == MagmaAllVec ? M : K);
-  N_VT = (jobz == MagmaAllVec ? N : K);
-  ldu = M;
-  ldv = N_VT;
-
-  if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
-    PyErr_SetString(PyExc_RuntimeError,
-                    "GpuMagmaSVD: failed to allocate memory");
-    goto fail;
+  if (compute_uv) {
+    if (params->full_matrices) {
+      jobz = MagmaAllVec;
+    } else {
+      jobz = MagmaSomeVec;
+    }
+    M_U  = (jobz == MagmaAllVec ? M : K);
+    N_VT = (jobz == MagmaAllVec ? N : K);
+    ldu = M;
+    ldv = N_VT;
+
+    if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      "GpuMagmaSVD: failed to allocate memory");
+      goto fail;
+    }
+    if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      "GpuMagmaSVD: failed to allocate memory");
+      goto fail;
+    }
+  } else {
+    jobz = MagmaNoVec;
+    ldu = M;
+    ldv = N;
  }
-  if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
-    PyErr_SetString(PyExc_RuntimeError,
-                    "GpuMagmaSVD: failed to allocate memory");
-    goto fail;
-  }
-#else
-  jobz = MagmaNoVec;
-  ldu = M;
-  ldv = N;
-#endif

  // query for workspace size
  magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
@@ -124,7 +121,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
  }

  s_dims[0] = K;
-  if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
+  if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
@@ -132,29 +129,29 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyDeviceToDevice);

-#ifdef COMPUTE_UV
-  u_dims[0] = N; u_dims[1] = N_VT;
-  if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
-    PyErr_SetString(PyExc_RuntimeError,
-                    "GpuMagmaSVD: failed to allocate memory");
-    goto fail;
-  }
-  // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
-  // to match numpy.linalg.svd output
-  cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
-             cudaMemcpyDeviceToDevice);
-
-  vt_dims[0] = M_U; vt_dims[1] = M;
-  if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
-    PyErr_SetString(PyExc_RuntimeError,
-                    "GpuMagmaSVD: failed to allocate memory");
-    goto fail;
+  if (compute_uv) {
+    u_dims[0] = N; u_dims[1] = N_VT;
+    if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
+      PyErr_SetString(PyExc_RuntimeError,
+                      "GpuMagmaSVD: failed to allocate memory");
+      goto fail;
+    }
+    // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
+    // to match numpy.linalg.svd output
+    cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
+               cudaMemcpyDeviceToDevice);
+
+    vt_dims[0] = M_U; vt_dims[1] = M;
+    if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
+      PyErr_SetString(PyExc_RuntimeError,
+                      "GpuMagmaSVD: failed to allocate memory");
+      goto fail;
+    }
+    // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
+    // to match numpy.linalg.svd output
+    cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
+               cudaMemcpyDeviceToDevice);
  }
-  // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
-  // to match numpy.linalg.svd output
-  cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
-             cudaMemcpyDeviceToDevice);
-#endif
  res = 0;
 fail:
  if (a_data != NULL)
@@ -170,6 +167,6 @@ fail:
  if (iwork != NULL)
    magma_free_cpu(iwork);
  magma_finalize();
-  cuda_exit(c->ctx);
+  cuda_exit(params->context->ctx);
  return res;
 }
--- a/theano/gpuarray/nerv.py
+++ b/theano/gpuarray/nerv.py
-from __future__ import absolute_import, print_function, division
-import os.path
-import theano
-from theano import Apply, Variable, tensor
-
-from theano.compile import optdb
-from theano.compile.ops import shape_i
-from theano.gof import local_optimizer, COp
-from theano.scalar import as_scalar, constant
-
-from . import opt
-from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
-                        infer_context_name)
-from .type import gpu_context_type
-from .opt_util import alpha_merge, output_merge
-
-try:
-    from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
-    nerv = NervanaGPU()
-except ImportError:
-    GPUTensor = None
-    nerv = None
-
-
-def to_gputensor(a):
-    assert a.flags.c_contiguous or a.flags.f_contiguous
-    return GPUTensor(a.shape, dtype=a.dtype, base=a,
-                     gpudata=a.gpudata + a.offset,
-                     strides=a.strides, is_trans=a.flags.f_contiguous)
-
-
-def ensure_float(val, name):
-    if not isinstance(val, Variable):
-        val = constant(val)
-    if hasattr(val, 'ndim') and val.ndim == 0:
-        val = as_scalar(val)
-    if not isinstance(val.type, theano.scalar.Scalar):
-        raise TypeError("%s: expected a scalar value" % (name,))
-    if not val.type.dtype == 'float32':
-        raise TypeError("%s: type is not float32" % (name,))
-    return val
-
-
-class Gemm16(COp):
-    """
-    Gemm for float16 using the nervena kernels.
-    """
-    __props__ = ('relu', 'inplace')
-    _f16_ok = True
-    params_type = gpu_context_type
-    KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
-                  'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
-                  'tn_128x128', 'tn_128x64', 'tn_128x32',
-                  'tn_vec_128x128', 'tn_vec_128x64', 'tn_vec_128x32',
-                  'tn_vec_128x16', 'nt_128x128', 'nt_vec_128x128')
-
-    def __init__(self, relu=False, inplace=False):
-        COp.__init__(self, ["gemm16.c"], "gemm16")
-        self.relu = relu
-        # relu = True will require more work in optimizations.
-        assert self.relu is False
-        self.inplace = inplace
-        if self.inplace:
-            self.destroy_map = {0: [0]}
-
-    def make_node(self, C, alpha, A, B, beta):
-        if GPUTensor is None:
-            raise RuntimeError("Can't use Gemm16: nervanagpu not found")
-        ctx_name = infer_context_name(C, A, B)
-
-        A = as_gpuarray_variable(A, ctx_name)
-        B = as_gpuarray_variable(B, ctx_name)
-        C = as_gpuarray_variable(C, ctx_name)
-
-        alpha = ensure_float(alpha, 'alpha')
-        beta = ensure_float(beta, 'beta')
-
-        assert C.dtype == A.dtype == B.dtype == 'float16'
-
-        return Apply(self, [C, alpha, A, B, beta], [C.type()])
-
-    def get_params(self, node):
-        return node.inputs[0].type.context
-
-    def c_headers(self):
-        return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
-                'string.h']
-
-    def c_header_dirs(self):
-        return [os.path.dirname(__file__)]
-
-    def get_op_params(self):
-        return [('GEMM16_INPLACE', '1' if self.inplace else '0')]
-
-    @staticmethod
-    def cubin_to_code(name):
-        fname = 'hgemm_{0}.cubin'.format(name)
-        with open(os.path.join(nerv.cubin_path, fname)) as f:
-            cubin = f.read()
-        bcode = ','.join(hex(ord(c)) for c in cubin)
-        return "static const char bin_%s[] = { %s };" % (name, bcode)
-
-    @staticmethod
-    def init_gpukernel(name, fail):
-        return """
-bcode = bin_%(name)s;
-sz = sizeof(bin_%(name)s);
-if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
-                   "hgemm_%(name)s", 13, types, GA_USE_BINARY, NULL)
-    != GA_NO_ERROR) {
-  PyErr_SetString(PyExc_RuntimeError, "Could not initialize kernel %(name)s");
-  %(fail)s;
-}
-""" % dict(name=name, fail=fail)
-
-    def c_support_code(self):
-        codel = []
-        for name in self.KERN_NAMES:
-            codel.append(Gemm16.cubin_to_code(name))
-        return '\n'.join(codel)
-
-    def c_support_code_struct(self, node, nodename):
-        codel = []
-        for name in self.KERN_NAMES:
-            codel.append("GpuKernel k_{0};".format(name))
-        codel.append(super(Gemm16, self).c_support_code_struct(node, nodename))
-        return '\n'.join(codel)
-
-    def c_init_code_struct(self, node, nodename, sub):
-        codel = [super(Gemm16, self).c_init_code_struct(node, nodename, sub)]
-        for name in self.KERN_NAMES:
-            codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
-        codel.append("const char *bcode;")
-        codel.append("size_t sz;")
-        codel.append("PyGpuContextObject *c = %s;" % (sub['params'],))
-        codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
-                     "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
-                     "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
-        for name in self.KERN_NAMES:
-            codel.append(self.init_gpukernel(name, sub['fail']))
-        return '\n'.join(codel)
-
-    def c_cleanup_code_struct(self, node, nodename):
-        codel = []
-        for name in self.KERN_NAMES:
-            codel.append("GpuKernel_clear(&k_{0});".format(name))
-        return '\n'.join(codel)
-
-
-@opt.register_opt('fast_compile')
-@opt.op_lifter([tensor.Dot])
-@opt.register_opt2([tensor.Dot], 'fast_compile')
-def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
-    if nerv is None:
-        return
-    A = inputs[0]
-    B = inputs[1]
-    if (A.ndim == 2 and B.ndim == 2 and
-            A.dtype == 'float16' and B.dtype == 'float16'):
-        fgraph = getattr(outputs[0], 'fgraph', None)
-        C = GpuAllocEmpty('float16', ctx_name)(
-            shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
-        return Gemm16()(C, 1.0, A, B, 0.0)
-
-
-@opt.register_opt()
-@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
-def local_gemm16_alpha_merge(node, *inputs):
-    return [Gemm16(relu=node.op.relu)(*inputs)]
-
-
-@opt.register_opt()
-@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
-def local_gemm16_output_merge(node, *inputs):
-    return [Gemm16(relu=node.op.relu)(*inputs)]
-
-
-@local_optimizer([Gemm16], inplace=True)
-def local_gemm16_inplace(node):
-    if type(node.op) != Gemm16 or node.op.inplace:
-        return
-    inputs = list(node.inputs)
-    C = inputs[0]
-    if (C.owner and
-            isinstance(C.owner.op, GpuAllocEmpty) and
-            len(C.clients) > 1):
-        inputs[0] = C.owner.op(*C.owner.inputs)
-    return [Gemm16(relu=node.op.relu, inplace=True)(*inputs)]
-
-optdb.register('local_gemm16_inplace',
-               tensor.opt.in2out(local_gemm16_inplace,
-                                 name='local_gemm16_inplace'),
-               70.0, 'fast_run', 'inplace', 'gpuarray')
+# To prevent flake8 error.
+from __future__ import print_function, absolute_import, division
+
+raise ImportError(
+    "You are importing theano.gpuarray.nerv. "
+    "This module was removed as it was based on nervanagpu that is now deprecated. "
+    "To still get this module, use Theano 0.9. "
+    "More info about nervanagpu here: https://github.com/NervanaSystems/nervanagpu "
+    "(viewed on 2017/07/05).")
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -73,7 +73,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
 from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
 from .reduction import GpuMaxAndArgmax
 from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
-                     cusolver_available, GpuMagmaMatrixInverse, GpuMagmaSVD)
+                     cusolver_available, GpuMagmaMatrixInverse, gpu_svd)

 _logger = logging.getLogger("theano.gpuarray.opt")

@@ -2149,11 +2149,16 @@ def local_gpu_svd(op, context_name, inputs, outputs):
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
-    op = GpuMagmaSVD(full_matrices=op.full_matrices,
-                     compute_uv=op.compute_uv)
+    x = inputs[0]
    if inputs[0].dtype == 'float16':
-        return op(inputs[0].astype('float32')).astype('float16')
-    return op
+        x = inputs[0].astype('float32')
+    out = gpu_svd(x, compute_uv=op.compute_uv, full_matrices=op.full_matrices)
+    if inputs[0].dtype == 'float16':
+        if op.compute_uv:
+            out = [o.astype('float16') for o in out]
+        else:
+            out = [out.astype('float16')]
+    return out

 # Do not register in fast_run or fast_compile.
 # It will be added to fast_run if the GPU is enabled.

--- a/theano/gpuarray/pool.c
+++ b/theano/gpuarray/pool.c
@@ -217,8 +217,8 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,

 // output shape for a given input padded shape, window shape and stride
 // We use ssize_t in the max since this is done to avoid negative results.
-#define OUTPUT_DIMS(in_dim, ws, st)                       \
-  (IGNORE_BORDER ? (in_dim - ws)/st + 1 :                 \
+#define OUTPUT_DIMS(in_dim, ws, st, ignore_border)        \
+  (ignore_border ? (in_dim - ws)/st + 1 :                 \
   (st > ws ? (in_dim - 1)/st + 1 :                       \
    std::max<ssize_t>(0, (in_dim - 1 - ws + st)/st) + 1))

@@ -229,7 +229,10 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
                         PyArrayObject *stride,
                         PyArrayObject *pad,
                         PyGpuArrayObject **z,
-                         PyGpuContextObject *ctx) {
+                         PARAMS_TYPE* params) {
+  bool max_pool = (params->mode == POOLING_MAX);
+  bool inc_pad = (params->mode != POOLING_AVERAGE_COUNT_EXCLUDE_PADDING);
+  bool sum_mode = (params->mode  == POOLING_SUM);
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga))
    {
      PyErr_Format(PyExc_ValueError,
@@ -253,19 +256,19 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
    w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
-    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
+    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i], params->ignore_border);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
-  if (!IGNORE_BORDER && nonzero_padding) {
+  if (!params->ignore_border && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuPool: padding works only with ignore_border=True");
    return 1;
  }

  if (theano_prep_output(z, PyGpuArray_NDIM(x), z_dims,
-                         x->ga.typecode, GA_C_ORDER, ctx) != 0)
+                         x->ga.typecode, GA_C_ORDER, params->context) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuPool: failed to allocate memory");
@@ -277,7 +280,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,

    if (ndims == 2) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
-      if (MAX_POOL) {
+      if (max_pool) {
        err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                      x_dims[2], x_dims[3],
@@ -295,7 +298,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
                                      x_dims[2], x_dims[3],
                                      x->ga.data, x->ga.offset,
                                      w[0], w[1], s[0], s[1], p[0], p[1],
-                                      INC_PAD, SUM_MODE,
+                                      inc_pad, sum_mode,
                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
@@ -307,7 +310,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
    }
    else if (ndims == 3) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
-      if (MAX_POOL) {
+      if (max_pool) {
        err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                      x_dims[2], x_dims[3], x_dims[4],
@@ -326,7 +329,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
                                      x->ga.data, x->ga.offset,
                                      w[0], w[1], w[2], s[0], s[1], s[2],
                                      p[0], p[1], p[2],
-                                      INC_PAD, SUM_MODE,
+                                      inc_pad, sum_mode,
                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,

--- a/theano/gpuarray/pool.py
+++ b/theano/gpuarray/pool.py
@@ -3,9 +3,12 @@ import os.path

 import theano
 from theano import Apply
+from theano.gof import ParamsType
+from theano.scalar import bool as bool_t
 from theano.tensor.basic import as_tensor_variable
-from theano.tensor.signal.pool import Pool
+from theano.tensor.signal.pool import Pool, PoolingMode_t

+from .type import gpu_context_type
 from .basic_ops import (CGpuKernelBase, infer_context_name,
                        as_gpuarray_variable, gpu_contiguous)

@@ -22,6 +25,9 @@ class GpuPool(CGpuKernelBase):

    """
    __props__ = ('ignore_border', 'mode', 'ndim')
+    params_type = ParamsType(ignore_border=bool_t,
+                             mode=PoolingMode_t,
+                             context=gpu_context_type)

    def __init__(self, ignore_border, mode='max', ndim=2):
        self.ndim = ndim
@@ -31,9 +37,12 @@ class GpuPool(CGpuKernelBase):
        self.mode = mode
        CGpuKernelBase.__init__(self, ['pool.c'],
                                'APPLY_SPECIFIC(pool)')
-        assert mode in ('max', 'sum', 'average_inc_pad', 'average_exc_pad')
+        assert PoolingMode_t.has_alias(self.mode)
        assert self.ndim in [2, 3]

+    def get_params(self, node):
+        return self.params_type.get_params(self, context=node.inputs[0].type.context)
+
    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

@@ -74,16 +83,6 @@ class GpuPool(CGpuKernelBase):

        return Apply(self, [inp, ws, stride, pad], [inp.type()])

-    def get_op_params(self):
-        ignore_border = int(self.ignore_border)
-        max_pool = int(self.mode == 'max')
-        inc_pad = int(self.mode != 'average_exc_pad')
-        sum_mode = int(self.mode == 'sum')
-        return [('IGNORE_BORDER', ignore_border),
-                ('INC_PAD', inc_pad),
-                ('MAX_POOL', max_pool),
-                ('SUM_MODE', sum_mode)]
-
    def infer_shape(self, node, in_shapes):
        ws, stride, pad = [node.inputs[1], node.inputs[2], node.inputs[3]]
        shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride,
@@ -214,6 +213,7 @@ class GpuAveragePoolGrad(CGpuKernelBase):

    """
    __props__ = ('ignore_border', 'mode', 'ndim')
+    params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode='max', ndim=2):
        self.ndim = ndim
@@ -226,6 +226,9 @@ class GpuAveragePoolGrad(CGpuKernelBase):
        assert mode in ('sum', 'average_inc_pad', 'average_exc_pad')
        assert ndim in [2, 3]

+    def get_params(self, node):
+        return self.params_type.get_params(self, context=node.inputs[0].type.context)
+
    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

@@ -267,12 +270,6 @@ class GpuAveragePoolGrad(CGpuKernelBase):

        return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()])

-    def get_op_params(self):
-        inc_pad = int(self.mode == 'average_inc_pad')
-        sum_mode = int(self.mode == 'sum')
-        return [('INC_PAD', inc_pad),
-                ('SUM_MODE', sum_mode)]
-
    def infer_shape(self, node, in_shapes):
        return [in_shapes[0]]

@@ -369,6 +366,7 @@ class GpuMaxPoolRop(CGpuKernelBase):

    """
    __props__ = ('ignore_border', 'mode', 'ndim')
+    params_type = ParamsType(ignore_border=bool_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode='max', ndim=2):
        self.ndim = ndim
@@ -379,6 +377,9 @@ class GpuMaxPoolRop(CGpuKernelBase):
        assert mode == 'max'
        assert ndim in [2, 3]

+    def get_params(self, node):
+        return self.params_type.get_params(self, context=node.inputs[0].type.context)
+
    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

@@ -422,10 +423,6 @@ class GpuMaxPoolRop(CGpuKernelBase):

        return Apply(self, [inp, eval_point, ws, stride, pad], [eval_point.type()])

-    def get_op_params(self):
-        ignore_border = int(self.ignore_border)
-        return [('IGNORE_BORDER', ignore_border)]
-
    def infer_shape(self, node, in_shapes):
        ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
        shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride,

--- a/theano/gpuarray/pool_ave_grad.c
+++ b/theano/gpuarray/pool_ave_grad.c
@@ -115,7 +115,9 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **gx,
-                                  PyGpuContextObject *ctx) {
+                                  PARAMS_TYPE* params) {
+  bool inc_pad = (params->mode == POOLING_AVERAGE_COUNT_INCLUDE_PADDING);
+  bool sum_mode = (params->mode == POOLING_SUM);
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
    {
@@ -131,7 +133,7 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
      return 1;
    }
  if (theano_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
-                         x->ga.typecode, GA_C_ORDER, ctx) != 0)
+                         x->ga.typecode, GA_C_ORDER, params->context) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolGrad: failed to allocate memory");
@@ -161,7 +163,7 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
                                         x->ga.data, x->ga.offset,
                                         gz->ga.data, gz->ga.offset,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
-                                         INC_PAD, SUM_MODE,
+                                         inc_pad, sum_mode,
                                         (*gx)->ga.data, (*gx)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
@@ -177,7 +179,7 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
                                         x->ga.data, x->ga.offset,
                                         gz->ga.data, gz->ga.offset,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
-                                         p[0], p[1], p[2], INC_PAD, SUM_MODE,
+                                         p[0], p[1], p[2], inc_pad, sum_mode,
                                         (*gx)->ga.data, (*gx)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,

--- a/theano/gpuarray/pool_max_rop.c
+++ b/theano/gpuarray/pool_max_rop.c
@@ -109,8 +109,8 @@ KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
 #section support_code

 // output shape for a given input padded shape, window shape and stride
-#define OUTPUT_DIMS(in_dim, ws, st)                       \
-  (IGNORE_BORDER ? (in_dim - ws)/st + 1 :                 \
+#define OUTPUT_DIMS(in_dim, ws, st, ignore_border)        \
+  (ignore_border ? (in_dim - ws)/st + 1 :                 \
   (st > ws ? (in_dim - 1)/st + 1 :                       \
    std::max<ssize_t>(0, (in_dim - 1 - ws + st)/st) + 1))

@@ -122,7 +122,7 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
                                 PyArrayObject *stride,
                                 PyArrayObject *pad,
                                 PyGpuArrayObject **z,
-                                 PyGpuContextObject *ctx) {
+                                 PARAMS_TYPE* params) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga))
    {
      PyErr_Format(PyExc_ValueError,
@@ -146,19 +146,19 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
    w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
-    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
+    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i], params->ignore_border);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
-  if (!IGNORE_BORDER && nonzero_padding) {
+  if (!params->ignore_border && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMaxPoolRop: padding works only with ignore_border=True");
    return 1;
  }

  if (theano_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
-                         ex->ga.typecode, GA_C_ORDER, ctx) != 0)
+                         ex->ga.typecode, GA_C_ORDER, params->context) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolRop: failed to allocate memory");

--- a/theano/gpuarray/tests/test_cgpukernelbase.py
+++ b/theano/gpuarray/tests/test_cgpukernelbase.py
@@ -4,10 +4,12 @@ from six.moves import xrange

 import theano
 from theano import tensor, config, Apply, Op
+from theano.scalar import int32 as int_t
+from theano.gof import ParamsType
 from theano.gradient import grad_undefined

 from ..basic_ops import CGpuKernelBase
-from ..type import GpuArrayType, get_context
+from ..type import GpuArrayType, get_context, gpu_context_type


 # This is an implementation to test that CGpuKernelBase works and also
@@ -18,6 +20,7 @@ class GpuEye(CGpuKernelBase, Op):

    """
    __props__ = ('dtype', 'context_name')
+    params_type = ParamsType(typecode=int_t, context=gpu_context_type)

    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
@@ -28,7 +31,9 @@ class GpuEye(CGpuKernelBase, Op):
                                'APPLY_SPECIFIC(tstgpueye)')

    def get_params(self, node):
-        return get_context(self.context_name)
+        from pygpu.gpuarray import dtype_to_typecode
+        return self.params_type.get_params(typecode=dtype_to_typecode(self.dtype),
+                                           context=get_context(self.context_name))

    def c_headers(self):
        return ['<gpuarray/types.h>', '<gpuarray/kernel.h>']
@@ -52,11 +57,6 @@ class GpuEye(CGpuKernelBase, Op):
        return [grad_undefined(self, i, inp[i])
                for i in xrange(2)]

-    def get_op_params(self):
-        from pygpu.gpuarray import dtype_to_typecode
-
-        return [('TYPECODE', str(dtype_to_typecode(self.dtype)))]
-

 def test_cgpukernelbase():
    # Import inside the function to prevent the back-end from being
@@ -69,4 +69,5 @@ def test_cgpukernelbase():

    r = f()

+    assert r.dtype == 'int32'
    assert (np.asarray(r) == np.eye(4, 5, dtype='int32')).all()
--- a/theano/gpuarray/tests/test_nerv.py
+++ b/theano/gpuarray/tests/test_nerv.py
-from __future__ import absolute_import, print_function, division
-from nose.plugins.skip import SkipTest
-
-import numpy as np
-
-from theano import function
-from theano.tests import unittest_tools as utt
-from theano.tensor import vector, matrix, dot
-
-from .config import mode_with_gpu
-from ..nerv import Gemm16, nerv
-
-
-def test_gemm16_swap():
-    if nerv is None:
-        raise SkipTest("nervanagpu not available")
-    v = vector(dtype='float16')
-    m = matrix(dtype='float16')
-    m2 = matrix(dtype='float16')
-    m32 = matrix(dtype='float32')
-
-    # test that we don't try to replace anything but matrix x matrix in float16
-    f = function([v, m], dot(v, m), mode=mode_with_gpu)
-    assert len([node for node in f.maker.fgraph.apply_nodes
-                if isinstance(node.op, Gemm16)]) == 0
-    f = function([m32, m], dot(m32, m), mode=mode_with_gpu)
-    assert len([node for node in f.maker.fgraph.apply_nodes
-                if isinstance(node.op, Gemm16)]) == 0
-
-    f = function([m, m2], dot(m, m2), mode=mode_with_gpu)
-    assert len([node for node in f.maker.fgraph.apply_nodes
-                if isinstance(node.op, Gemm16)]) == 1
-
-
-def test_gemm16_value():
-    if nerv is None:
-        raise SkipTest("nervanagpu not available")
-    m = matrix(dtype='float16')
-    m2 = matrix(dtype='float16')
-
-    f = function([m, m2], dot(m, m2), mode=mode_with_gpu)
-
-    v1 = np.random.random((3, 4)).astype('float16')
-    v2 = np.random.random((4, 2)).astype('float16')
-
-    of = f(v1, v2)
-    on = np.dot(v1, v2)
-
-    utt.assert_allclose(of, on)
--- a/theano/gpuarray/tests/tstgpueye.c
+++ b/theano/gpuarray/tests/tstgpueye.c
@@ -18,7 +18,7 @@ KERNEL void eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size a_off, ga_size n, ga_size
 #section support_code_struct

 int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
-                              PyGpuArrayObject **z, PyGpuContextObject *ctx) {
+                              PyGpuArrayObject **z, PARAMS_TYPE* params) {
  size_t dims[2] = {0, 0};
  size_t ls, gs;
  void *args[3];
@@ -29,9 +29,9 @@ int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,

  Py_XDECREF(*z);
  *z = pygpu_zeros(2, dims,
-                   TYPECODE,
+                   params->typecode,
                   GA_C_ORDER,
-                   ctx, Py_None);
+                   params->context, Py_None);
  if (*z == NULL)
    return -1;


--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
 from nose.plugins.skip import SkipTest
-
+# NB: We raise a SkipTest (instead of another type of exception) because we're in a folder,
+# thus nosetests will look for test files into this folder. With a SkipTest raised,
+# the folder will be skipped by nosetests without failing.
 raise SkipTest(
    "You are importing theano.sandbox.cuda. This is the old GPU back-end and "
    "is removed from Theano. Use Theano 0.9 to use it. Even better, "

--- a/theano/tensor/signal/pool.py
+++ b/theano/tensor/signal/pool.py
@@ -14,7 +14,7 @@ from six.moves import xrange
 import six.moves.builtins as builtins
 import theano
 from theano import gof, OpenMPOp, tensor, Variable, Apply
-from theano.gof.params_type import ParamsType
+from theano.gof import ParamsType, EnumList
 from theano.gradient import DisconnectedType
 from theano.scalar import bool as bool_t

@@ -258,6 +258,16 @@ def pool_3d(input, ws=None, ignore_border=None, stride=None, pad=(0, 0, 0),
    return output


+# NB: This enum type is currently used in gpuarray/pool.py.
+# It may be used later as op param in this current file.
+# Enum name and constants names are inspired from cuDNN type `cudnnPoolingMode_t`
+# (cf. `theano/gpuarray/cudnn_defs.py`).
+PoolingMode_t = EnumList(('POOLING_MAX', 'max'),
+                         ('POOLING_SUM', 'sum'),
+                         ('POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
+                         ('POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'))
+
+
 class Pool(OpenMPOp):
    """
    sum or average over different patches.