提交 5df0cfd8 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6091 from notoraptor/params-for-other-ops

Params for other ops
......@@ -346,6 +346,18 @@ class Shape_i(gof.Op):
i = int(i)
self.i = i
# NB:
# 1) params_type is defined as a property to avoid
# loop in Python import caused by importing theano.scalar below
# when params_type is defined directly in class code.
# 2) We wrap scalar into ParamsType (instead of directly using scalar as op param)
# to avoid Theano converting scalar param to constant that would be later
# hardcoded as litteral in C code, making us loose all the advantages of
# using params.
@property
def params_type(self):
return gof.ParamsType(i=theano.scalar.basic.int64)
def __str__(self):
return '%s{%i}' % (self.__class__.__name__, self.i)
......@@ -360,7 +372,7 @@ class Shape_i(gof.Op):
(x, self.i))
return theano.Apply(self, [x], [theano.tensor.lscalar()])
def perform(self, node, inp, out_):
def perform(self, node, inp, out_, params):
x, = inp
out, = out_
if out[0] is None:
......@@ -383,7 +395,7 @@ class Shape_i(gof.Op):
version.append((str(t), v))
if version:
version.append(1)
version.append(2)
return tuple(version)
......@@ -391,7 +403,8 @@ class Shape_i(gof.Op):
iname, = inames
oname, = onames
fail = sub['fail']
i = self.i
# i is then 'params->i', not just 'params'.
i = sub['params'] + '->i'
itype = node.inputs[0].type.__class__
if itype in self.c_code_and_version:
......
......@@ -10,6 +10,9 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape, infer_context_name)
from .opt import register_opt, op_lifter, register_opt2
from .type import gpu_context_type
from theano.gof import ParamsType
import theano.scalar as scalar
class GpuCumOp(GpuKernelBase, Op):
......@@ -21,9 +24,12 @@ class GpuCumOp(GpuKernelBase, Op):
"""
SUPPORTED_NDIMS = 3
__props__ = ('axis', 'mode')
params_type = ParamsType(axis=scalar.int32,
context=gpu_context_type)
def __init__(self, axis, mode='add'):
self.axis = axis if axis else 0
assert axis is not None
self.axis = int(axis)
self.mode = mode
def __eq__(self, other):
......@@ -35,7 +41,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self):
return (6,)
return (7,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
......@@ -43,6 +49,9 @@ class GpuCumOp(GpuKernelBase, Op):
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumOp"
......@@ -244,24 +253,18 @@ class GpuCumOp(GpuKernelBase, Op):
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError("cuda only")
x, = inp
z, = out
axis = self.axis if self.axis is not None else 0
fail = sub['fail']
ctx = sub['params']
code = """
return """
const size_t* shape = PyGpuArray_DIMS(%(x)s);
bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);
int axis = %(axis)s;
int axis = %(params)s->axis;
if (axis < 0) {
// Convert negative axis to positive axis.
axis += PyGpuArray_NDIM(%(x)s);
}
if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s) != 0){
if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s),
%(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) {
%(fail)s;
}
......@@ -270,17 +273,17 @@ class GpuCumOp(GpuKernelBase, Op):
size_t max_grid_size1;
size_t max_grid_size2;
int err;
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
%(fail)s;
}
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
%(fail)s;
}
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
%(fail)s;
......@@ -289,9 +292,7 @@ class GpuCumOp(GpuKernelBase, Op):
%(fail)s;
}
}
""" % locals()
return code
""" % dict(x=inp[0], z=out[0], nodename=nodename, fail=sub['fail'], params=sub['params'])
def c_support_code_struct(self, node, nodename):
code = """
......
from __future__ import absolute_import, print_function, division
import numpy as np
from theano import Op, Apply, config
from theano.gof import ParamsType
from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
......@@ -14,7 +13,7 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from .opt import register_opt2, op_lifter, register_opt
from .type import GpuArrayType
from .type import GpuArrayType, gpu_context_type
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
......@@ -22,13 +21,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Images2Neibs for the GPU.
"""
def __init__(self, mode='valid'):
if mode not in ['valid', 'half', 'full',
'ignore_borders', 'wrap_centered']:
raise NotImplementedError("Only the mode valid, half, full, "
"ignore_borders and wrap_centered have "
"been implemented for GpuImages2Neibs")
self.mode = mode
params_type = ParamsType(mode=Images2Neibs.BORDER_MODE, context=gpu_context_type)
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def make_node(self, ten4, neib_shape, neib_step=None):
ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
......@@ -50,7 +46,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
context_name=ten4.type.context_name)()])
def c_code_cache_version(self):
return (12,)
return (13,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
......@@ -61,13 +57,16 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode
# `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here.
mode_constants = self.BORDER_MODE.c_support_code()
kernels = []
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = """
// a version that uses less registers but doesn't work in all cases.
// a version that uses less registers but doesn't work in all cases.
%(mode_constants)s
KERNEL void %(kname)s(
const ga_int mode,
const ga_int nb_batch,
const ga_int nb_stack,
const ga_int height,
......@@ -110,29 +109,29 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
ga_int i = LID_1; // loop over c
{
ga_int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){
if(mode == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 )
ten4_2 += height;
else if (ten4_2 >= height)
ten4_2 -= height;
} else if ("%(mode)s"=="half"){
} else if (mode == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ("%(mode)s"=="full"){
} else if (mode == MODE_FULL) {
ten4_2 -= c - 1;
}
ga_int j = LID_0; // loop over d
{
ga_int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
if(mode == MODE_WRAP_CENTERED){
ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 )
ten4_3 += width;
else if (ten4_3 >= width)
ten4_3 -= width;
} else if ("%(mode)s"=="half"){
} else if (mode == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ("%(mode)s"=="full"){
} else if (mode == MODE_FULL) {
ten4_3 -= d - 1;
}
......@@ -150,8 +149,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}
}
}
}""" % locals()
}""" % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants)
params = [
'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
......@@ -165,7 +165,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename
code = """
%(mode_constants)s
KERNEL void %(kname)s(
const ga_int mode,
const ga_int nb_batch,
const ga_int nb_stack,
const ga_int height,
......@@ -209,30 +211,30 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
for (ga_int i = LID_1; i < c; i+=LDIM_1)
{
ga_int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){
if(mode == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 )
ten4_2 += height;
else if (ten4_2 >= height)
ten4_2 -= height;
} else if ("%(mode)s"=="half"){
} else if (mode == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ("%(mode)s"=="full"){
} else if (mode == MODE_FULL) {
ten4_2 -= c - 1;
}
// loop over d
for (ga_int j = LID_0; j < d; j+=LDIM_0)
{
ga_int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
if(mode == MODE_WRAP_CENTERED) {
ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 )
ten4_3 += width;
else if (ten4_3 >= width)
ten4_3 -= width;
} else if ("%(mode)s"=="half"){
} else if (mode == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ("%(mode)s"=="full"){
} else if (mode == MODE_FULL) {
ten4_3 -= d - 1;
}
......@@ -251,8 +253,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}
}
}
""" % locals()
""" % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants)
params = [
'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
......@@ -274,18 +277,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
"""
def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype
dtype_neib_shape = node.inputs[1].dtype
dtype_neib_step = node.inputs[2].dtype
dtype_z = node.outputs[0].dtype
itemsize_ten4 = np.dtype(dtype_ten4).itemsize
itemsize_z = np.dtype(dtype_z).itemsize
typecode_z = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
ctx = sub['params']
mode = self.mode
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
......@@ -293,16 +284,23 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
GpuKernel_error(fptr, err));
%(fail)s;
}
""" % locals()
""" % dict(fail=sub['fail'])
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
""" % dict(z=out[0], err_check=err_check)
# NB: To reduce C code variability:
# For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize
# For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node)
# For typecode_z, I use ten4->ga.typecode (for same reason as above)
return """
int grid_c = -1;
int grid_d = -1;
size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga);
size_t itemsize_z = itemsize_ten4;
int typecode_z = %(ten4)s->ga.typecode;
{
if (PyGpuArray_NDIM(%(ten4)s) != 4)
......@@ -351,10 +349,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(fail)s;
}
if ( "%(mode)s" == "wrap_centered") {
if (%(params)s->mode == MODE_WRAP_CENTERED) {
if (c%%2!=1 || d%%2!=1){
PyErr_Format(PyExc_TypeError,
"GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
"GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
%(fail)s;
}
if ( PyGpuArray_DIMS(%(ten4)s)[2] < c ||
......@@ -375,7 +373,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
(size_t)step_y);
}else if ( "%(mode)s" == "valid") {
} else if (%(params)s->mode == MODE_VALID) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
{
......@@ -400,12 +398,12 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "ignore_borders") {
} else if (%(params)s->mode == MODE_IGNORE_BORDERS) {
//number of patch in height
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "half") {
} else if (%(params)s->mode == MODE_HALF) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
{
......@@ -430,7 +428,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
}else if ( "%(mode)s" == "full") {
} else if (%(params)s->mode == MODE_FULL) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
{
......@@ -455,9 +453,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
}else{
} else {
PyErr_Format(PyExc_TypeError,
"GpuImages2Neibs:: unknown mode '%(mode)s'");
"GpuImages2Neibs:: unknown mode %%d", %(params)s->mode);
%(fail)s;
}
......@@ -476,8 +474,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t dims[2];
dims[0] = z_dim0;
dims[1] = z_dim1;
%(z)s = pygpu_empty(2, dims, %(typecode_z)s,
GA_C_ORDER, %(ctx)s, Py_None);
%(z)s = pygpu_empty(2, dims, typecode_z,
GA_C_ORDER, %(params)s->context, Py_None);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
......@@ -490,6 +488,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
{ // NESTED SCOPE
const int mode = %(params)s->mode;
const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0];
const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1];
const int height = PyGpuArray_DIMS(%(ten4)s)[2];
......@@ -507,7 +506,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t threads_per_block[3] = {d, c, 1};
//get the max threads per blocks
size_t max_threads_dim;
int err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
%(fail)s;
......@@ -535,14 +534,19 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}else{
fptr = &k_multi_warp_%(name)s;
}
// printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n", max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1], n_blocks[2]);
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s;
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s;
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
void *kernel_params[] = {(void *)&nb_batch,
/*
printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n",
max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], n_blocks[2]);
*/
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4;
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4;
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z;
void *kernel_params[] = {(void *)&mode,
(void *)&nb_batch,
(void *)&nb_stack,
(void *)&height, (void *)&width,
(void *)&c, (void *)&d,
......@@ -562,11 +566,18 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(err_check)s
%(sync)s
} // END NESTED SCOPE
""" % locals()
def perform(self, node, inp, out, ctx):
""" % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
dtype_neib_shape=node.inputs[1].dtype,
dtype_neib_step=node.inputs[2].dtype,
err_check=err_check,
sync=sync,
name=name,
params=sub['params'],
fail=sub['fail'])
def perform(self, node, inp, out, params):
# Disable the perform method from the CPU version
Op.perform(self, node, inp, out, ctx)
Op.perform(self, node, inp, out, params)
@register_opt('fast_compile')
......
......@@ -7,16 +7,15 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
"""
from __future__ import absolute_import, print_function, division
import numpy as np
from theano import Apply, tensor
from theano.gof import local_optimizer
from theano.sandbox.rng_mrg import mrg_uniform_base, mrg_uniform
from theano.tensor import as_tensor_variable, get_vector_length
from theano.scalar import int32 as int_t
from .basic_ops import (GpuKernelBase, Kernel, infer_context_name,
host_from_gpu, as_gpuarray_variable)
from .type import GpuArrayType
from .type import GpuArrayType, gpu_context_type
from .fp16_help import write_w
from .opt import register_opt, register_opt2
......@@ -24,6 +23,9 @@ from .opt import register_opt, register_opt2
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version
_f16_ok = True
params_type = mrg_uniform_base.params_type.extended(otypecode=int_t, context=gpu_context_type)
otypecode = property(lambda self: self.output_type.typecode)
def make_node(self, rstate, size):
# error checking slightly redundant here, since
......@@ -39,6 +41,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
[rstate, size],
[rstate.type(), output_type])
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
@classmethod
def new(cls, rstate, ndim, dtype, size):
v_size = as_tensor_variable(size)
......@@ -168,40 +173,34 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
]
def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
o_rstate, o_sample = out
inplace = int(self.inplace)
ndim = self.output_type.ndim
o_type_num = np.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
ctx = sub['params']
kname = self.gpu_kernels(node, nodename)[0].objvar
otypecode = str(self.output_type.typecode)
return """
npy_int64 M1 = 2147483647; //2^31 - 1
// The +1 is to avoid odims[0] which fails on windows
size_t odims[%(ndim)s+1];
size_t n_elements = 1;
unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
|| !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
|| (PyGpuArray_NDIM(%(o_sample)s) != %(ndim)s));
|| (PyGpuArray_NDIM(%(o_sample)s) != %(params)s->ndim));
size_t* odims = (size_t*)malloc(%(params)s->ndim * sizeof(size_t));
if (odims == NULL) {
PyErr_NoMemory();
%(just_fail)s
}
if (PyArray_NDIM(%(size)s) != 1)
{
PyErr_SetString(PyExc_ValueError, "size must be vector");
%(fail)s
}
if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
if (PyArray_DIMS(%(size)s)[0] != %(params)s->ndim)
{
PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
%(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(params)s->ndim, PyArray_DIMS(%(size)s)[0]);
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i)
for (int i = 0; i < %(params)s->ndim; ++i)
{
odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i];
......@@ -219,8 +218,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
if (must_alloc_sample)
{
Py_XDECREF(%(o_sample)s);
%(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER,
%(ctx)s, Py_None);
%(o_sample)s = pygpu_empty(%(params)s->ndim, odims, %(params)s->otypecode, GA_C_ORDER,
%(params)s->context, Py_None);
if(!%(o_sample)s)
{
%(fail)s;
......@@ -233,7 +232,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
}
Py_XDECREF(%(o_rstate)s);
if (%(inplace)s)
if (%(params)s->inplace)
{
Py_INCREF(%(rstate)s);
%(o_rstate)s = %(rstate)s;
......@@ -285,10 +284,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(fail)s
}
}
""" % locals()
free(odims);
""" % dict(rstate=inp[0], size=inp[1],
o_rstate=out[0], o_sample=out[1],
kname=self.gpu_kernels(node, nodename)[0].objvar,
params=sub['params'],
just_fail=sub['fail'],
fail="""
{
free(odims);
%(fail)s
}
""" % dict(fail=sub['fail']))
def c_code_cache_version(self):
return (14,)
return (15,)
@register_opt2([mrg_uniform], 'fast_compile')
......
......@@ -7,9 +7,11 @@ from six import integer_types
from six.moves import StringIO
from theano import tensor, gof, Op
from theano.gof import ParamsType
from theano.gradient import grad_not_implemented
import theano.tensor as T
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
from theano.scalar import bool as bool_t, int32 as int_t, uint32 as size_t
try:
import pygpu
......@@ -594,7 +596,15 @@ class GpuAdvancedIncSubtensor1(Op):
"""
_f16_ok = True
__props__ = ('inplace', 'set_instead_of_inc')
params_type = gpu_context_type
params_type = ParamsType(inplace=bool_t,
set_instead_of_inc=bool_t,
context=gpu_context_type,
# following params are used into c_init_code_struct(),
# as inputs are not available in that function.
ndim_input_0=size_t,
ndim_input_1=size_t,
typecode_input_0=int_t,
typecode_input_1=int_t)
def __init__(self, inplace=False, set_instead_of_inc=False):
self.inplace = inplace
......@@ -634,12 +644,17 @@ class GpuAdvancedIncSubtensor1(Op):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_params(self, node):
return node.outputs[0].type.context
return self.params_type.get_params(self, context=node.outputs[0].type.context,
# following params are used into c_init_code_struct().
ndim_input_0=node.inputs[0].ndim,
ndim_input_1=node.inputs[1].ndim,
typecode_input_0=node.inputs[0].type.typecode,
typecode_input_1=node.inputs[1].type.typecode)
# We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case.
def perform(self, node, inp, out_, ctx=None):
def perform(self, node, inp, out_, params=None):
# TODO opt to make this inplace
x, y, idx = inp
out, = out_
......@@ -700,21 +715,18 @@ class GpuAdvancedIncSubtensor1(Op):
return """
gpuelemwise_arg args[2] = {{0}};
args[0].name = "a";
args[0].typecode = %(type1)s;
args[0].typecode = %(params)s->typecode_input_0;
args[0].flags = GE_READ|GE_WRITE;
args[1].name = "b";
args[1].typecode = %(type2)s;
args[1].typecode = %(params)s->typecode_input_1;
args[1].flags = GE_READ;
iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b",
2, args, %(nd)s, GE_CONVERT_F16);
iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b",
2, args, %(params)s->ndim_input_1, GE_CONVERT_F16);
if (iadd == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
%(fail)s
}
""" % dict(ctx=sub['params'], fail=sub['fail'],
type1=node.inputs[0].type.typecode,
type2=node.inputs[1].type.typecode,
nd=node.inputs[1].ndim)
""" % dict(params=sub['params'], fail=sub['fail'])
def c_code(self, node, name, inputs, outputs, sub):
if (node.inputs[0].ndim != node.inputs[1].ndim):
......@@ -722,18 +734,26 @@ class GpuAdvancedIncSubtensor1(Op):
return """
PyGpuArrayObject *row_x, *row_y;
ssize_t start[%(nd)s], step[%(nd)s];
size_t nd = %(params)s->ndim_input_0;
ssize_t *start = NULL, *step = NULL;
size_t num_indices, j;
int ret;
int broadcast_y;
for (j = 0; j < %(nd)s; j++) {
start = (ssize_t*)malloc(nd * sizeof(ssize_t));
step = (ssize_t*)malloc(nd * sizeof(ssize_t));
if (start == NULL || step == NULL) {
PyErr_NoMemory();
%(fail)s
}
for (j = 0; j < nd; ++j) {
start[j] = 0;
step[j] = 1;
}
step[0] = 0;
num_indices = PyArray_SIZE(%(ind)s);
if (!%(inplace)s) {
if (!%(params)s->inplace) {
%(out)s = theano_try_copy(%(out)s, %(x)s);
if (%(out)s == NULL) {
// Exception already set
......@@ -774,7 +794,7 @@ class GpuAdvancedIncSubtensor1(Op):
%(fail)s;
}
if (%(set_instead_of_inc)s) {
if (%(params)s->set_instead_of_inc) {
ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
} else {
void *args[2];
......@@ -788,13 +808,21 @@ class GpuAdvancedIncSubtensor1(Op):
PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
}
}
free(start);
free(step);
""" % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0],
fail=sub['fail'], inplace=int(self.inplace),
nd=node.inputs[0].ndim,
set_instead_of_inc=int(self.set_instead_of_inc))
params=sub['params'],
fail="""
{
free(start);
free(step);
%(fail)s
}
""" % dict(fail=sub['fail']))
def c_code_cache_version(self):
return (3,)
return (4,)
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
......@@ -805,6 +833,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
"""
_f16_ok = True
params_type = GpuAdvancedIncSubtensor1.params_type
get_params = GpuAdvancedIncSubtensor1.get_params
def make_node(self, x, y, ilist):
"""
......@@ -837,14 +867,11 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_params(self, node):
return node.outputs[0].type.context
def perform(self, node, inp, out, ctx):
def perform(self, node, inp, out, params):
return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
def c_code_cache_version(self):
return (12,)
return (13,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray_helper.h>',
......@@ -854,7 +881,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return [os.path.dirname(__file__)]
def c_code(self, node, name, inputs, outputs, sub):
ctx = self.get_params(node)
ctx = self.get_params(node).context
if ctx.kind != b'cuda':
raise NotImplementedError("cuda only")
if (node.inputs[0].ndim != node.inputs[1].ndim or
......@@ -862,16 +889,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
int(ctx.bin_id[-2]) < 2):
raise NotImplementedError("This case does not have C code yet.")
x = inputs[0]
y = inputs[1]
ind = inputs[2]
out = outputs[0]
fail = sub['fail']
set_instead_of_inc = int(self.set_instead_of_inc)
inplace = int(self.inplace)
return """
int err;
if (%(inplace)s) {
if (%(params)s->inplace) {
Py_XDECREF(%(out)s);
%(out)s = %(x)s;
Py_INCREF(%(out)s);
......@@ -882,25 +902,19 @@ if (!%(out)s) {
// Exception already set
%(fail)s
}
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(set_instead_of_inc)s)) {
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of_inc)) {
%(fail)s
}
""" % locals()
""" % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0], fail=sub['fail'], params=sub['params'])
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = np.dtype(dtype_x).itemsize
itemsize_y = np.dtype(dtype_y).itemsize
itemsize_ind = np.dtype(dtype_ind).itemsize
itemsize_out = np.dtype(dtype_out).itemsize
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out)
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """
......@@ -1010,7 +1024,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
}
return;
}
""" % locals()
""" % dict(type_x=type_x, type_y=type_y, type_ind=type_ind)
params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
......@@ -1020,26 +1034,19 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
flags=flags, objvar=k_var)]
def c_support_code_struct(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = np.dtype(dtype_x).itemsize
itemsize_y = np.dtype(dtype_y).itemsize
itemsize_ind = np.dtype(dtype_ind).itemsize
itemsize_out = np.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr,
PyGpuArrayObject* indices_arr,
const int set_instead_of_inc)
{
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
gpudata *errbuf;
int err, kerr = 0;
size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
err = gpudata_property(py_self->ga.data,
......@@ -1049,11 +1056,11 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
return 1;
}
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s;
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / itemsize_x;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / itemsize_x;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind;
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
(void *)&PyGpuArray_DIMS(py_self)[1],
(void *)&stride_X0,
......@@ -1093,7 +1100,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
}
return 0;
}
""" % locals()
""" % dict(k_var="k_vector_add_fast_" + nodename)
class GpuExtractDiag(Op):
......
......@@ -8,6 +8,7 @@ import numpy as np
import theano
from theano import Op, Apply
from theano.gof import EnumList
import theano.tensor as T
from theano.gradient import grad_not_implemented
from theano.gradient import grad_undefined
......@@ -39,13 +40,21 @@ class Images2Neibs(Op):
"""
__props__ = ("mode",)
BORDER_MODE = EnumList(('MODE_VALID', 'valid'),
('MODE_HALF', 'half'),
('MODE_FULL', 'full'),
('MODE_WRAP_CENTERED', 'wrap_centered'),
('MODE_IGNORE_BORDERS', 'ignore_borders'))
params_type = BORDER_MODE
def get_params(self, node):
return self.mode
def __init__(self, mode='valid'):
if mode not in ['valid', 'half', 'full',
'wrap_centered', 'ignore_borders']:
raise NotImplementedError("Only the mode valid, half, full, "
"ignore_borders and wrap_centered have "
"been implemented for Images2Neibs")
implemented_modes = self.BORDER_MODE.get_aliases()
if mode not in implemented_modes:
raise NotImplementedError("Only modes %s have been implemented for %s"
% (', '.join(implemented_modes), type(self).__name__))
self.mode = mode
def __str__(self):
......@@ -159,9 +168,9 @@ class Images2Neibs(Op):
grad_undefined(self, 2, neib_step)]
def c_code_cache_version(self):
return (8,)
return (10,)
def perform(self, node, inp, out_):
def perform(self, node, inp, out_, params):
ten4, neib_shape, neib_step = inp
z, = out_
# GpuImages2Neibs should not run this perform in DebugMode
......@@ -344,11 +353,6 @@ class Images2Neibs(Op):
return [(z_dim0, z_dim1)]
def c_code(self, node, name, inp, out, sub):
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
mode = self.mode
return """
#ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
......@@ -408,7 +412,7 @@ class Images2Neibs(Op):
%(fail)s;
}
if ( "%(mode)s" == "wrap_centered") {
if (%(mode)s == MODE_WRAP_CENTERED) {
if (c%%2!=1 || d%%2!=1){
PyErr_Format(PyExc_TypeError,
"Images2Neibs: in mode wrap_centered"
......@@ -430,7 +434,7 @@ class Images2Neibs(Op):
grid_c = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[2]),step_x);
grid_d = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[3]),step_y);
}else if ( "%(mode)s" == "valid") {
} else if (%(mode)s == MODE_VALID) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
{
......@@ -455,12 +459,12 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "ignore_borders") {
} else if (%(mode)s == MODE_IGNORE_BORDERS) {
//number of patch in height
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "half") {
} else if (%(mode)s == MODE_HALF) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
{
......@@ -485,7 +489,7 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
//number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
}else if ( "%(mode)s" == "full") {
} else if (%(mode)s == MODE_FULL) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
{
......@@ -510,9 +514,9 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
//number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
}else {
} else {
PyErr_Format(PyExc_TypeError,
"Images2Neibs: unknow mode '%(mode)s'");
"Images2Neibs: unknow mode %%d", %(mode)s);
%(fail)s;
}
......@@ -572,13 +576,13 @@ class Images2Neibs(Op):
for (int i = 0; i < c; i++) // loop over c
{
int ten4_2 = i + a * step_x;
if ( "%(mode)s" == "wrap_centered" ){
if (%(mode)s == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height;
} else if ( "%(mode)s" == "half" ){
} else if (%(mode)s == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ( "%(mode)s" == "full" ){
} else if (%(mode)s == MODE_FULL) {
ten4_2 -= c - 1;
}
if (ten4_2 < 0 | ten4_2 >= height) {
......@@ -588,13 +592,13 @@ class Images2Neibs(Op):
for (int j = 0; j < d; j++) // loop over d
{
int ten4_3 = j + b * step_y;
if ( "%(mode)s" == "wrap_centered" ){
if (%(mode)s == MODE_WRAP_CENTERED) {
ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
} else if ( "%(mode)s" == "half" ){
} else if (%(mode)s == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ( "%(mode)s" == "full" ){
} else if (%(mode)s == MODE_FULL) {
ten4_3 -= d - 1;
}
int z_col = j + d * i;
......@@ -609,7 +613,8 @@ class Images2Neibs(Op):
}
}
} // END NESTED SCOPE
""" % locals()
""" % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
fail=sub['fail'], mode=sub['params'])
def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
......
......@@ -12,7 +12,7 @@ import theano
from theano.compat import izip
from theano.gradient import DisconnectedType
from theano import gof
from theano.gof import Apply, hashtype, Op, Type, MethodNotDefined
from theano.gof import Apply, hashtype, Op, Type, MethodNotDefined, ParamsType
from theano.printing import pprint
from theano import scalar as scal
from theano.tensor.basic import alloc
......@@ -1685,6 +1685,7 @@ class AdvancedSubtensor1(Op):
# of the grad() method.
__props__ = ()
_f16_ok = True
check_input = False
def __init__(self, sparse_grad=False):
    """Create the op.

    Parameters
    ----------
    sparse_grad : bool
        Flag stored on the op; presumably selects a sparse gradient
        implementation in grad() -- TODO confirm against the grad() code,
        which is outside this view.
    """
    self.sparse_grad = sparse_grad
......@@ -1872,10 +1873,13 @@ class AdvancedIncSubtensor1(Op):
"""
__props__ = ('inplace', 'set_instead_of_inc')
check_input = False
params_type = ParamsType(inplace=scal.bool,
set_instead_of_inc=scal.bool)
def __init__(self, inplace=False, set_instead_of_inc=False):
    """Create the op.

    Parameters
    ----------
    inplace : bool
        If true, the op declares ``destroy_map = {0: [0]}`` and operates
        on its first input in place.
    set_instead_of_inc : bool
        If true, indexed positions are set to the new values instead of
        incremented.
    """
    # Coerce to real booleans so the ParamsType(inplace=scal.bool, ...)
    # declared on the class receives proper boolean values.  The earlier
    # un-coerced assignments were immediately overwritten by these and
    # have been removed as dead code.
    self.inplace = bool(inplace)
    self.set_instead_of_inc = bool(set_instead_of_inc)
    if inplace:
        self.destroy_map = {0: [0]}
......@@ -1955,17 +1959,11 @@ class AdvancedIncSubtensor1(Op):
raise NotImplementedError
x, y, idx = input_names
out = output_names[0]
fail = sub['fail']
inc_or_set = 1 - self.set_instead_of_inc
if self.inplace: # convert bool to int
inplace = 1
else:
inplace = 0
copy_of_x = self.copy_of_x(x)
return """
PyObject* rval = NULL;
if (%(inplace)s)
if (%(params)s->inplace)
{
if (%(x)s != %(out)s)
{
......@@ -1983,16 +1981,17 @@ class AdvancedIncSubtensor1(Op):
%(fail)s
}
}
if (inplace_increment(%(out)s, (PyObject *)%(idx)s, %(y)s, %(inc_or_set)d)) {
if (inplace_increment(%(out)s, (PyObject *)%(idx)s, %(y)s, (1 - %(params)s->set_instead_of_inc))) {
%(fail)s;
}
Py_XDECREF(rval);
""" % locals()
""" % dict(x=x, y=y, idx=idx, out=out, copy_of_x=copy_of_x,
params=sub['params'], fail=sub['fail'])
def c_code_cache_version(self):
    """Version tag for Theano's C-code compilation cache.

    Bumped to 8 when the C code switched from Python-interpolated literals
    (inplace / inc_or_set) to reading fields from the op params struct, so
    previously compiled modules are invalidated.
    """
    # The stale `return (6,)` from before the bump made the updated
    # return unreachable; the dead line has been removed.
    return (8,)
def perform(self, node, inp, out_):
def perform(self, node, inp, out_, params):
# TODO opt to make this inplace
x, y, idx = inp
out, = out_
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论