提交 5df0cfd8 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6091 from notoraptor/params-for-other-ops

Params for other ops
...@@ -346,6 +346,18 @@ class Shape_i(gof.Op): ...@@ -346,6 +346,18 @@ class Shape_i(gof.Op):
i = int(i) i = int(i)
self.i = i self.i = i
# NB:
# 1) params_type is defined as a property to avoid
# loop in Python import caused by importing theano.scalar below
# when params_type is defined directly in class code.
# 2) We wrap scalar into ParamsType (instead of directly using scalar as op param)
# to avoid Theano converting scalar param to constant that would be later
# hardcoded as literal in C code, making us lose all the advantages of
# using params.
@property
def params_type(self):
    """Op params wrapping ``self.i`` as an int64, exposed as ``params->i`` in C code."""
    return gof.ParamsType(i=theano.scalar.basic.int64)
def __str__(self): def __str__(self):
return '%s{%i}' % (self.__class__.__name__, self.i) return '%s{%i}' % (self.__class__.__name__, self.i)
...@@ -360,7 +372,7 @@ class Shape_i(gof.Op): ...@@ -360,7 +372,7 @@ class Shape_i(gof.Op):
(x, self.i)) (x, self.i))
return theano.Apply(self, [x], [theano.tensor.lscalar()]) return theano.Apply(self, [x], [theano.tensor.lscalar()])
def perform(self, node, inp, out_): def perform(self, node, inp, out_, params):
x, = inp x, = inp
out, = out_ out, = out_
if out[0] is None: if out[0] is None:
...@@ -383,7 +395,7 @@ class Shape_i(gof.Op): ...@@ -383,7 +395,7 @@ class Shape_i(gof.Op):
version.append((str(t), v)) version.append((str(t), v))
if version: if version:
version.append(1) version.append(2)
return tuple(version) return tuple(version)
...@@ -391,7 +403,8 @@ class Shape_i(gof.Op): ...@@ -391,7 +403,8 @@ class Shape_i(gof.Op):
iname, = inames iname, = inames
oname, = onames oname, = onames
fail = sub['fail'] fail = sub['fail']
i = self.i # i is then 'params->i', not just 'params'.
i = sub['params'] + '->i'
itype = node.inputs[0].type.__class__ itype = node.inputs[0].type.__class__
if itype in self.c_code_and_version: if itype in self.c_code_and_version:
......
...@@ -10,6 +10,9 @@ except ImportError: ...@@ -10,6 +10,9 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape, infer_context_name) from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape, infer_context_name)
from .opt import register_opt, op_lifter, register_opt2 from .opt import register_opt, op_lifter, register_opt2
from .type import gpu_context_type
from theano.gof import ParamsType
import theano.scalar as scalar
class GpuCumOp(GpuKernelBase, Op): class GpuCumOp(GpuKernelBase, Op):
...@@ -21,9 +24,12 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -21,9 +24,12 @@ class GpuCumOp(GpuKernelBase, Op):
""" """
SUPPORTED_NDIMS = 3 SUPPORTED_NDIMS = 3
__props__ = ('axis', 'mode') __props__ = ('axis', 'mode')
params_type = ParamsType(axis=scalar.int32,
context=gpu_context_type)
def __init__(self, axis, mode='add'): def __init__(self, axis, mode='add'):
self.axis = axis if axis else 0 assert axis is not None
self.axis = int(axis)
self.mode = mode self.mode = mode
def __eq__(self, other): def __eq__(self, other):
...@@ -35,7 +41,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -35,7 +41,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode) return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (7,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
...@@ -43,6 +49,9 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -43,6 +49,9 @@ class GpuCumOp(GpuKernelBase, Op):
def c_header_dirs(self): def c_header_dirs(self):
return [os.path.dirname(__file__)] return [os.path.dirname(__file__)]
def get_params(self, node):
    """Build the wrapped op params, binding the GPU context of the first input.

    The axis comes from ``self.axis`` (via the declared ``params_type``);
    only the context has to be supplied explicitly since it is
    node-dependent rather than an op attribute.
    """
    ctx = node.inputs[0].type.context
    return self.params_type.get_params(self, context=ctx)
def make_node(self, x): def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumOp" assert x.type.dtype == 'float32', "Only float32 supported for GpuCumOp"
...@@ -244,24 +253,18 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -244,24 +253,18 @@ class GpuCumOp(GpuKernelBase, Op):
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != b'cuda': if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError("cuda only") raise NotImplementedError("cuda only")
x, = inp return """
z, = out
axis = self.axis if self.axis is not None else 0
fail = sub['fail']
ctx = sub['params']
code = """
const size_t* shape = PyGpuArray_DIMS(%(x)s); const size_t* shape = PyGpuArray_DIMS(%(x)s);
bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s); bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);
int axis = %(axis)s; int axis = %(params)s->axis;
if (axis < 0) { if (axis < 0) {
// Convert negative axis to positive axis. // Convert negative axis to positive axis.
axis += PyGpuArray_NDIM(%(x)s); axis += PyGpuArray_NDIM(%(x)s);
} }
if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s) != 0){ if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s),
%(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) {
%(fail)s; %(fail)s;
} }
...@@ -270,17 +273,17 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -270,17 +273,17 @@ class GpuCumOp(GpuKernelBase, Op):
size_t max_grid_size1; size_t max_grid_size1;
size_t max_grid_size2; size_t max_grid_size2;
int err; int err;
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0); err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
%(fail)s; %(fail)s;
} }
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1); err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
%(fail)s; %(fail)s;
} }
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2); err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
%(fail)s; %(fail)s;
...@@ -289,9 +292,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -289,9 +292,7 @@ class GpuCumOp(GpuKernelBase, Op):
%(fail)s; %(fail)s;
} }
} }
""" % locals() """ % dict(x=inp[0], z=out[0], nodename=nodename, fail=sub['fail'], params=sub['params'])
return code
def c_support_code_struct(self, node, nodename): def c_support_code_struct(self, node, nodename):
code = """ code = """
......
...@@ -7,16 +7,15 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html ...@@ -7,16 +7,15 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import numpy as np
from theano import Apply, tensor from theano import Apply, tensor
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.sandbox.rng_mrg import mrg_uniform_base, mrg_uniform from theano.sandbox.rng_mrg import mrg_uniform_base, mrg_uniform
from theano.tensor import as_tensor_variable, get_vector_length from theano.tensor import as_tensor_variable, get_vector_length
from theano.scalar import int32 as int_t
from .basic_ops import (GpuKernelBase, Kernel, infer_context_name, from .basic_ops import (GpuKernelBase, Kernel, infer_context_name,
host_from_gpu, as_gpuarray_variable) host_from_gpu, as_gpuarray_variable)
from .type import GpuArrayType from .type import GpuArrayType, gpu_context_type
from .fp16_help import write_w from .fp16_help import write_w
from .opt import register_opt, register_opt2 from .opt import register_opt, register_opt2
...@@ -24,6 +23,9 @@ from .opt import register_opt, register_opt2 ...@@ -24,6 +23,9 @@ from .opt import register_opt, register_opt2
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version # GpuArray version
_f16_ok = True _f16_ok = True
params_type = mrg_uniform_base.params_type.extended(otypecode=int_t, context=gpu_context_type)
otypecode = property(lambda self: self.output_type.typecode)
def make_node(self, rstate, size): def make_node(self, rstate, size):
# error checking slightly redundant here, since # error checking slightly redundant here, since
...@@ -39,6 +41,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -39,6 +41,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
[rstate, size], [rstate, size],
[rstate.type(), output_type]) [rstate.type(), output_type])
def get_params(self, node):
    """Return the wrapped op params with the GPU context taken from the rstate input."""
    input_context = node.inputs[0].type.context
    return self.params_type.get_params(self, context=input_context)
@classmethod @classmethod
def new(cls, rstate, ndim, dtype, size): def new(cls, rstate, ndim, dtype, size):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
...@@ -168,40 +173,34 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -168,40 +173,34 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
] ]
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
o_rstate, o_sample = out
inplace = int(self.inplace)
ndim = self.output_type.ndim
o_type_num = np.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
ctx = sub['params']
kname = self.gpu_kernels(node, nodename)[0].objvar
otypecode = str(self.output_type.typecode)
return """ return """
npy_int64 M1 = 2147483647; //2^31 - 1 npy_int64 M1 = 2147483647; //2^31 - 1
// The +1 is to avoid odims[0] which fails on windows
size_t odims[%(ndim)s+1];
size_t n_elements = 1; size_t n_elements = 1;
unsigned int n_streams; unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| !pygpu_GpuArray_Check((PyObject*)%(o_sample)s) || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
|| !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS) || !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
|| (PyGpuArray_NDIM(%(o_sample)s) != %(ndim)s)); || (PyGpuArray_NDIM(%(o_sample)s) != %(params)s->ndim));
size_t* odims = (size_t*)malloc(%(params)s->ndim * sizeof(size_t));
if (odims == NULL) {
PyErr_NoMemory();
%(just_fail)s
}
if (PyArray_NDIM(%(size)s) != 1) if (PyArray_NDIM(%(size)s) != 1)
{ {
PyErr_SetString(PyExc_ValueError, "size must be vector"); PyErr_SetString(PyExc_ValueError, "size must be vector");
%(fail)s %(fail)s
} }
if (PyArray_DIMS(%(size)s)[0] != %(ndim)s) if (PyArray_DIMS(%(size)s)[0] != %(params)s->ndim)
{ {
PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)", PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
%(ndim)s, PyArray_DIMS(%(size)s)[0]); %(params)s->ndim, PyArray_DIMS(%(size)s)[0]);
%(fail)s %(fail)s
} }
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(params)s->ndim; ++i)
{ {
odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i); odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i]; n_elements *= odims[i];
...@@ -219,8 +218,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -219,8 +218,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
if (must_alloc_sample) if (must_alloc_sample)
{ {
Py_XDECREF(%(o_sample)s); Py_XDECREF(%(o_sample)s);
%(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER, %(o_sample)s = pygpu_empty(%(params)s->ndim, odims, %(params)s->otypecode, GA_C_ORDER,
%(ctx)s, Py_None); %(params)s->context, Py_None);
if(!%(o_sample)s) if(!%(o_sample)s)
{ {
%(fail)s; %(fail)s;
...@@ -233,7 +232,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -233,7 +232,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
} }
Py_XDECREF(%(o_rstate)s); Py_XDECREF(%(o_rstate)s);
if (%(inplace)s) if (%(params)s->inplace)
{ {
Py_INCREF(%(rstate)s); Py_INCREF(%(rstate)s);
%(o_rstate)s = %(rstate)s; %(o_rstate)s = %(rstate)s;
...@@ -285,10 +284,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -285,10 +284,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(fail)s %(fail)s
} }
} }
""" % locals()
free(odims);
""" % dict(rstate=inp[0], size=inp[1],
o_rstate=out[0], o_sample=out[1],
kname=self.gpu_kernels(node, nodename)[0].objvar,
params=sub['params'],
just_fail=sub['fail'],
fail="""
{
free(odims);
%(fail)s
}
""" % dict(fail=sub['fail']))
def c_code_cache_version(self): def c_code_cache_version(self):
return (14,) return (15,)
@register_opt2([mrg_uniform], 'fast_compile') @register_opt2([mrg_uniform], 'fast_compile')
......
...@@ -8,6 +8,7 @@ import numpy as np ...@@ -8,6 +8,7 @@ import numpy as np
import theano import theano
from theano import Op, Apply from theano import Op, Apply
from theano.gof import EnumList
import theano.tensor as T import theano.tensor as T
from theano.gradient import grad_not_implemented from theano.gradient import grad_not_implemented
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
...@@ -39,13 +40,21 @@ class Images2Neibs(Op): ...@@ -39,13 +40,21 @@ class Images2Neibs(Op):
""" """
__props__ = ("mode",) __props__ = ("mode",)
# Enum of the supported border modes: each C-level constant name is paired
# with its Python string alias (e.g. MODE_VALID <-> 'valid'), so C code can
# branch on integer constants instead of string comparisons.
BORDER_MODE = EnumList(('MODE_VALID', 'valid'),
                       ('MODE_HALF', 'half'),
                       ('MODE_FULL', 'full'),
                       ('MODE_WRAP_CENTERED', 'wrap_centered'),
                       ('MODE_IGNORE_BORDERS', 'ignore_borders'))
# The op param IS the border mode enum.
params_type = BORDER_MODE

def get_params(self, node):
    # ``self.mode`` is the string alias; presumably the EnumList params_type
    # maps it to the C enum constant used in c_code — confirm against
    # theano.gof.EnumList docs.
    return self.mode
def __init__(self, mode='valid'): def __init__(self, mode='valid'):
if mode not in ['valid', 'half', 'full', implemented_modes = self.BORDER_MODE.get_aliases()
'wrap_centered', 'ignore_borders']: if mode not in implemented_modes:
raise NotImplementedError("Only the mode valid, half, full, " raise NotImplementedError("Only modes %s have been implemented for %s"
"ignore_borders and wrap_centered have " % (', '.join(implemented_modes), type(self).__name__))
"been implemented for Images2Neibs")
self.mode = mode self.mode = mode
def __str__(self): def __str__(self):
...@@ -159,9 +168,9 @@ class Images2Neibs(Op): ...@@ -159,9 +168,9 @@ class Images2Neibs(Op):
grad_undefined(self, 2, neib_step)] grad_undefined(self, 2, neib_step)]
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) return (10,)
def perform(self, node, inp, out_): def perform(self, node, inp, out_, params):
ten4, neib_shape, neib_step = inp ten4, neib_shape, neib_step = inp
z, = out_ z, = out_
# GpuImages2Neibs should not run this perform in DebugMode # GpuImages2Neibs should not run this perform in DebugMode
...@@ -344,11 +353,6 @@ class Images2Neibs(Op): ...@@ -344,11 +353,6 @@ class Images2Neibs(Op):
return [(z_dim0, z_dim1)] return [(z_dim0, z_dim1)]
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
mode = self.mode
return """ return """
#ifndef CEIL_INTDIV #ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0)) #define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
...@@ -408,7 +412,7 @@ class Images2Neibs(Op): ...@@ -408,7 +412,7 @@ class Images2Neibs(Op):
%(fail)s; %(fail)s;
} }
if ( "%(mode)s" == "wrap_centered") { if (%(mode)s == MODE_WRAP_CENTERED) {
if (c%%2!=1 || d%%2!=1){ if (c%%2!=1 || d%%2!=1){
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"Images2Neibs: in mode wrap_centered" "Images2Neibs: in mode wrap_centered"
...@@ -430,7 +434,7 @@ class Images2Neibs(Op): ...@@ -430,7 +434,7 @@ class Images2Neibs(Op):
grid_c = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[2]),step_x); grid_c = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[2]),step_x);
grid_d = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[3]),step_y); grid_d = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[3]),step_y);
}else if ( "%(mode)s" == "valid") { } else if (%(mode)s == MODE_VALID) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0)) ( (((PyArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
{ {
...@@ -455,12 +459,12 @@ class Images2Neibs(Op): ...@@ -455,12 +459,12 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x); grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y); grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "ignore_borders") { } else if (%(mode)s == MODE_IGNORE_BORDERS) {
//number of patch in height //number of patch in height
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x); grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y); grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "half") { } else if (%(mode)s == MODE_HALF) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0)) ( (((PyArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
{ {
...@@ -485,7 +489,7 @@ class Images2Neibs(Op): ...@@ -485,7 +489,7 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x); grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y); grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
}else if ( "%(mode)s" == "full") { } else if (%(mode)s == MODE_FULL) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0)) ( (((PyArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
{ {
...@@ -510,9 +514,9 @@ class Images2Neibs(Op): ...@@ -510,9 +514,9 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]+c-2)/step_x); grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]+d-2)/step_y); grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
}else { } else {
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"Images2Neibs: unknow mode '%(mode)s'"); "Images2Neibs: unknow mode %%d", %(mode)s);
%(fail)s; %(fail)s;
} }
...@@ -572,13 +576,13 @@ class Images2Neibs(Op): ...@@ -572,13 +576,13 @@ class Images2Neibs(Op):
for (int i = 0; i < c; i++) // loop over c for (int i = 0; i < c; i++) // loop over c
{ {
int ten4_2 = i + a * step_x; int ten4_2 = i + a * step_x;
if ( "%(mode)s" == "wrap_centered" ){ if (%(mode)s == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height; if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height; else if (ten4_2 >= height) ten4_2 -= height;
} else if ( "%(mode)s" == "half" ){ } else if (%(mode)s == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ( "%(mode)s" == "full" ){ } else if (%(mode)s == MODE_FULL) {
ten4_2 -= c - 1; ten4_2 -= c - 1;
} }
if (ten4_2 < 0 | ten4_2 >= height) { if (ten4_2 < 0 | ten4_2 >= height) {
...@@ -588,13 +592,13 @@ class Images2Neibs(Op): ...@@ -588,13 +592,13 @@ class Images2Neibs(Op):
for (int j = 0; j < d; j++) // loop over d for (int j = 0; j < d; j++) // loop over d
{ {
int ten4_3 = j + b * step_y; int ten4_3 = j + b * step_y;
if ( "%(mode)s" == "wrap_centered" ){ if (%(mode)s == MODE_WRAP_CENTERED) {
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width; if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width; else if (ten4_3 >= width) ten4_3 -= width;
} else if ( "%(mode)s" == "half" ){ } else if (%(mode)s == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ( "%(mode)s" == "full" ){ } else if (%(mode)s == MODE_FULL) {
ten4_3 -= d - 1; ten4_3 -= d - 1;
} }
int z_col = j + d * i; int z_col = j + d * i;
...@@ -609,7 +613,8 @@ class Images2Neibs(Op): ...@@ -609,7 +613,8 @@ class Images2Neibs(Op):
} }
} }
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
fail=sub['fail'], mode=sub['params'])
def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'): def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
......
...@@ -12,7 +12,7 @@ import theano ...@@ -12,7 +12,7 @@ import theano
from theano.compat import izip from theano.compat import izip
from theano.gradient import DisconnectedType from theano.gradient import DisconnectedType
from theano import gof from theano import gof
from theano.gof import Apply, hashtype, Op, Type, MethodNotDefined from theano.gof import Apply, hashtype, Op, Type, MethodNotDefined, ParamsType
from theano.printing import pprint from theano.printing import pprint
from theano import scalar as scal from theano import scalar as scal
from theano.tensor.basic import alloc from theano.tensor.basic import alloc
...@@ -1685,6 +1685,7 @@ class AdvancedSubtensor1(Op): ...@@ -1685,6 +1685,7 @@ class AdvancedSubtensor1(Op):
# of the grad() method. # of the grad() method.
__props__ = () __props__ = ()
_f16_ok = True _f16_ok = True
check_input = False
def __init__(self, sparse_grad=False): def __init__(self, sparse_grad=False):
self.sparse_grad = sparse_grad self.sparse_grad = sparse_grad
...@@ -1872,10 +1873,13 @@ class AdvancedIncSubtensor1(Op): ...@@ -1872,10 +1873,13 @@ class AdvancedIncSubtensor1(Op):
""" """
__props__ = ('inplace', 'set_instead_of_inc') __props__ = ('inplace', 'set_instead_of_inc')
check_input = False
params_type = ParamsType(inplace=scal.bool,
set_instead_of_inc=scal.bool)
def __init__(self, inplace=False, set_instead_of_inc=False): def __init__(self, inplace=False, set_instead_of_inc=False):
self.inplace = inplace self.inplace = bool(inplace)
self.set_instead_of_inc = set_instead_of_inc self.set_instead_of_inc = bool(set_instead_of_inc)
if inplace: if inplace:
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
...@@ -1955,17 +1959,11 @@ class AdvancedIncSubtensor1(Op): ...@@ -1955,17 +1959,11 @@ class AdvancedIncSubtensor1(Op):
raise NotImplementedError raise NotImplementedError
x, y, idx = input_names x, y, idx = input_names
out = output_names[0] out = output_names[0]
fail = sub['fail']
inc_or_set = 1 - self.set_instead_of_inc
if self.inplace: # convert bool to int
inplace = 1
else:
inplace = 0
copy_of_x = self.copy_of_x(x) copy_of_x = self.copy_of_x(x)
return """ return """
PyObject* rval = NULL; PyObject* rval = NULL;
if (%(inplace)s) if (%(params)s->inplace)
{ {
if (%(x)s != %(out)s) if (%(x)s != %(out)s)
{ {
...@@ -1983,16 +1981,17 @@ class AdvancedIncSubtensor1(Op): ...@@ -1983,16 +1981,17 @@ class AdvancedIncSubtensor1(Op):
%(fail)s %(fail)s
} }
} }
if (inplace_increment(%(out)s, (PyObject *)%(idx)s, %(y)s, %(inc_or_set)d)) { if (inplace_increment(%(out)s, (PyObject *)%(idx)s, %(y)s, (1 - %(params)s->set_instead_of_inc))) {
%(fail)s; %(fail)s;
} }
Py_XDECREF(rval); Py_XDECREF(rval);
""" % locals() """ % dict(x=x, y=y, idx=idx, out=out, copy_of_x=copy_of_x,
params=sub['params'], fail=sub['fail'])
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (8,)
def perform(self, node, inp, out_): def perform(self, node, inp, out_, params):
# TODO opt to make this inplace # TODO opt to make this inplace
x, y, idx = inp x, y, idx = inp
out, = out_ out, = out_
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论