提交 5df0cfd8 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6091 from notoraptor/params-for-other-ops

Params for other ops
......@@ -346,6 +346,18 @@ class Shape_i(gof.Op):
i = int(i)
self.i = i
# NB:
# 1) params_type is defined as a property to avoid
# loop in Python import caused by importing theano.scalar below
# when params_type is defined directly in class code.
# 2) We wrap scalar into ParamsType (instead of directly using scalar as op param)
# to avoid Theano converting scalar param to constant that would be later
# hardcoded as litteral in C code, making us loose all the advantages of
# using params.
@property
def params_type(self):
return gof.ParamsType(i=theano.scalar.basic.int64)
def __str__(self):
return '%s{%i}' % (self.__class__.__name__, self.i)
......@@ -360,7 +372,7 @@ class Shape_i(gof.Op):
(x, self.i))
return theano.Apply(self, [x], [theano.tensor.lscalar()])
def perform(self, node, inp, out_):
def perform(self, node, inp, out_, params):
x, = inp
out, = out_
if out[0] is None:
......@@ -383,7 +395,7 @@ class Shape_i(gof.Op):
version.append((str(t), v))
if version:
version.append(1)
version.append(2)
return tuple(version)
......@@ -391,7 +403,8 @@ class Shape_i(gof.Op):
iname, = inames
oname, = onames
fail = sub['fail']
i = self.i
# i is then 'params->i', not just 'params'.
i = sub['params'] + '->i'
itype = node.inputs[0].type.__class__
if itype in self.c_code_and_version:
......
......@@ -10,6 +10,9 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape, infer_context_name)
from .opt import register_opt, op_lifter, register_opt2
from .type import gpu_context_type
from theano.gof import ParamsType
import theano.scalar as scalar
class GpuCumOp(GpuKernelBase, Op):
......@@ -21,9 +24,12 @@ class GpuCumOp(GpuKernelBase, Op):
"""
SUPPORTED_NDIMS = 3
__props__ = ('axis', 'mode')
params_type = ParamsType(axis=scalar.int32,
context=gpu_context_type)
def __init__(self, axis, mode='add'):
self.axis = axis if axis else 0
assert axis is not None
self.axis = int(axis)
self.mode = mode
def __eq__(self, other):
......@@ -35,7 +41,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self):
return (6,)
return (7,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
......@@ -43,6 +49,9 @@ class GpuCumOp(GpuKernelBase, Op):
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumOp"
......@@ -244,24 +253,18 @@ class GpuCumOp(GpuKernelBase, Op):
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError("cuda only")
x, = inp
z, = out
axis = self.axis if self.axis is not None else 0
fail = sub['fail']
ctx = sub['params']
code = """
return """
const size_t* shape = PyGpuArray_DIMS(%(x)s);
bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);
int axis = %(axis)s;
int axis = %(params)s->axis;
if (axis < 0) {
// Convert negative axis to positive axis.
axis += PyGpuArray_NDIM(%(x)s);
}
if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s) != 0){
if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s),
%(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) {
%(fail)s;
}
......@@ -270,17 +273,17 @@ class GpuCumOp(GpuKernelBase, Op):
size_t max_grid_size1;
size_t max_grid_size2;
int err;
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
%(fail)s;
}
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
%(fail)s;
}
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
%(fail)s;
......@@ -289,9 +292,7 @@ class GpuCumOp(GpuKernelBase, Op):
%(fail)s;
}
}
""" % locals()
return code
""" % dict(x=inp[0], z=out[0], nodename=nodename, fail=sub['fail'], params=sub['params'])
def c_support_code_struct(self, node, nodename):
code = """
......
from __future__ import absolute_import, print_function, division
import numpy as np
from theano import Op, Apply, config
from theano.gof import ParamsType
from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
......@@ -14,7 +13,7 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from .opt import register_opt2, op_lifter, register_opt
from .type import GpuArrayType
from .type import GpuArrayType, gpu_context_type
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
......@@ -22,13 +21,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Images2Neibs for the GPU.
"""
def __init__(self, mode='valid'):
if mode not in ['valid', 'half', 'full',
'ignore_borders', 'wrap_centered']:
raise NotImplementedError("Only the mode valid, half, full, "
"ignore_borders and wrap_centered have "
"been implemented for GpuImages2Neibs")
self.mode = mode
params_type = ParamsType(mode=Images2Neibs.BORDER_MODE, context=gpu_context_type)
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def make_node(self, ten4, neib_shape, neib_step=None):
ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
......@@ -50,7 +46,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
context_name=ten4.type.context_name)()])
def c_code_cache_version(self):
return (12,)
return (13,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
......@@ -61,13 +57,16 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode
# `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here.
mode_constants = self.BORDER_MODE.c_support_code()
kernels = []
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = """
// a version that uses less registers but doesn't work in all cases.
// a version that uses less registers but doesn't work in all cases.
%(mode_constants)s
KERNEL void %(kname)s(
const ga_int mode,
const ga_int nb_batch,
const ga_int nb_stack,
const ga_int height,
......@@ -110,29 +109,29 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
ga_int i = LID_1; // loop over c
{
ga_int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){
if(mode == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 )
ten4_2 += height;
else if (ten4_2 >= height)
ten4_2 -= height;
} else if ("%(mode)s"=="half"){
} else if (mode == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ("%(mode)s"=="full"){
} else if (mode == MODE_FULL) {
ten4_2 -= c - 1;
}
ga_int j = LID_0; // loop over d
{
ga_int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
if(mode == MODE_WRAP_CENTERED){
ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 )
ten4_3 += width;
else if (ten4_3 >= width)
ten4_3 -= width;
} else if ("%(mode)s"=="half"){
} else if (mode == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ("%(mode)s"=="full"){
} else if (mode == MODE_FULL) {
ten4_3 -= d - 1;
}
......@@ -150,8 +149,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}
}
}
}""" % locals()
}""" % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants)
params = [
'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
......@@ -165,7 +165,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename
code = """
%(mode_constants)s
KERNEL void %(kname)s(
const ga_int mode,
const ga_int nb_batch,
const ga_int nb_stack,
const ga_int height,
......@@ -209,30 +211,30 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
for (ga_int i = LID_1; i < c; i+=LDIM_1)
{
ga_int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){
if(mode == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 )
ten4_2 += height;
else if (ten4_2 >= height)
ten4_2 -= height;
} else if ("%(mode)s"=="half"){
} else if (mode == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ("%(mode)s"=="full"){
} else if (mode == MODE_FULL) {
ten4_2 -= c - 1;
}
// loop over d
for (ga_int j = LID_0; j < d; j+=LDIM_0)
{
ga_int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
if(mode == MODE_WRAP_CENTERED) {
ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 )
ten4_3 += width;
else if (ten4_3 >= width)
ten4_3 -= width;
} else if ("%(mode)s"=="half"){
} else if (mode == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ("%(mode)s"=="full"){
} else if (mode == MODE_FULL) {
ten4_3 -= d - 1;
}
......@@ -251,8 +253,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}
}
}
""" % locals()
""" % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants)
params = [
'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
......@@ -274,18 +277,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
"""
def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype
dtype_neib_shape = node.inputs[1].dtype
dtype_neib_step = node.inputs[2].dtype
dtype_z = node.outputs[0].dtype
itemsize_ten4 = np.dtype(dtype_ten4).itemsize
itemsize_z = np.dtype(dtype_z).itemsize
typecode_z = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
ctx = sub['params']
mode = self.mode
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
......@@ -293,16 +284,23 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
GpuKernel_error(fptr, err));
%(fail)s;
}
""" % locals()
""" % dict(fail=sub['fail'])
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
""" % dict(z=out[0], err_check=err_check)
# NB: To reduce C code variability:
# For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize
# For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node)
# For typecode_z, I use ten4->ga.typecode (for same reason as above)
return """
int grid_c = -1;
int grid_d = -1;
size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga);
size_t itemsize_z = itemsize_ten4;
int typecode_z = %(ten4)s->ga.typecode;
{
if (PyGpuArray_NDIM(%(ten4)s) != 4)
......@@ -351,10 +349,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(fail)s;
}
if ( "%(mode)s" == "wrap_centered") {
if (%(params)s->mode == MODE_WRAP_CENTERED) {
if (c%%2!=1 || d%%2!=1){
PyErr_Format(PyExc_TypeError,
"GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
"GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
%(fail)s;
}
if ( PyGpuArray_DIMS(%(ten4)s)[2] < c ||
......@@ -375,7 +373,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
(size_t)step_y);
}else if ( "%(mode)s" == "valid") {
} else if (%(params)s->mode == MODE_VALID) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
{
......@@ -400,12 +398,12 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "ignore_borders") {
} else if (%(params)s->mode == MODE_IGNORE_BORDERS) {
//number of patch in height
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "half") {
} else if (%(params)s->mode == MODE_HALF) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
{
......@@ -430,7 +428,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
}else if ( "%(mode)s" == "full") {
} else if (%(params)s->mode == MODE_FULL) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
{
......@@ -455,9 +453,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
}else{
} else {
PyErr_Format(PyExc_TypeError,
"GpuImages2Neibs:: unknown mode '%(mode)s'");
"GpuImages2Neibs:: unknown mode %%d", %(params)s->mode);
%(fail)s;
}
......@@ -476,8 +474,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t dims[2];
dims[0] = z_dim0;
dims[1] = z_dim1;
%(z)s = pygpu_empty(2, dims, %(typecode_z)s,
GA_C_ORDER, %(ctx)s, Py_None);
%(z)s = pygpu_empty(2, dims, typecode_z,
GA_C_ORDER, %(params)s->context, Py_None);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
......@@ -490,6 +488,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
{ // NESTED SCOPE
const int mode = %(params)s->mode;
const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0];
const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1];
const int height = PyGpuArray_DIMS(%(ten4)s)[2];
......@@ -507,7 +506,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t threads_per_block[3] = {d, c, 1};
//get the max threads per blocks
size_t max_threads_dim;
int err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
%(fail)s;
......@@ -535,14 +534,19 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}else{
fptr = &k_multi_warp_%(name)s;
}
// printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n", max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1], n_blocks[2]);
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s;
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s;
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
void *kernel_params[] = {(void *)&nb_batch,
/*
printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n",
max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], n_blocks[2]);
*/
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4;
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4;
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z;
void *kernel_params[] = {(void *)&mode,
(void *)&nb_batch,
(void *)&nb_stack,
(void *)&height, (void *)&width,
(void *)&c, (void *)&d,
......@@ -562,11 +566,18 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(err_check)s
%(sync)s
} // END NESTED SCOPE
""" % locals()
def perform(self, node, inp, out, ctx):
""" % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
dtype_neib_shape=node.inputs[1].dtype,
dtype_neib_step=node.inputs[2].dtype,
err_check=err_check,
sync=sync,
name=name,
params=sub['params'],
fail=sub['fail'])
def perform(self, node, inp, out, params):
# Disable the perform method from the CPU version
Op.perform(self, node, inp, out, ctx)
Op.perform(self, node, inp, out, params)
@register_opt('fast_compile')
......
......@@ -7,16 +7,15 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
"""
from __future__ import absolute_import, print_function, division
import numpy as np
from theano import Apply, tensor
from theano.gof import local_optimizer
from theano.sandbox.rng_mrg import mrg_uniform_base, mrg_uniform
from theano.tensor import as_tensor_variable, get_vector_length
from theano.scalar import int32 as int_t
from .basic_ops import (GpuKernelBase, Kernel, infer_context_name,
host_from_gpu, as_gpuarray_variable)
from .type import GpuArrayType
from .type import GpuArrayType, gpu_context_type
from .fp16_help import write_w
from .opt import register_opt, register_opt2
......@@ -24,6 +23,9 @@ from .opt import register_opt, register_opt2
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version
_f16_ok = True
params_type = mrg_uniform_base.params_type.extended(otypecode=int_t, context=gpu_context_type)
otypecode = property(lambda self: self.output_type.typecode)
def make_node(self, rstate, size):
# error checking slightly redundant here, since
......@@ -39,6 +41,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
[rstate, size],
[rstate.type(), output_type])
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
@classmethod
def new(cls, rstate, ndim, dtype, size):
v_size = as_tensor_variable(size)
......@@ -168,40 +173,34 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
]
def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
o_rstate, o_sample = out
inplace = int(self.inplace)
ndim = self.output_type.ndim
o_type_num = np.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
ctx = sub['params']
kname = self.gpu_kernels(node, nodename)[0].objvar
otypecode = str(self.output_type.typecode)
return """
npy_int64 M1 = 2147483647; //2^31 - 1
// The +1 is to avoid odims[0] which fails on windows
size_t odims[%(ndim)s+1];
size_t n_elements = 1;
unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
|| !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
|| (PyGpuArray_NDIM(%(o_sample)s) != %(ndim)s));
|| (PyGpuArray_NDIM(%(o_sample)s) != %(params)s->ndim));
size_t* odims = (size_t*)malloc(%(params)s->ndim * sizeof(size_t));
if (odims == NULL) {
PyErr_NoMemory();
%(just_fail)s
}
if (PyArray_NDIM(%(size)s) != 1)
{
PyErr_SetString(PyExc_ValueError, "size must be vector");
%(fail)s
}
if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
if (PyArray_DIMS(%(size)s)[0] != %(params)s->ndim)
{
PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
%(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(params)s->ndim, PyArray_DIMS(%(size)s)[0]);
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i)
for (int i = 0; i < %(params)s->ndim; ++i)
{
odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i];
......@@ -219,8 +218,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
if (must_alloc_sample)
{
Py_XDECREF(%(o_sample)s);
%(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER,
%(ctx)s, Py_None);
%(o_sample)s = pygpu_empty(%(params)s->ndim, odims, %(params)s->otypecode, GA_C_ORDER,
%(params)s->context, Py_None);
if(!%(o_sample)s)
{
%(fail)s;
......@@ -233,7 +232,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
}
Py_XDECREF(%(o_rstate)s);
if (%(inplace)s)
if (%(params)s->inplace)
{
Py_INCREF(%(rstate)s);
%(o_rstate)s = %(rstate)s;
......@@ -285,10 +284,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(fail)s
}
}
""" % locals()
free(odims);
""" % dict(rstate=inp[0], size=inp[1],
o_rstate=out[0], o_sample=out[1],
kname=self.gpu_kernels(node, nodename)[0].objvar,
params=sub['params'],
just_fail=sub['fail'],
fail="""
{
free(odims);
%(fail)s
}
""" % dict(fail=sub['fail']))
def c_code_cache_version(self):
return (14,)
return (15,)
@register_opt2([mrg_uniform], 'fast_compile')
......
......@@ -7,9 +7,11 @@ from six import integer_types
from six.moves import StringIO
from theano import tensor, gof, Op
from theano.gof import ParamsType
from theano.gradient import grad_not_implemented
import theano.tensor as T
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
from theano.scalar import bool as bool_t, int32 as int_t, uint32 as size_t
try:
import pygpu
......@@ -594,7 +596,15 @@ class GpuAdvancedIncSubtensor1(Op):
"""
_f16_ok = True
__props__ = ('inplace', 'set_instead_of_inc')
params_type = gpu_context_type
params_type = ParamsType(inplace=bool_t,
set_instead_of_inc=bool_t,
context=gpu_context_type,
# following params are used into c_init_code_struct(),
# as inputs are not available in that function.
ndim_input_0=size_t,
ndim_input_1=size_t,
typecode_input_0=int_t,
typecode_input_1=int_t)
def __init__(self, inplace=False, set_instead_of_inc=False):
self.inplace = inplace
......@@ -634,12 +644,17 @@ class GpuAdvancedIncSubtensor1(Op):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_params(self, node):
return node.outputs[0].type.context
return self.params_type.get_params(self, context=node.outputs[0].type.context,
# following params are used into c_init_code_struct().
ndim_input_0=node.inputs[0].ndim,
ndim_input_1=node.inputs[1].ndim,
typecode_input_0=node.inputs[0].type.typecode,
typecode_input_1=node.inputs[1].type.typecode)
# We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case.
def perform(self, node, inp, out_, ctx=None):
def perform(self, node, inp, out_, params=None):
# TODO opt to make this inplace
x, y, idx = inp
out, = out_
......@@ -700,21 +715,18 @@ class GpuAdvancedIncSubtensor1(Op):
return """
gpuelemwise_arg args[2] = {{0}};
args[0].name = "a";
args[0].typecode = %(type1)s;
args[0].typecode = %(params)s->typecode_input_0;
args[0].flags = GE_READ|GE_WRITE;
args[1].name = "b";
args[1].typecode = %(type2)s;
args[1].typecode = %(params)s->typecode_input_1;
args[1].flags = GE_READ;
iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b",
2, args, %(nd)s, GE_CONVERT_F16);
iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b",
2, args, %(params)s->ndim_input_1, GE_CONVERT_F16);
if (iadd == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
%(fail)s
}
""" % dict(ctx=sub['params'], fail=sub['fail'],
type1=node.inputs[0].type.typecode,
type2=node.inputs[1].type.typecode,
nd=node.inputs[1].ndim)
""" % dict(params=sub['params'], fail=sub['fail'])
def c_code(self, node, name, inputs, outputs, sub):
if (node.inputs[0].ndim != node.inputs[1].ndim):
......@@ -722,18 +734,26 @@ class GpuAdvancedIncSubtensor1(Op):
return """
PyGpuArrayObject *row_x, *row_y;
ssize_t start[%(nd)s], step[%(nd)s];
size_t nd = %(params)s->ndim_input_0;
ssize_t *start = NULL, *step = NULL;
size_t num_indices, j;
int ret;
int broadcast_y;
for (j = 0; j < %(nd)s; j++) {
start = (ssize_t*)malloc(nd * sizeof(ssize_t));
step = (ssize_t*)malloc(nd * sizeof(ssize_t));
if (start == NULL || step == NULL) {
PyErr_NoMemory();
%(fail)s
}
for (j = 0; j < nd; ++j) {
start[j] = 0;
step[j] = 1;
}
step[0] = 0;
num_indices = PyArray_SIZE(%(ind)s);
if (!%(inplace)s) {
if (!%(params)s->inplace) {
%(out)s = theano_try_copy(%(out)s, %(x)s);
if (%(out)s == NULL) {
// Exception already set
......@@ -774,7 +794,7 @@ class GpuAdvancedIncSubtensor1(Op):
%(fail)s;
}
if (%(set_instead_of_inc)s) {
if (%(params)s->set_instead_of_inc) {
ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
} else {
void *args[2];
......@@ -788,13 +808,21 @@ class GpuAdvancedIncSubtensor1(Op):
PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
}
}
free(start);
free(step);
""" % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0],
fail=sub['fail'], inplace=int(self.inplace),
nd=node.inputs[0].ndim,
set_instead_of_inc=int(self.set_instead_of_inc))
params=sub['params'],
fail="""
{
free(start);
free(step);
%(fail)s
}
""" % dict(fail=sub['fail']))
def c_code_cache_version(self):
return (3,)
return (4,)
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
......@@ -805,6 +833,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
"""
_f16_ok = True
params_type = GpuAdvancedIncSubtensor1.params_type
get_params = GpuAdvancedIncSubtensor1.get_params
def make_node(self, x, y, ilist):
"""
......@@ -837,14 +867,11 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_params(self, node):
return node.outputs[0].type.context
def perform(self, node, inp, out, ctx):
def perform(self, node, inp, out, params):
return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
def c_code_cache_version(self):
return (12,)
return (13,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray_helper.h>',
......@@ -854,7 +881,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return [os.path.dirname(__file__)]
def c_code(self, node, name, inputs, outputs, sub):
ctx = self.get_params(node)
ctx = self.get_params(node).context
if ctx.kind != b'cuda':
raise NotImplementedError("cuda only")
if (node.inputs[0].ndim != node.inputs[1].ndim or
......@@ -862,16 +889,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
int(ctx.bin_id[-2]) < 2):
raise NotImplementedError("This case does not have C code yet.")
x = inputs[0]
y = inputs[1]
ind = inputs[2]
out = outputs[0]
fail = sub['fail']
set_instead_of_inc = int(self.set_instead_of_inc)
inplace = int(self.inplace)
return """
int err;
if (%(inplace)s) {
if (%(params)s->inplace) {
Py_XDECREF(%(out)s);
%(out)s = %(x)s;
Py_INCREF(%(out)s);
......@@ -882,25 +902,19 @@ if (!%(out)s) {
// Exception already set
%(fail)s
}
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(set_instead_of_inc)s)) {
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of_inc)) {
%(fail)s
}
""" % locals()
""" % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0], fail=sub['fail'], params=sub['params'])
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = np.dtype(dtype_x).itemsize
itemsize_y = np.dtype(dtype_y).itemsize
itemsize_ind = np.dtype(dtype_ind).itemsize
itemsize_out = np.dtype(dtype_out).itemsize
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out)
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """
......@@ -1010,7 +1024,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
}
return;
}
""" % locals()
""" % dict(type_x=type_x, type_y=type_y, type_ind=type_ind)
params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
......@@ -1020,26 +1034,19 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
flags=flags, objvar=k_var)]
def c_support_code_struct(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = np.dtype(dtype_x).itemsize
itemsize_y = np.dtype(dtype_y).itemsize
itemsize_ind = np.dtype(dtype_ind).itemsize
itemsize_out = np.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr,
PyGpuArrayObject* indices_arr,
const int set_instead_of_inc)
{
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
gpudata *errbuf;
int err, kerr = 0;
size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
err = gpudata_property(py_self->ga.data,
......@@ -1049,11 +1056,11 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
return 1;
}
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s;
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / itemsize_x;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / itemsize_x;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind;
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
(void *)&PyGpuArray_DIMS(py_self)[1],
(void *)&stride_X0,
......@@ -1093,7 +1100,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
}
return 0;
}
""" % locals()
""" % dict(k_var="k_vector_add_fast_" + nodename)
class GpuExtractDiag(Op):
......
......@@ -8,6 +8,7 @@ import numpy as np
import theano
from theano import Op, Apply
from theano.gof import EnumList
import theano.tensor as T
from theano.gradient import grad_not_implemented
from theano.gradient import grad_undefined
......@@ -39,13 +40,21 @@ class Images2Neibs(Op):
"""
__props__ = ("mode",)
BORDER_MODE = EnumList(('MODE_VALID', 'valid'),
('MODE_HALF', 'half'),
('MODE_FULL', 'full'),
('MODE_WRAP_CENTERED', 'wrap_centered'),
('MODE_IGNORE_BORDERS', 'ignore_borders'))
params_type = BORDER_MODE
def get_params(self, node):
return self.mode
def __init__(self, mode='valid'):
if mode not in ['valid', 'half', 'full',
'wrap_centered', 'ignore_borders']:
raise NotImplementedError("Only the mode valid, half, full, "
"ignore_borders and wrap_centered have "
"been implemented for Images2Neibs")
implemented_modes = self.BORDER_MODE.get_aliases()
if mode not in implemented_modes:
raise NotImplementedError("Only modes %s have been implemented for %s"
% (', '.join(implemented_modes), type(self).__name__))
self.mode = mode
def __str__(self):
......@@ -159,9 +168,9 @@ class Images2Neibs(Op):
grad_undefined(self, 2, neib_step)]
def c_code_cache_version(self):
return (8,)
return (10,)
def perform(self, node, inp, out_):
def perform(self, node, inp, out_, params):
ten4, neib_shape, neib_step = inp
z, = out_
# GpuImages2Neibs should not run this perform in DebugMode
......@@ -344,11 +353,6 @@ class Images2Neibs(Op):
return [(z_dim0, z_dim1)]
def c_code(self, node, name, inp, out, sub):
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
mode = self.mode
return """
#ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
......@@ -408,7 +412,7 @@ class Images2Neibs(Op):
%(fail)s;
}
if ( "%(mode)s" == "wrap_centered") {
if (%(mode)s == MODE_WRAP_CENTERED) {
if (c%%2!=1 || d%%2!=1){
PyErr_Format(PyExc_TypeError,
"Images2Neibs: in mode wrap_centered"
......@@ -430,7 +434,7 @@ class Images2Neibs(Op):
grid_c = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[2]),step_x);
grid_d = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[3]),step_y);
}else if ( "%(mode)s" == "valid") {
} else if (%(mode)s == MODE_VALID) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
{
......@@ -455,12 +459,12 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "ignore_borders") {
} else if (%(mode)s == MODE_IGNORE_BORDERS) {
//number of patch in height
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "half") {
} else if (%(mode)s == MODE_HALF) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
{
......@@ -485,7 +489,7 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
//number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
}else if ( "%(mode)s" == "full") {
} else if (%(mode)s == MODE_FULL) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
{
......@@ -510,9 +514,9 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
//number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
}else {
} else {
PyErr_Format(PyExc_TypeError,
"Images2Neibs: unknow mode '%(mode)s'");
"Images2Neibs: unknow mode %%d", %(mode)s);
%(fail)s;
}
......@@ -572,13 +576,13 @@ class Images2Neibs(Op):
for (int i = 0; i < c; i++) // loop over c
{
int ten4_2 = i + a * step_x;
if ( "%(mode)s" == "wrap_centered" ){
if (%(mode)s == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height;
} else if ( "%(mode)s" == "half" ){
} else if (%(mode)s == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ( "%(mode)s" == "full" ){
} else if (%(mode)s == MODE_FULL) {
ten4_2 -= c - 1;
}
if (ten4_2 < 0 | ten4_2 >= height) {
......@@ -588,13 +592,13 @@ class Images2Neibs(Op):
for (int j = 0; j < d; j++) // loop over d
{
int ten4_3 = j + b * step_y;
if ( "%(mode)s" == "wrap_centered" ){
if (%(mode)s == MODE_WRAP_CENTERED) {
ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
} else if ( "%(mode)s" == "half" ){
} else if (%(mode)s == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ( "%(mode)s" == "full" ){
} else if (%(mode)s == MODE_FULL) {
ten4_3 -= d - 1;
}
int z_col = j + d * i;
......@@ -609,7 +613,8 @@ class Images2Neibs(Op):
}
}
} // END NESTED SCOPE
""" % locals()
""" % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
fail=sub['fail'], mode=sub['params'])
def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
......
......@@ -12,7 +12,7 @@ import theano
from theano.compat import izip
from theano.gradient import DisconnectedType
from theano import gof
from theano.gof import Apply, hashtype, Op, Type, MethodNotDefined
from theano.gof import Apply, hashtype, Op, Type, MethodNotDefined, ParamsType
from theano.printing import pprint
from theano import scalar as scal
from theano.tensor.basic import alloc
......@@ -1685,6 +1685,7 @@ class AdvancedSubtensor1(Op):
# of the grad() method.
__props__ = ()
_f16_ok = True
check_input = False
def __init__(self, sparse_grad=False):
    """Create the op.

    Parameters
    ----------
    sparse_grad : bool
        Flag stored on the op; presumably selects a sparse gradient
        implementation in grad() -- TODO confirm against the grad() code,
        which is outside this view.
    """
    self.sparse_grad = sparse_grad
......@@ -1872,10 +1873,13 @@ class AdvancedIncSubtensor1(Op):
"""
__props__ = ('inplace', 'set_instead_of_inc')
check_input = False
params_type = ParamsType(inplace=scal.bool,
set_instead_of_inc=scal.bool)
def __init__(self, inplace=False, set_instead_of_inc=False):
    """Create the op.

    Parameters
    ----------
    inplace : bool
        If true, the op declares ``destroy_map = {0: [0]}`` and operates
        on its first input in place.
    set_instead_of_inc : bool
        If true, indexed positions are set to the new values instead of
        incremented.
    """
    # Coerce to real booleans so the ParamsType(inplace=scal.bool, ...)
    # declared on the class receives proper boolean values.  The earlier
    # un-coerced assignments were immediately overwritten by these and
    # have been removed as dead code.
    self.inplace = bool(inplace)
    self.set_instead_of_inc = bool(set_instead_of_inc)
    if inplace:
        self.destroy_map = {0: [0]}
......@@ -1955,17 +1959,11 @@ class AdvancedIncSubtensor1(Op):
raise NotImplementedError
x, y, idx = input_names
out = output_names[0]
fail = sub['fail']
inc_or_set = 1 - self.set_instead_of_inc
if self.inplace: # convert bool to int
inplace = 1
else:
inplace = 0
copy_of_x = self.copy_of_x(x)
return """
PyObject* rval = NULL;
if (%(inplace)s)
if (%(params)s->inplace)
{
if (%(x)s != %(out)s)
{
......@@ -1983,16 +1981,17 @@ class AdvancedIncSubtensor1(Op):
%(fail)s
}
}
if (inplace_increment(%(out)s, (PyObject *)%(idx)s, %(y)s, %(inc_or_set)d)) {
if (inplace_increment(%(out)s, (PyObject *)%(idx)s, %(y)s, (1 - %(params)s->set_instead_of_inc))) {
%(fail)s;
}
Py_XDECREF(rval);
""" % locals()
""" % dict(x=x, y=y, idx=idx, out=out, copy_of_x=copy_of_x,
params=sub['params'], fail=sub['fail'])
def c_code_cache_version(self):
    """Version tag for Theano's C-code compilation cache.

    Bumped to 8 when the C code switched from Python-interpolated literals
    (inplace / inc_or_set) to reading fields from the op params struct, so
    previously compiled modules are invalidated.
    """
    # The stale `return (6,)` from before the bump made the updated
    # return unreachable; the dead line has been removed.
    return (8,)
def perform(self, node, inp, out_):
def perform(self, node, inp, out_, params):
# TODO opt to make this inplace
x, y, idx = inp
out, = out_
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论