提交 5df0cfd8 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6091 from notoraptor/params-for-other-ops

Params for other ops
...@@ -346,6 +346,18 @@ class Shape_i(gof.Op): ...@@ -346,6 +346,18 @@ class Shape_i(gof.Op):
i = int(i) i = int(i)
self.i = i self.i = i
# NB:
# 1) params_type is defined as a property to avoid
# loop in Python import caused by importing theano.scalar below
# when params_type is defined directly in class code.
# 2) We wrap scalar into ParamsType (instead of directly using scalar as op param)
# to avoid Theano converting scalar param to constant that would be later
# hardcoded as litteral in C code, making us loose all the advantages of
# using params.
@property
def params_type(self):
return gof.ParamsType(i=theano.scalar.basic.int64)
def __str__(self): def __str__(self):
return '%s{%i}' % (self.__class__.__name__, self.i) return '%s{%i}' % (self.__class__.__name__, self.i)
...@@ -360,7 +372,7 @@ class Shape_i(gof.Op): ...@@ -360,7 +372,7 @@ class Shape_i(gof.Op):
(x, self.i)) (x, self.i))
return theano.Apply(self, [x], [theano.tensor.lscalar()]) return theano.Apply(self, [x], [theano.tensor.lscalar()])
def perform(self, node, inp, out_): def perform(self, node, inp, out_, params):
x, = inp x, = inp
out, = out_ out, = out_
if out[0] is None: if out[0] is None:
...@@ -383,7 +395,7 @@ class Shape_i(gof.Op): ...@@ -383,7 +395,7 @@ class Shape_i(gof.Op):
version.append((str(t), v)) version.append((str(t), v))
if version: if version:
version.append(1) version.append(2)
return tuple(version) return tuple(version)
...@@ -391,7 +403,8 @@ class Shape_i(gof.Op): ...@@ -391,7 +403,8 @@ class Shape_i(gof.Op):
iname, = inames iname, = inames
oname, = onames oname, = onames
fail = sub['fail'] fail = sub['fail']
i = self.i # i is then 'params->i', not just 'params'.
i = sub['params'] + '->i'
itype = node.inputs[0].type.__class__ itype = node.inputs[0].type.__class__
if itype in self.c_code_and_version: if itype in self.c_code_and_version:
......
...@@ -10,6 +10,9 @@ except ImportError: ...@@ -10,6 +10,9 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape, infer_context_name) from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape, infer_context_name)
from .opt import register_opt, op_lifter, register_opt2 from .opt import register_opt, op_lifter, register_opt2
from .type import gpu_context_type
from theano.gof import ParamsType
import theano.scalar as scalar
class GpuCumOp(GpuKernelBase, Op): class GpuCumOp(GpuKernelBase, Op):
...@@ -21,9 +24,12 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -21,9 +24,12 @@ class GpuCumOp(GpuKernelBase, Op):
""" """
SUPPORTED_NDIMS = 3 SUPPORTED_NDIMS = 3
__props__ = ('axis', 'mode') __props__ = ('axis', 'mode')
params_type = ParamsType(axis=scalar.int32,
context=gpu_context_type)
def __init__(self, axis, mode='add'): def __init__(self, axis, mode='add'):
self.axis = axis if axis else 0 assert axis is not None
self.axis = int(axis)
self.mode = mode self.mode = mode
def __eq__(self, other): def __eq__(self, other):
...@@ -35,7 +41,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -35,7 +41,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode) return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (7,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
...@@ -43,6 +49,9 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -43,6 +49,9 @@ class GpuCumOp(GpuKernelBase, Op):
def c_header_dirs(self): def c_header_dirs(self):
return [os.path.dirname(__file__)] return [os.path.dirname(__file__)]
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def make_node(self, x): def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumOp" assert x.type.dtype == 'float32', "Only float32 supported for GpuCumOp"
...@@ -244,24 +253,18 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -244,24 +253,18 @@ class GpuCumOp(GpuKernelBase, Op):
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != b'cuda': if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError("cuda only") raise NotImplementedError("cuda only")
x, = inp return """
z, = out
axis = self.axis if self.axis is not None else 0
fail = sub['fail']
ctx = sub['params']
code = """
const size_t* shape = PyGpuArray_DIMS(%(x)s); const size_t* shape = PyGpuArray_DIMS(%(x)s);
bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s); bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);
int axis = %(axis)s; int axis = %(params)s->axis;
if (axis < 0) { if (axis < 0) {
// Convert negative axis to positive axis. // Convert negative axis to positive axis.
axis += PyGpuArray_NDIM(%(x)s); axis += PyGpuArray_NDIM(%(x)s);
} }
if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s) != 0){ if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s),
%(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) {
%(fail)s; %(fail)s;
} }
...@@ -270,17 +273,17 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -270,17 +273,17 @@ class GpuCumOp(GpuKernelBase, Op):
size_t max_grid_size1; size_t max_grid_size1;
size_t max_grid_size2; size_t max_grid_size2;
int err; int err;
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0); err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
%(fail)s; %(fail)s;
} }
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1); err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
%(fail)s; %(fail)s;
} }
err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2); err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
%(fail)s; %(fail)s;
...@@ -289,9 +292,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -289,9 +292,7 @@ class GpuCumOp(GpuKernelBase, Op):
%(fail)s; %(fail)s;
} }
} }
""" % locals() """ % dict(x=inp[0], z=out[0], nodename=nodename, fail=sub['fail'], params=sub['params'])
return code
def c_support_code_struct(self, node, nodename): def c_support_code_struct(self, node, nodename):
code = """ code = """
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import numpy as np
from theano import Op, Apply, config from theano import Op, Apply, config
from theano.gof import ParamsType
from theano.tensor.nnet.neighbours import Images2Neibs from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T import theano.tensor as T
try: try:
import pygpu
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
...@@ -14,7 +13,7 @@ except ImportError: ...@@ -14,7 +13,7 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name) infer_context_name)
from .opt import register_opt2, op_lifter, register_opt from .opt import register_opt2, op_lifter, register_opt
from .type import GpuArrayType from .type import GpuArrayType, gpu_context_type
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...@@ -22,13 +21,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -22,13 +21,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Images2Neibs for the GPU. Images2Neibs for the GPU.
""" """
def __init__(self, mode='valid'): params_type = ParamsType(mode=Images2Neibs.BORDER_MODE, context=gpu_context_type)
if mode not in ['valid', 'half', 'full',
'ignore_borders', 'wrap_centered']: def get_params(self, node):
raise NotImplementedError("Only the mode valid, half, full, " return self.params_type.get_params(self, context=node.inputs[0].type.context)
"ignore_borders and wrap_centered have "
"been implemented for GpuImages2Neibs")
self.mode = mode
def make_node(self, ten4, neib_shape, neib_step=None): def make_node(self, ten4, neib_shape, neib_step=None):
ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4)) ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
...@@ -50,7 +46,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -50,7 +46,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
context_name=ten4.type.context_name)()]) context_name=ten4.type.context_name)()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (12,) return (13,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
...@@ -61,13 +57,16 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -61,13 +57,16 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
flags = Kernel.get_flags(dtype_ten4, dtype_z) flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4) type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z) type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here.
mode_constants = self.BORDER_MODE.c_support_code()
kernels = [] kernels = []
kname = "k_multi_warp_less" kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename k_var = "k_multi_warp_less_" + nodename
code = """ code = """
// a version that uses less registers but doesn't work in all cases. // a version that uses less registers but doesn't work in all cases.
%(mode_constants)s
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_int mode,
const ga_int nb_batch, const ga_int nb_batch,
const ga_int nb_stack, const ga_int nb_stack,
const ga_int height, const ga_int height,
...@@ -110,29 +109,29 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -110,29 +109,29 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
ga_int i = LID_1; // loop over c ga_int i = LID_1; // loop over c
{ {
ga_int ten4_2 = i + a * step_x; ga_int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){ if(mode == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 ) if ( ten4_2 < 0 )
ten4_2 += height; ten4_2 += height;
else if (ten4_2 >= height) else if (ten4_2 >= height)
ten4_2 -= height; ten4_2 -= height;
} else if ("%(mode)s"=="half"){ } else if (mode == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ("%(mode)s"=="full"){ } else if (mode == MODE_FULL) {
ten4_2 -= c - 1; ten4_2 -= c - 1;
} }
ga_int j = LID_0; // loop over d ga_int j = LID_0; // loop over d
{ {
ga_int ten4_3 = j + b * step_y; ga_int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){ if(mode == MODE_WRAP_CENTERED){
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 ) if ( ten4_3 < 0 )
ten4_3 += width; ten4_3 += width;
else if (ten4_3 >= width) else if (ten4_3 >= width)
ten4_3 -= width; ten4_3 -= width;
} else if ("%(mode)s"=="half"){ } else if (mode == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ("%(mode)s"=="full"){ } else if (mode == MODE_FULL) {
ten4_3 -= d - 1; ten4_3 -= d - 1;
} }
...@@ -150,8 +149,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -150,8 +149,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
} }
} }
} }
}""" % locals() }""" % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants)
params = [ params = [
'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp',
...@@ -165,7 +165,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -165,7 +165,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname = "k_multi_warp" kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename k_var = "k_multi_warp_" + nodename
code = """ code = """
%(mode_constants)s
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_int mode,
const ga_int nb_batch, const ga_int nb_batch,
const ga_int nb_stack, const ga_int nb_stack,
const ga_int height, const ga_int height,
...@@ -209,30 +211,30 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -209,30 +211,30 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
for (ga_int i = LID_1; i < c; i+=LDIM_1) for (ga_int i = LID_1; i < c; i+=LDIM_1)
{ {
ga_int ten4_2 = i + a * step_x; ga_int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){ if(mode == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 ) if ( ten4_2 < 0 )
ten4_2 += height; ten4_2 += height;
else if (ten4_2 >= height) else if (ten4_2 >= height)
ten4_2 -= height; ten4_2 -= height;
} else if ("%(mode)s"=="half"){ } else if (mode == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ("%(mode)s"=="full"){ } else if (mode == MODE_FULL) {
ten4_2 -= c - 1; ten4_2 -= c - 1;
} }
// loop over d // loop over d
for (ga_int j = LID_0; j < d; j+=LDIM_0) for (ga_int j = LID_0; j < d; j+=LDIM_0)
{ {
ga_int ten4_3 = j + b * step_y; ga_int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){ if(mode == MODE_WRAP_CENTERED) {
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 ) if ( ten4_3 < 0 )
ten4_3 += width; ten4_3 += width;
else if (ten4_3 >= width) else if (ten4_3 >= width)
ten4_3 -= width; ten4_3 -= width;
} else if ("%(mode)s"=="half"){ } else if (mode == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ("%(mode)s"=="full"){ } else if (mode == MODE_FULL) {
ten4_3 -= d - 1; ten4_3 -= d - 1;
} }
...@@ -251,8 +253,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -251,8 +253,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
} }
} }
} }
""" % locals() """ % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants)
params = [ params = [
'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp',
...@@ -274,18 +277,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -274,18 +277,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
""" """
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype
dtype_neib_shape = node.inputs[1].dtype
dtype_neib_step = node.inputs[2].dtype
dtype_z = node.outputs[0].dtype
itemsize_ten4 = np.dtype(dtype_ten4).itemsize
itemsize_z = np.dtype(dtype_z).itemsize
typecode_z = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
ctx = sub['params']
mode = self.mode
err_check = """ err_check = """
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -293,16 +284,23 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -293,16 +284,23 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
GpuKernel_error(fptr, err)); GpuKernel_error(fptr, err));
%(fail)s; %(fail)s;
} }
""" % locals() """ % dict(fail=sub['fail'])
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """ sync = """
err = GpuArray_sync(&%(z)s->ga); err = GpuArray_sync(&%(z)s->ga);
%(err_check)s %(err_check)s
""" % locals() """ % dict(z=out[0], err_check=err_check)
# NB: To reduce C code variability:
# For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize
# For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node)
# For typecode_z, I use ten4->ga.typecode (for same reason as above)
return """ return """
int grid_c = -1; int grid_c = -1;
int grid_d = -1; int grid_d = -1;
size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga);
size_t itemsize_z = itemsize_ten4;
int typecode_z = %(ten4)s->ga.typecode;
{ {
if (PyGpuArray_NDIM(%(ten4)s) != 4) if (PyGpuArray_NDIM(%(ten4)s) != 4)
...@@ -351,7 +349,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -351,7 +349,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(fail)s; %(fail)s;
} }
if ( "%(mode)s" == "wrap_centered") { if (%(params)s->mode == MODE_WRAP_CENTERED) {
if (c%%2!=1 || d%%2!=1){ if (c%%2!=1 || d%%2!=1){
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"GpuImages2Neibs: in mode wrap_centered need patch with odd shapes"); "GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
...@@ -375,7 +373,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -375,7 +373,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
(size_t)step_y); (size_t)step_y);
}else if ( "%(mode)s" == "valid") { } else if (%(params)s->mode == MODE_VALID) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0)) ((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
{ {
...@@ -400,12 +398,12 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -400,12 +398,12 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x); grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y); grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "ignore_borders") { } else if (%(params)s->mode == MODE_IGNORE_BORDERS) {
//number of patch in height //number of patch in height
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x); grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y); grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "half") { } else if (%(params)s->mode == MODE_HALF) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0)) ((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
{ {
...@@ -430,7 +428,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -430,7 +428,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x); grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y); grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
}else if ( "%(mode)s" == "full") { } else if (%(params)s->mode == MODE_FULL) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0)) ( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
{ {
...@@ -455,9 +453,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -455,9 +453,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x); grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y); grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
}else{ } else {
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"GpuImages2Neibs:: unknown mode '%(mode)s'"); "GpuImages2Neibs:: unknown mode %%d", %(params)s->mode);
%(fail)s; %(fail)s;
} }
...@@ -476,8 +474,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -476,8 +474,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t dims[2]; size_t dims[2];
dims[0] = z_dim0; dims[0] = z_dim0;
dims[1] = z_dim1; dims[1] = z_dim1;
%(z)s = pygpu_empty(2, dims, %(typecode_z)s, %(z)s = pygpu_empty(2, dims, typecode_z,
GA_C_ORDER, %(ctx)s, Py_None); GA_C_ORDER, %(params)s->context, Py_None);
if (!%(z)s) if (!%(z)s)
{ {
PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:" PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
...@@ -490,6 +488,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -490,6 +488,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
{ // NESTED SCOPE { // NESTED SCOPE
const int mode = %(params)s->mode;
const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0]; const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0];
const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1]; const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1];
const int height = PyGpuArray_DIMS(%(ten4)s)[2]; const int height = PyGpuArray_DIMS(%(ten4)s)[2];
...@@ -507,7 +506,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -507,7 +506,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t threads_per_block[3] = {d, c, 1}; size_t threads_per_block[3] = {d, c, 1};
//get the max threads per blocks //get the max threads per blocks
size_t max_threads_dim; size_t max_threads_dim;
int err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim); int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
%(fail)s; %(fail)s;
...@@ -535,14 +534,19 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -535,14 +534,19 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}else{ }else{
fptr = &k_multi_warp_%(name)s; fptr = &k_multi_warp_%(name)s;
} }
// printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n", max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1], n_blocks[2]); /*
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s; printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n",
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s; max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2],
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s; n_blocks[0], n_blocks[1], n_blocks[2]);
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s; */
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s; size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s; size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4;
void *kernel_params[] = {(void *)&nb_batch, size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z;
void *kernel_params[] = {(void *)&mode,
(void *)&nb_batch,
(void *)&nb_stack, (void *)&nb_stack,
(void *)&height, (void *)&width, (void *)&height, (void *)&width,
(void *)&c, (void *)&d, (void *)&c, (void *)&d,
...@@ -562,11 +566,18 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -562,11 +566,18 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
dtype_neib_shape=node.inputs[1].dtype,
def perform(self, node, inp, out, ctx): dtype_neib_step=node.inputs[2].dtype,
err_check=err_check,
sync=sync,
name=name,
params=sub['params'],
fail=sub['fail'])
def perform(self, node, inp, out, params):
# Disable the perform method from the CPU version # Disable the perform method from the CPU version
Op.perform(self, node, inp, out, ctx) Op.perform(self, node, inp, out, params)
@register_opt('fast_compile') @register_opt('fast_compile')
......
...@@ -7,16 +7,15 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html ...@@ -7,16 +7,15 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import numpy as np
from theano import Apply, tensor from theano import Apply, tensor
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.sandbox.rng_mrg import mrg_uniform_base, mrg_uniform from theano.sandbox.rng_mrg import mrg_uniform_base, mrg_uniform
from theano.tensor import as_tensor_variable, get_vector_length from theano.tensor import as_tensor_variable, get_vector_length
from theano.scalar import int32 as int_t
from .basic_ops import (GpuKernelBase, Kernel, infer_context_name, from .basic_ops import (GpuKernelBase, Kernel, infer_context_name,
host_from_gpu, as_gpuarray_variable) host_from_gpu, as_gpuarray_variable)
from .type import GpuArrayType from .type import GpuArrayType, gpu_context_type
from .fp16_help import write_w from .fp16_help import write_w
from .opt import register_opt, register_opt2 from .opt import register_opt, register_opt2
...@@ -24,6 +23,9 @@ from .opt import register_opt, register_opt2 ...@@ -24,6 +23,9 @@ from .opt import register_opt, register_opt2
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version # GpuArray version
_f16_ok = True _f16_ok = True
params_type = mrg_uniform_base.params_type.extended(otypecode=int_t, context=gpu_context_type)
otypecode = property(lambda self: self.output_type.typecode)
def make_node(self, rstate, size): def make_node(self, rstate, size):
# error checking slightly redundant here, since # error checking slightly redundant here, since
...@@ -39,6 +41,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -39,6 +41,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
[rstate, size], [rstate, size],
[rstate.type(), output_type]) [rstate.type(), output_type])
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
@classmethod @classmethod
def new(cls, rstate, ndim, dtype, size): def new(cls, rstate, ndim, dtype, size):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
...@@ -168,40 +173,34 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -168,40 +173,34 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
] ]
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
o_rstate, o_sample = out
inplace = int(self.inplace)
ndim = self.output_type.ndim
o_type_num = np.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
ctx = sub['params']
kname = self.gpu_kernels(node, nodename)[0].objvar
otypecode = str(self.output_type.typecode)
return """ return """
npy_int64 M1 = 2147483647; //2^31 - 1 npy_int64 M1 = 2147483647; //2^31 - 1
// The +1 is to avoid odims[0] which fails on windows
size_t odims[%(ndim)s+1];
size_t n_elements = 1; size_t n_elements = 1;
unsigned int n_streams; unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| !pygpu_GpuArray_Check((PyObject*)%(o_sample)s) || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
|| !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS) || !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
|| (PyGpuArray_NDIM(%(o_sample)s) != %(ndim)s)); || (PyGpuArray_NDIM(%(o_sample)s) != %(params)s->ndim));
size_t* odims = (size_t*)malloc(%(params)s->ndim * sizeof(size_t));
if (odims == NULL) {
PyErr_NoMemory();
%(just_fail)s
}
if (PyArray_NDIM(%(size)s) != 1) if (PyArray_NDIM(%(size)s) != 1)
{ {
PyErr_SetString(PyExc_ValueError, "size must be vector"); PyErr_SetString(PyExc_ValueError, "size must be vector");
%(fail)s %(fail)s
} }
if (PyArray_DIMS(%(size)s)[0] != %(ndim)s) if (PyArray_DIMS(%(size)s)[0] != %(params)s->ndim)
{ {
PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)", PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
%(ndim)s, PyArray_DIMS(%(size)s)[0]); %(params)s->ndim, PyArray_DIMS(%(size)s)[0]);
%(fail)s %(fail)s
} }
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(params)s->ndim; ++i)
{ {
odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i); odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
n_elements *= odims[i]; n_elements *= odims[i];
...@@ -219,8 +218,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -219,8 +218,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
if (must_alloc_sample) if (must_alloc_sample)
{ {
Py_XDECREF(%(o_sample)s); Py_XDECREF(%(o_sample)s);
%(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER, %(o_sample)s = pygpu_empty(%(params)s->ndim, odims, %(params)s->otypecode, GA_C_ORDER,
%(ctx)s, Py_None); %(params)s->context, Py_None);
if(!%(o_sample)s) if(!%(o_sample)s)
{ {
%(fail)s; %(fail)s;
...@@ -233,7 +232,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -233,7 +232,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
} }
Py_XDECREF(%(o_rstate)s); Py_XDECREF(%(o_rstate)s);
if (%(inplace)s) if (%(params)s->inplace)
{ {
Py_INCREF(%(rstate)s); Py_INCREF(%(rstate)s);
%(o_rstate)s = %(rstate)s; %(o_rstate)s = %(rstate)s;
...@@ -285,10 +284,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -285,10 +284,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(fail)s %(fail)s
} }
} }
""" % locals()
free(odims);
""" % dict(rstate=inp[0], size=inp[1],
o_rstate=out[0], o_sample=out[1],
kname=self.gpu_kernels(node, nodename)[0].objvar,
params=sub['params'],
just_fail=sub['fail'],
fail="""
{
free(odims);
%(fail)s
}
""" % dict(fail=sub['fail']))
def c_code_cache_version(self): def c_code_cache_version(self):
return (14,) return (15,)
@register_opt2([mrg_uniform], 'fast_compile') @register_opt2([mrg_uniform], 'fast_compile')
......
...@@ -7,9 +7,11 @@ from six import integer_types ...@@ -7,9 +7,11 @@ from six import integer_types
from six.moves import StringIO from six.moves import StringIO
from theano import tensor, gof, Op from theano import tensor, gof, Op
from theano.gof import ParamsType
from theano.gradient import grad_not_implemented from theano.gradient import grad_not_implemented
import theano.tensor as T import theano.tensor as T
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
from theano.scalar import bool as bool_t, int32 as int_t, uint32 as size_t
try: try:
import pygpu import pygpu
...@@ -594,7 +596,15 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -594,7 +596,15 @@ class GpuAdvancedIncSubtensor1(Op):
""" """
_f16_ok = True _f16_ok = True
__props__ = ('inplace', 'set_instead_of_inc') __props__ = ('inplace', 'set_instead_of_inc')
params_type = gpu_context_type params_type = ParamsType(inplace=bool_t,
set_instead_of_inc=bool_t,
context=gpu_context_type,
# following params are used into c_init_code_struct(),
# as inputs are not available in that function.
ndim_input_0=size_t,
ndim_input_1=size_t,
typecode_input_0=int_t,
typecode_input_1=int_t)
def __init__(self, inplace=False, set_instead_of_inc=False): def __init__(self, inplace=False, set_instead_of_inc=False):
self.inplace = inplace self.inplace = inplace
...@@ -634,12 +644,17 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -634,12 +644,17 @@ class GpuAdvancedIncSubtensor1(Op):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_params(self, node): def get_params(self, node):
return node.outputs[0].type.context return self.params_type.get_params(self, context=node.outputs[0].type.context,
# following params are used into c_init_code_struct().
ndim_input_0=node.inputs[0].ndim,
ndim_input_1=node.inputs[1].ndim,
typecode_input_0=node.inputs[0].type.typecode,
typecode_input_1=node.inputs[1].type.typecode)
# We can't use the parent version that loops on each index # We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the # as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case. # parent doesn't loop in that case.
def perform(self, node, inp, out_, ctx=None): def perform(self, node, inp, out_, params=None):
# TODO opt to make this inplace # TODO opt to make this inplace
x, y, idx = inp x, y, idx = inp
out, = out_ out, = out_
...@@ -700,21 +715,18 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -700,21 +715,18 @@ class GpuAdvancedIncSubtensor1(Op):
return """ return """
gpuelemwise_arg args[2] = {{0}}; gpuelemwise_arg args[2] = {{0}};
args[0].name = "a"; args[0].name = "a";
args[0].typecode = %(type1)s; args[0].typecode = %(params)s->typecode_input_0;
args[0].flags = GE_READ|GE_WRITE; args[0].flags = GE_READ|GE_WRITE;
args[1].name = "b"; args[1].name = "b";
args[1].typecode = %(type2)s; args[1].typecode = %(params)s->typecode_input_1;
args[1].flags = GE_READ; args[1].flags = GE_READ;
iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b", iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b",
2, args, %(nd)s, GE_CONVERT_F16); 2, args, %(params)s->ndim_input_1, GE_CONVERT_F16);
if (iadd == NULL) { if (iadd == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support"); PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
%(fail)s %(fail)s
} }
""" % dict(ctx=sub['params'], fail=sub['fail'], """ % dict(params=sub['params'], fail=sub['fail'])
type1=node.inputs[0].type.typecode,
type2=node.inputs[1].type.typecode,
nd=node.inputs[1].ndim)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
if (node.inputs[0].ndim != node.inputs[1].ndim): if (node.inputs[0].ndim != node.inputs[1].ndim):
...@@ -722,18 +734,26 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -722,18 +734,26 @@ class GpuAdvancedIncSubtensor1(Op):
return """ return """
PyGpuArrayObject *row_x, *row_y; PyGpuArrayObject *row_x, *row_y;
ssize_t start[%(nd)s], step[%(nd)s]; size_t nd = %(params)s->ndim_input_0;
ssize_t *start = NULL, *step = NULL;
size_t num_indices, j; size_t num_indices, j;
int ret; int ret;
int broadcast_y; int broadcast_y;
for (j = 0; j < %(nd)s; j++) { start = (ssize_t*)malloc(nd * sizeof(ssize_t));
step = (ssize_t*)malloc(nd * sizeof(ssize_t));
if (start == NULL || step == NULL) {
PyErr_NoMemory();
%(fail)s
}
for (j = 0; j < nd; ++j) {
start[j] = 0; start[j] = 0;
step[j] = 1; step[j] = 1;
} }
step[0] = 0; step[0] = 0;
num_indices = PyArray_SIZE(%(ind)s); num_indices = PyArray_SIZE(%(ind)s);
if (!%(inplace)s) { if (!%(params)s->inplace) {
%(out)s = theano_try_copy(%(out)s, %(x)s); %(out)s = theano_try_copy(%(out)s, %(x)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
// Exception already set // Exception already set
...@@ -774,7 +794,7 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -774,7 +794,7 @@ class GpuAdvancedIncSubtensor1(Op):
%(fail)s; %(fail)s;
} }
if (%(set_instead_of_inc)s) { if (%(params)s->set_instead_of_inc) {
ret = GpuArray_setarray(&row_x->ga, &row_y->ga); ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
} else { } else {
void *args[2]; void *args[2];
...@@ -788,13 +808,21 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -788,13 +808,21 @@ class GpuAdvancedIncSubtensor1(Op):
PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements"); PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
} }
} }
free(start);
free(step);
""" % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0], """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0],
fail=sub['fail'], inplace=int(self.inplace), params=sub['params'],
nd=node.inputs[0].ndim, fail="""
set_instead_of_inc=int(self.set_instead_of_inc)) {
free(start);
free(step);
%(fail)s
}
""" % dict(fail=sub['fail']))
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
...@@ -805,6 +833,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -805,6 +833,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
""" """
_f16_ok = True _f16_ok = True
params_type = GpuAdvancedIncSubtensor1.params_type
get_params = GpuAdvancedIncSubtensor1.get_params
def make_node(self, x, y, ilist): def make_node(self, x, y, ilist):
""" """
...@@ -837,14 +867,11 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -837,14 +867,11 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_params(self, node): def perform(self, node, inp, out, params):
return node.outputs[0].type.context
def perform(self, node, inp, out, ctx):
return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out) return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
def c_code_cache_version(self): def c_code_cache_version(self):
return (12,) return (13,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray_helper.h>', return ['<numpy_compat.h>', '<gpuarray_helper.h>',
...@@ -854,7 +881,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -854,7 +881,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return [os.path.dirname(__file__)] return [os.path.dirname(__file__)]
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
ctx = self.get_params(node) ctx = self.get_params(node).context
if ctx.kind != b'cuda': if ctx.kind != b'cuda':
raise NotImplementedError("cuda only") raise NotImplementedError("cuda only")
if (node.inputs[0].ndim != node.inputs[1].ndim or if (node.inputs[0].ndim != node.inputs[1].ndim or
...@@ -862,16 +889,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -862,16 +889,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
int(ctx.bin_id[-2]) < 2): int(ctx.bin_id[-2]) < 2):
raise NotImplementedError("This case does not have C code yet.") raise NotImplementedError("This case does not have C code yet.")
x = inputs[0]
y = inputs[1]
ind = inputs[2]
out = outputs[0]
fail = sub['fail']
set_instead_of_inc = int(self.set_instead_of_inc)
inplace = int(self.inplace)
return """ return """
int err; int err;
if (%(inplace)s) { if (%(params)s->inplace) {
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = %(x)s; %(out)s = %(x)s;
Py_INCREF(%(out)s); Py_INCREF(%(out)s);
...@@ -882,25 +902,19 @@ if (!%(out)s) { ...@@ -882,25 +902,19 @@ if (!%(out)s) {
// Exception already set // Exception already set
%(fail)s %(fail)s
} }
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(set_instead_of_inc)s)) { if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of_inc)) {
%(fail)s %(fail)s
} }
""" % locals() """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0], fail=sub['fail'], params=sub['params'])
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = np.dtype(dtype_x).itemsize
itemsize_y = np.dtype(dtype_y).itemsize
itemsize_ind = np.dtype(dtype_ind).itemsize
itemsize_out = np.dtype(dtype_out).itemsize
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x) type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y) type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind) type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out) flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast" kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename k_var = "k_vector_add_fast_" + nodename
code = """ code = """
...@@ -1010,7 +1024,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1010,7 +1024,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
} }
return; return;
} }
""" % locals() """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind)
params = [ params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
...@@ -1020,26 +1034,19 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1020,26 +1034,19 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
flags=flags, objvar=k_var)] flags=flags, objvar=k_var)]
def c_support_code_struct(self, node, nodename): def c_support_code_struct(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = np.dtype(dtype_x).itemsize
itemsize_y = np.dtype(dtype_y).itemsize
itemsize_ind = np.dtype(dtype_ind).itemsize
itemsize_out = np.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """ return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self, int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other, PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr, PyGpuArrayObject* indices_arr,
const int set_instead_of_inc) const int set_instead_of_inc)
{ {
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1}; size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1}; size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
gpudata *errbuf; gpudata *errbuf;
int err, kerr = 0; int err, kerr = 0;
size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
if (threads_per_block[0] > 0 && n_blocks[0] > 0) { if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
err = gpudata_property(py_self->ga.data, err = gpudata_property(py_self->ga.data,
...@@ -1049,11 +1056,11 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1049,11 +1056,11 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
return 1; return 1;
} }
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s; ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / itemsize_x;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s; ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / itemsize_x;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s; ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s; ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s; ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind;
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0], void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
(void *)&PyGpuArray_DIMS(py_self)[1], (void *)&PyGpuArray_DIMS(py_self)[1],
(void *)&stride_X0, (void *)&stride_X0,
...@@ -1093,7 +1100,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1093,7 +1100,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
} }
return 0; return 0;
} }
""" % locals() """ % dict(k_var="k_vector_add_fast_" + nodename)
class GpuExtractDiag(Op): class GpuExtractDiag(Op):
......
...@@ -8,6 +8,7 @@ import numpy as np ...@@ -8,6 +8,7 @@ import numpy as np
import theano import theano
from theano import Op, Apply from theano import Op, Apply
from theano.gof import EnumList
import theano.tensor as T import theano.tensor as T
from theano.gradient import grad_not_implemented from theano.gradient import grad_not_implemented
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
...@@ -39,13 +40,21 @@ class Images2Neibs(Op): ...@@ -39,13 +40,21 @@ class Images2Neibs(Op):
""" """
__props__ = ("mode",) __props__ = ("mode",)
BORDER_MODE = EnumList(('MODE_VALID', 'valid'),
('MODE_HALF', 'half'),
('MODE_FULL', 'full'),
('MODE_WRAP_CENTERED', 'wrap_centered'),
('MODE_IGNORE_BORDERS', 'ignore_borders'))
params_type = BORDER_MODE
def get_params(self, node):
return self.mode
def __init__(self, mode='valid'): def __init__(self, mode='valid'):
if mode not in ['valid', 'half', 'full', implemented_modes = self.BORDER_MODE.get_aliases()
'wrap_centered', 'ignore_borders']: if mode not in implemented_modes:
raise NotImplementedError("Only the mode valid, half, full, " raise NotImplementedError("Only modes %s have been implemented for %s"
"ignore_borders and wrap_centered have " % (', '.join(implemented_modes), type(self).__name__))
"been implemented for Images2Neibs")
self.mode = mode self.mode = mode
def __str__(self): def __str__(self):
...@@ -159,9 +168,9 @@ class Images2Neibs(Op): ...@@ -159,9 +168,9 @@ class Images2Neibs(Op):
grad_undefined(self, 2, neib_step)] grad_undefined(self, 2, neib_step)]
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) return (10,)
def perform(self, node, inp, out_): def perform(self, node, inp, out_, params):
ten4, neib_shape, neib_step = inp ten4, neib_shape, neib_step = inp
z, = out_ z, = out_
# GpuImages2Neibs should not run this perform in DebugMode # GpuImages2Neibs should not run this perform in DebugMode
...@@ -344,11 +353,6 @@ class Images2Neibs(Op): ...@@ -344,11 +353,6 @@ class Images2Neibs(Op):
return [(z_dim0, z_dim1)] return [(z_dim0, z_dim1)]
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
mode = self.mode
return """ return """
#ifndef CEIL_INTDIV #ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0)) #define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
...@@ -408,7 +412,7 @@ class Images2Neibs(Op): ...@@ -408,7 +412,7 @@ class Images2Neibs(Op):
%(fail)s; %(fail)s;
} }
if ( "%(mode)s" == "wrap_centered") { if (%(mode)s == MODE_WRAP_CENTERED) {
if (c%%2!=1 || d%%2!=1){ if (c%%2!=1 || d%%2!=1){
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"Images2Neibs: in mode wrap_centered" "Images2Neibs: in mode wrap_centered"
...@@ -430,7 +434,7 @@ class Images2Neibs(Op): ...@@ -430,7 +434,7 @@ class Images2Neibs(Op):
grid_c = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[2]),step_x); grid_c = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[2]),step_x);
grid_d = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[3]),step_y); grid_d = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[3]),step_y);
}else if ( "%(mode)s" == "valid") { } else if (%(mode)s == MODE_VALID) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0)) ( (((PyArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
{ {
...@@ -455,12 +459,12 @@ class Images2Neibs(Op): ...@@ -455,12 +459,12 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x); grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y); grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "ignore_borders") { } else if (%(mode)s == MODE_IGNORE_BORDERS) {
//number of patch in height //number of patch in height
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x); grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y); grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "half") { } else if (%(mode)s == MODE_HALF) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0)) ( (((PyArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
{ {
...@@ -485,7 +489,7 @@ class Images2Neibs(Op): ...@@ -485,7 +489,7 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x); grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y); grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
}else if ( "%(mode)s" == "full") { } else if (%(mode)s == MODE_FULL) {
if ( ((PyArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0)) ( (((PyArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
{ {
...@@ -510,9 +514,9 @@ class Images2Neibs(Op): ...@@ -510,9 +514,9 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]+c-2)/step_x); grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]+d-2)/step_y); grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
}else { } else {
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"Images2Neibs: unknow mode '%(mode)s'"); "Images2Neibs: unknow mode %%d", %(mode)s);
%(fail)s; %(fail)s;
} }
...@@ -572,13 +576,13 @@ class Images2Neibs(Op): ...@@ -572,13 +576,13 @@ class Images2Neibs(Op):
for (int i = 0; i < c; i++) // loop over c for (int i = 0; i < c; i++) // loop over c
{ {
int ten4_2 = i + a * step_x; int ten4_2 = i + a * step_x;
if ( "%(mode)s" == "wrap_centered" ){ if (%(mode)s == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height; if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height; else if (ten4_2 >= height) ten4_2 -= height;
} else if ( "%(mode)s" == "half" ){ } else if (%(mode)s == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ( "%(mode)s" == "full" ){ } else if (%(mode)s == MODE_FULL) {
ten4_2 -= c - 1; ten4_2 -= c - 1;
} }
if (ten4_2 < 0 | ten4_2 >= height) { if (ten4_2 < 0 | ten4_2 >= height) {
...@@ -588,13 +592,13 @@ class Images2Neibs(Op): ...@@ -588,13 +592,13 @@ class Images2Neibs(Op):
for (int j = 0; j < d; j++) // loop over d for (int j = 0; j < d; j++) // loop over d
{ {
int ten4_3 = j + b * step_y; int ten4_3 = j + b * step_y;
if ( "%(mode)s" == "wrap_centered" ){ if (%(mode)s == MODE_WRAP_CENTERED) {
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width; if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width; else if (ten4_3 >= width) ten4_3 -= width;
} else if ( "%(mode)s" == "half" ){ } else if (%(mode)s == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ( "%(mode)s" == "full" ){ } else if (%(mode)s == MODE_FULL) {
ten4_3 -= d - 1; ten4_3 -= d - 1;
} }
int z_col = j + d * i; int z_col = j + d * i;
...@@ -609,7 +613,8 @@ class Images2Neibs(Op): ...@@ -609,7 +613,8 @@ class Images2Neibs(Op):
} }
} }
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
fail=sub['fail'], mode=sub['params'])
def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'): def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
......
...@@ -12,7 +12,7 @@ import theano ...@@ -12,7 +12,7 @@ import theano
from theano.compat import izip from theano.compat import izip
from theano.gradient import DisconnectedType from theano.gradient import DisconnectedType
from theano import gof from theano import gof
from theano.gof import Apply, hashtype, Op, Type, MethodNotDefined from theano.gof import Apply, hashtype, Op, Type, MethodNotDefined, ParamsType
from theano.printing import pprint from theano.printing import pprint
from theano import scalar as scal from theano import scalar as scal
from theano.tensor.basic import alloc from theano.tensor.basic import alloc
...@@ -1685,6 +1685,7 @@ class AdvancedSubtensor1(Op): ...@@ -1685,6 +1685,7 @@ class AdvancedSubtensor1(Op):
# of the grad() method. # of the grad() method.
__props__ = () __props__ = ()
_f16_ok = True _f16_ok = True
check_input = False
def __init__(self, sparse_grad=False): def __init__(self, sparse_grad=False):
self.sparse_grad = sparse_grad self.sparse_grad = sparse_grad
...@@ -1872,10 +1873,13 @@ class AdvancedIncSubtensor1(Op): ...@@ -1872,10 +1873,13 @@ class AdvancedIncSubtensor1(Op):
""" """
__props__ = ('inplace', 'set_instead_of_inc') __props__ = ('inplace', 'set_instead_of_inc')
check_input = False
params_type = ParamsType(inplace=scal.bool,
set_instead_of_inc=scal.bool)
def __init__(self, inplace=False, set_instead_of_inc=False): def __init__(self, inplace=False, set_instead_of_inc=False):
self.inplace = inplace self.inplace = bool(inplace)
self.set_instead_of_inc = set_instead_of_inc self.set_instead_of_inc = bool(set_instead_of_inc)
if inplace: if inplace:
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
...@@ -1955,17 +1959,11 @@ class AdvancedIncSubtensor1(Op): ...@@ -1955,17 +1959,11 @@ class AdvancedIncSubtensor1(Op):
raise NotImplementedError raise NotImplementedError
x, y, idx = input_names x, y, idx = input_names
out = output_names[0] out = output_names[0]
fail = sub['fail']
inc_or_set = 1 - self.set_instead_of_inc
if self.inplace: # convert bool to int
inplace = 1
else:
inplace = 0
copy_of_x = self.copy_of_x(x) copy_of_x = self.copy_of_x(x)
return """ return """
PyObject* rval = NULL; PyObject* rval = NULL;
if (%(inplace)s) if (%(params)s->inplace)
{ {
if (%(x)s != %(out)s) if (%(x)s != %(out)s)
{ {
...@@ -1983,16 +1981,17 @@ class AdvancedIncSubtensor1(Op): ...@@ -1983,16 +1981,17 @@ class AdvancedIncSubtensor1(Op):
%(fail)s %(fail)s
} }
} }
if (inplace_increment(%(out)s, (PyObject *)%(idx)s, %(y)s, %(inc_or_set)d)) { if (inplace_increment(%(out)s, (PyObject *)%(idx)s, %(y)s, (1 - %(params)s->set_instead_of_inc))) {
%(fail)s; %(fail)s;
} }
Py_XDECREF(rval); Py_XDECREF(rval);
""" % locals() """ % dict(x=x, y=y, idx=idx, out=out, copy_of_x=copy_of_x,
params=sub['params'], fail=sub['fail'])
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (8,)
def perform(self, node, inp, out_): def perform(self, node, inp, out_, params):
# TODO opt to make this inplace # TODO opt to make this inplace
x, y, idx = inp x, y, idx = inp
out, = out_ out, = out_
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论