提交 5c9d3118 authored 作者: notoraptor's avatar notoraptor

Wrap Op params for theano.gpuarray.neighbours.GpuImages2Neibs:

- mode (enum list)
- context (gpu_context_type)

Add Kernel param for theano.gpuarray.neighbours.GpuImages2Neibs:

- mode (ga_int)

This greatly reduces C code variability (from 18 individual ops to about 6 modules with more than 1 op).
上级 1433cacb
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import numpy as np
from theano import Op, Apply, config from theano import Op, Apply, config
from theano.gof import ParamsType
from theano.tensor.nnet.neighbours import Images2Neibs from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T import theano.tensor as T
try: try:
import pygpu
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
...@@ -14,7 +13,7 @@ except ImportError: ...@@ -14,7 +13,7 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name) infer_context_name)
from .opt import register_opt2, op_lifter, register_opt from .opt import register_opt2, op_lifter, register_opt
from .type import GpuArrayType from .type import GpuArrayType, gpu_context_type
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...@@ -22,13 +21,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -22,13 +21,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Images2Neibs for the GPU. Images2Neibs for the GPU.
""" """
def __init__(self, mode='valid'): params_type = ParamsType(mode=Images2Neibs.params_type, context=gpu_context_type)
if mode not in ['valid', 'half', 'full',
'ignore_borders', 'wrap_centered']: def get_params(self, node):
raise NotImplementedError("Only the mode valid, half, full, " return self.params_type.get_params(self, context=node.inputs[0].type.context)
"ignore_borders and wrap_centered have "
"been implemented for GpuImages2Neibs")
self.mode = mode
def make_node(self, ten4, neib_shape, neib_step=None): def make_node(self, ten4, neib_shape, neib_step=None):
ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4)) ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
...@@ -50,7 +46,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -50,7 +46,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
context_name=ten4.type.context_name)()]) context_name=ten4.type.context_name)()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (12,) return (13,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
...@@ -61,13 +57,17 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -61,13 +57,17 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
flags = Kernel.get_flags(dtype_ten4, dtype_z) flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4) type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z) type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode # Params type 'mode' is an enum list which c_support_code()
# contains C constants definitions that are useful here.
mode_constants = self.params_type.get_type('mode').c_support_code()
kernels = [] kernels = []
kname = "k_multi_warp_less" kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename k_var = "k_multi_warp_less_" + nodename
code = """ code = """
// a version that uses less registers but doesn't work in all cases. // a version that uses less registers but doesn't work in all cases.
%(mode_constants)s
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_int mode,
const ga_int nb_batch, const ga_int nb_batch,
const ga_int nb_stack, const ga_int nb_stack,
const ga_int height, const ga_int height,
...@@ -110,29 +110,29 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -110,29 +110,29 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
ga_int i = LID_1; // loop over c ga_int i = LID_1; // loop over c
{ {
ga_int ten4_2 = i + a * step_x; ga_int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){ if(%(mode)s == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 ) if ( ten4_2 < 0 )
ten4_2 += height; ten4_2 += height;
else if (ten4_2 >= height) else if (ten4_2 >= height)
ten4_2 -= height; ten4_2 -= height;
} else if ("%(mode)s"=="half"){ } else if (%(mode)s == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ("%(mode)s"=="full"){ } else if (%(mode)s == MODE_FULL) {
ten4_2 -= c - 1; ten4_2 -= c - 1;
} }
ga_int j = LID_0; // loop over d ga_int j = LID_0; // loop over d
{ {
ga_int ten4_3 = j + b * step_y; ga_int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){ if(%(mode)s == MODE_WRAP_CENTERED){
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 ) if ( ten4_3 < 0 )
ten4_3 += width; ten4_3 += width;
else if (ten4_3 >= width) else if (ten4_3 >= width)
ten4_3 -= width; ten4_3 -= width;
} else if ("%(mode)s"=="half"){ } else if (%(mode)s == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ("%(mode)s"=="full"){ } else if (%(mode)s == MODE_FULL) {
ten4_3 -= d - 1; ten4_3 -= d - 1;
} }
...@@ -150,8 +150,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -150,8 +150,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
} }
} }
} }
}""" % locals() }""" % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants)
params = [ params = [
'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp',
...@@ -165,7 +166,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -165,7 +166,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname = "k_multi_warp" kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename k_var = "k_multi_warp_" + nodename
code = """ code = """
%(mode_constants)s
KERNEL void %(kname)s( KERNEL void %(kname)s(
const ga_int mode,
const ga_int nb_batch, const ga_int nb_batch,
const ga_int nb_stack, const ga_int nb_stack,
const ga_int height, const ga_int height,
...@@ -209,30 +212,30 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -209,30 +212,30 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
for (ga_int i = LID_1; i < c; i+=LDIM_1) for (ga_int i = LID_1; i < c; i+=LDIM_1)
{ {
ga_int ten4_2 = i + a * step_x; ga_int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){ if(%(mode)s == MODE_WRAP_CENTERED) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 ) if ( ten4_2 < 0 )
ten4_2 += height; ten4_2 += height;
else if (ten4_2 >= height) else if (ten4_2 >= height)
ten4_2 -= height; ten4_2 -= height;
} else if ("%(mode)s"=="half"){ } else if (%(mode)s == MODE_HALF) {
ten4_2 -= wrap_centered_half_idx_shift_x; ten4_2 -= wrap_centered_half_idx_shift_x;
} else if ("%(mode)s"=="full"){ } else if (%(mode)s == MODE_FULL) {
ten4_2 -= c - 1; ten4_2 -= c - 1;
} }
// loop over d // loop over d
for (ga_int j = LID_0; j < d; j+=LDIM_0) for (ga_int j = LID_0; j < d; j+=LDIM_0)
{ {
ga_int ten4_3 = j + b * step_y; ga_int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){ if(%(mode)s == MODE_WRAP_CENTERED) {
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 ) if ( ten4_3 < 0 )
ten4_3 += width; ten4_3 += width;
else if (ten4_3 >= width) else if (ten4_3 >= width)
ten4_3 -= width; ten4_3 -= width;
} else if ("%(mode)s"=="half"){ } else if (%(mode)s == MODE_HALF) {
ten4_3 -= wrap_centered_half_idx_shift_y; ten4_3 -= wrap_centered_half_idx_shift_y;
} else if ("%(mode)s"=="full"){ } else if (%(mode)s == MODE_FULL) {
ten4_3 -= d - 1; ten4_3 -= d - 1;
} }
...@@ -251,8 +254,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -251,8 +254,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
} }
} }
} }
""" % locals() """ % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants)
params = [ params = [
'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp', 'uintp',
...@@ -274,18 +278,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -274,18 +278,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
""" """
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype
dtype_neib_shape = node.inputs[1].dtype
dtype_neib_step = node.inputs[2].dtype
dtype_z = node.outputs[0].dtype
itemsize_ten4 = np.dtype(dtype_ten4).itemsize
itemsize_z = np.dtype(dtype_z).itemsize
typecode_z = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
ctx = sub['params']
mode = self.mode
err_check = """ err_check = """
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -293,16 +285,23 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -293,16 +285,23 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
GpuKernel_error(fptr, err)); GpuKernel_error(fptr, err));
%(fail)s; %(fail)s;
} }
""" % locals() """ % dict(fail=sub['fail'])
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """ sync = """
err = GpuArray_sync(&%(z)s->ga); err = GpuArray_sync(&%(z)s->ga);
%(err_check)s %(err_check)s
""" % locals() """ % dict(z=out[0], err_check=err_check)
# NB: To reduce C code variability:
# For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize
# For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node)
# For typecode_z, I use ten4->ga.typecode (for same reason as above)
return """ return """
int grid_c = -1; int grid_c = -1;
int grid_d = -1; int grid_d = -1;
size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga);
size_t itemsize_z = itemsize_ten4;
int typecode_z = %(ten4)s->ga.typecode;
{ {
if (PyGpuArray_NDIM(%(ten4)s) != 4) if (PyGpuArray_NDIM(%(ten4)s) != 4)
...@@ -351,10 +350,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -351,10 +350,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(fail)s; %(fail)s;
} }
if ( "%(mode)s" == "wrap_centered") { if (%(params)s->mode == MODE_WRAP_CENTERED) {
if (c%%2!=1 || d%%2!=1){ if (c%%2!=1 || d%%2!=1){
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"GpuImages2Neibs: in mode wrap_centered need patch with odd shapes"); "GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
%(fail)s; %(fail)s;
} }
if ( PyGpuArray_DIMS(%(ten4)s)[2] < c || if ( PyGpuArray_DIMS(%(ten4)s)[2] < c ||
...@@ -375,7 +374,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -375,7 +374,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
(size_t)step_y); (size_t)step_y);
}else if ( "%(mode)s" == "valid") { } else if (%(params)s->mode == MODE_VALID) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0)) ((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
{ {
...@@ -400,12 +399,12 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -400,12 +399,12 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x); grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y); grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "ignore_borders") { } else if (%(params)s->mode == MODE_IGNORE_BORDERS) {
//number of patch in height //number of patch in height
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x); grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y); grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
}else if ( "%(mode)s" == "half") { } else if (%(params)s->mode == MODE_HALF) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0)) ((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
{ {
...@@ -430,7 +429,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -430,7 +429,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x); grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y); grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
}else if ( "%(mode)s" == "full") { } else if (%(params)s->mode == MODE_FULL) {
if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0)) ( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
{ {
...@@ -455,9 +454,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -455,9 +454,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x); grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
//number of patch in width //number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y); grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
}else{ } else {
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"GpuImages2Neibs:: unknown mode '%(mode)s'"); "GpuImages2Neibs:: unknown mode %%d", %(params)s->mode);
%(fail)s; %(fail)s;
} }
...@@ -476,8 +475,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -476,8 +475,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t dims[2]; size_t dims[2];
dims[0] = z_dim0; dims[0] = z_dim0;
dims[1] = z_dim1; dims[1] = z_dim1;
%(z)s = pygpu_empty(2, dims, %(typecode_z)s, %(z)s = pygpu_empty(2, dims, typecode_z,
GA_C_ORDER, %(ctx)s, Py_None); GA_C_ORDER, %(params)s->context, Py_None);
if (!%(z)s) if (!%(z)s)
{ {
PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:" PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
...@@ -490,6 +489,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -490,6 +489,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
{ // NESTED SCOPE { // NESTED SCOPE
const int mode = %(params)s->mode;
const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0]; const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0];
const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1]; const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1];
const int height = PyGpuArray_DIMS(%(ten4)s)[2]; const int height = PyGpuArray_DIMS(%(ten4)s)[2];
...@@ -507,7 +507,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -507,7 +507,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t threads_per_block[3] = {d, c, 1}; size_t threads_per_block[3] = {d, c, 1};
//get the max threads per blocks //get the max threads per blocks
size_t max_threads_dim; size_t max_threads_dim;
int err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim); int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
%(fail)s; %(fail)s;
...@@ -535,14 +535,19 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -535,14 +535,19 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}else{ }else{
fptr = &k_multi_warp_%(name)s; fptr = &k_multi_warp_%(name)s;
} }
// printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n", max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1], n_blocks[2]); /*
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s; printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n",
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s; max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2],
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s; n_blocks[0], n_blocks[1], n_blocks[2]);
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s; */
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s; size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s; size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4;
void *kernel_params[] = {(void *)&nb_batch, size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z;
void *kernel_params[] = {(void *)&mode,
(void *)&nb_batch,
(void *)&nb_stack, (void *)&nb_stack,
(void *)&height, (void *)&width, (void *)&height, (void *)&width,
(void *)&c, (void *)&d, (void *)&c, (void *)&d,
...@@ -562,11 +567,18 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -562,11 +567,18 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
dtype_neib_shape=node.inputs[1].dtype,
def perform(self, node, inp, out, ctx): dtype_neib_step=node.inputs[2].dtype,
err_check=err_check,
sync=sync,
name=name,
params=sub['params'],
fail=sub['fail'])
def perform(self, node, inp, out, params):
# Disable the perform method from the CPU version # Disable the perform method from the CPU version
Op.perform(self, node, inp, out, ctx) Op.perform(self, node, inp, out, params)
@register_opt('fast_compile') @register_opt('fast_compile')
......
...@@ -52,8 +52,8 @@ class Images2Neibs(Op): ...@@ -52,8 +52,8 @@ class Images2Neibs(Op):
def __init__(self, mode='valid'): def __init__(self, mode='valid'):
implemented_modes = self.params_type.get_aliases() implemented_modes = self.params_type.get_aliases()
if mode not in implemented_modes: if mode not in implemented_modes:
raise NotImplementedError("Only modes %s have been implemented for Images2Neibs" raise NotImplementedError("Only modes %s have been implemented for %s"
% ', '.join(implemented_modes)) % (', '.join(implemented_modes), type(self).__name__))
self.mode = mode self.mode = mode
def __str__(self): def __str__(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论