Commit a7900bd8 authored by Frédéric Bastien, committed by GitHub

Merge pull request #5198 from aam-at/gpu_pool

Gpuarray average and max pooling
......@@ -66,7 +66,7 @@ from theano.gof.opt import (
OpKeyOptimizer)
from theano.gof.optdb import \
DB, Query, \
DB, LocalGroupDB, Query, \
EquilibriumDB, SequenceDB, ProxyDB
from theano.gof.toolbox import \
......
......@@ -1384,6 +1384,41 @@ class LocalOptGroup(LocalOptimizer):
opt.add_requirements(fgraph)
class GraphToGPULocalOptGroup(LocalOptGroup):
    """Equivalent of LocalOptGroup for the GraphToGPU transfer pass.

    The main difference is the function signature of the local
    optimizers it dispatches to: they use the GraphToGPU signature
    ``(op, context_name, inputs, outputs)`` and not the normal
    LocalOptimizer ``(node)`` signature.

    apply_all_opts=True is not supported
    """
    def __init__(self, *optimizers, **kwargs):
        super(GraphToGPULocalOptGroup, self).__init__(*optimizers, **kwargs)
        # Trying every optimizer in sequence is not supported for this
        # signature; only first-match semantics are allowed.
        assert self.apply_all_opts is False

    def transform(self, op, context_name, inputs, outputs):
        # Try, in registration order, the optimizers tracking this op
        # (by op class, by op instance, then the catch-all trackers) and
        # return the first non-empty replacement.
        if len(self.opts) == 0:
            return
        fgraph = outputs[0].fgraph
        opts = self.track_map[type(op)] + self.track_map[op] + self.track_map[None]
        for opt in opts:
            opt_start = time.time()
            new_repl = opt.transform(op, context_name, inputs, outputs)
            opt_finish = time.time()
            if self.profile:
                # NOTE(review): accumulates opt_start - opt_finish, i.e. a
                # negative duration -- presumably matching the profiling
                # convention of the parent LocalOptGroup; confirm the sign
                # against how print_profile consumes time_opts.
                self.time_opts[opt] += opt_start - opt_finish
                self.process_count[opt] += 1
            if not new_repl:
                continue
            if self.profile:
                self.node_created[opt] += len(graph.ops(fgraph.variables, new_repl))
                self.applied_true[opt] += 1
            return new_repl
class OpSub(LocalOptimizer):
"""
......
......@@ -405,12 +405,14 @@ class LocalGroupDB(DB):
"""
def __init__(self, apply_all_opts=False, profile=False):
def __init__(self, apply_all_opts=False, profile=False,
local_opt=opt.LocalOptGroup):
super(LocalGroupDB, self).__init__()
self.failure_callback = None
self.apply_all_opts = apply_all_opts
self.profile = profile
self.__position__ = {}
self.local_opt = local_opt
def register(self, name, obj, *tags, **kwargs):
super(LocalGroupDB, self).register(name, obj, *tags)
......@@ -429,7 +431,7 @@ class LocalGroupDB(DB):
opts = list(super(LocalGroupDB, self).query(*tags, **kwtags))
opts.sort(key=lambda obj: (self.__position__[obj.name], obj.name))
ret = opt.LocalOptGroup(*opts,
ret = self.local_opt(*opts,
apply_all_opts=self.apply_all_opts,
profile=self.profile)
return ret
......
......@@ -1550,52 +1550,6 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
return [[1], [1], [0], [0], [0]] # no connection to height, width, depth
class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase):
    """
    Implement the grad of downsample with max on the gpu.
    """
    __props__ = ('ignore_border', 'mode', 'ndim')

    def __init__(self, ignore_border, mode='max', ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        # The implementation lives in the pool_grad_grad.c kernel file.
        CGpuKernelBase.__init__(self, ['pool_grad_grad.c'],
                                'APPLY_SPECIFIC(pool_grad_grad)')
        # Only max pooling over 2 or 3 pooling dimensions is implemented
        # by the C kernel.
        assert self.mode == 'max'
        assert self.ndim in [2, 3]

    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__), pygpu.get_include()]

    def make_node(self, inp, out, out_grad, ws, stride, pad):
        # inp/out are the pooling input and output; out_grad carries the
        # incoming gradient.  All three must live in the same GPU context
        # and share the same rank (4D for 2D pooling, 5D for 3D).
        ctx_name = infer_context_name(inp, out, out_grad)
        inp = as_gpuarray_variable(inp, ctx_name)
        assert (inp.ndim in [4, 5])
        out = as_gpuarray_variable(out, ctx_name)
        assert (out_grad.ndim in [4, 5])
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert(out.ndim in [4, 5])
        assert (out_grad.ndim == inp.ndim)
        assert (inp.ndim == out.ndim)
        # ws/stride/pad are 1D vectors of per-dimension window size,
        # stride and padding.
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
        assert ws.type.ndim == 1
        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])

    def get_params(self, node):
        # The kernel runs in the GPU context of the first input.
        return node.inputs[0].type.context
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
    """Replace a GpuGemv node by its in-place variant (output 0 reuses input 0)."""
    replacement = gpugemv_inplace(*inputs)
    return [replacement]
......
......@@ -38,7 +38,7 @@ from .elemwise import GpuElemwise
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import (gpu_seqopt, register_opt,
from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2,
op_lifter, register_opt2)
from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
......@@ -2736,9 +2736,6 @@ def local_dnn_convi_output_merge(node, *inputs):
return [gpu_dnn_conv_gradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([Pool])
@register_opt2([Pool], 'fast_compile', 'cudnn')
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -2758,11 +2755,16 @@ def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
img_padded = pad_dims(img, 2, nd)
ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
return unpad_dims(ret_padded, img, 2, nd)
# Register the cuDNN pooling lifter in both pooling databases: pool_db
# holds node-signature lifters, pool_db2 holds GraphToGPU-signature
# optimizers.  position=0 sorts it before the plain gpuarray fallback
# registered at position=1, so cuDNN is tried first.
pool_db.register("local_gpua_pool_dnn_alternative",
                 op_lifter([Pool])(local_gpua_pool_dnn_alternative),
                 'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                 position=0)
pool_db2.register("local_gpua_pool_dnn_alternative",
                  local_optimizer([Pool])(local_gpua_pool_dnn_alternative),
                  'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                  position=0)
@register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad])
@register_opt2([MaxPoolGrad], 'fast_compile', 'cudnn')
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -2797,11 +2799,16 @@ def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
# cuDNN max-pool gradient: registered at position=0 so it is preferred
# over the gpuarray implementation (position=1) in both databases.
pool_db.register("local_gpua_pool_dnn_grad_stride",
                 op_lifter([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
                 'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                 position=0)
pool_db2.register("local_gpua_pool_dnn_grad_stride",
                  local_optimizer([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
                  'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                  position=0)
@register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad])
@register_opt2([AveragePoolGrad], 'fast_compile', 'cudnn')
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -2832,6 +2839,14 @@ def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
# cuDNN average-pool gradient: registered at position=0 so it is
# preferred over the gpuarray implementation (position=1).
pool_db.register("local_gpua_avg_pool_dnn_grad_stride",
                 op_lifter([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
                 'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                 position=0)
pool_db2.register("local_gpua_avg_pool_dnn_grad_stride",
                  local_optimizer([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
                  'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                  position=0)
@register_opt('cudnn', 'fast_compile')
......
......@@ -13,6 +13,7 @@ from theano import tensor, scalar, gof, config
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
LocalGroupDB,
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
......@@ -47,7 +48,8 @@ from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemmbatch_no_inplace,
gpugemv_no_inplace, gpugemv_inplace,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from .pool import (GpuPool, GpuMaxPoolGrad, GpuAveragePoolGrad,
GpuDownsampleFactorMaxGradGrad)
from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter,
gpu_sparse_block_outer,
......@@ -129,6 +131,9 @@ def register_opt2(tracks, *tags, **kwargs):
'''
def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__
if isinstance(local_opt, theano.gof.DB):
opt = local_opt
else:
opt = theano.gof.local_optimizer(tracks)(local_opt)
gpu_optimizer2.register(name, opt, 'fast_run', 'gpuarray', *tags)
return local_opt
......@@ -1594,6 +1599,99 @@ def local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs):
return [op(*inps)]
def local_gpu_pool(op, ctx_name, inputs, outputs):
    """Lift a CPU Pool op to GpuPool.

    Only 2D and 3D pooling are supported.  Inputs with more than two
    non-pooling dimensions are reshaped to 4D/5D around the GPU op and
    restored afterwards.
    """
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        # Unsupported pooling rank: leave the node for other optimizers.
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    gpu_op = GpuPool(op.ignore_border, op.mode, op.ndim)
    if img.ndim != nd + 2:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        img_padded = pad_dims(img, 2, nd)
        return unpad_dims(gpu_op(img_padded, ws, stride, pad), img, 2, nd)
    return gpu_op(img, ws, stride, pad)
# Pooling-specific optimizer databases.  pool_db collects node-signature
# lifters (default LocalOptGroup); pool_db2 collects GraphToGPU-signature
# optimizers (GraphToGPULocalOptGroup).  The gpuarray fallbacks below use
# position=1 so that cuDNN variants can register ahead of them at
# position=0.
pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2"
lifter = op_lifter([pool.Pool])(local_gpu_pool)
pool_db.register("local_gpu_pool", lifter,
                 'gpuarray', 'fast_compile', 'fast_run',
                 position=1)
pool_db2.register("local_gpu_pool",
                  local_optimizer([pool.Pool])(local_gpu_pool),
                  'gpuarray', 'fast_compile', 'fast_run',
                  position=1)
# Hook the pooling databases into the global GPU optimization passes.
register_opt('fast_compile', name='pool_db')(pool_db)
register_opt2([pool.Pool], 'fast_compile', name='pool_db2')(pool_db2)
def local_gpu_max_pool_grad(op, ctx_name, inputs, outputs):
    """Lift a CPU MaxPoolGrad op to GpuMaxPoolGrad.

    Only 2D and 3D pooling are supported.  Inputs with more than two
    non-pooling dimensions are reshaped to 4D/5D around the GPU op and
    restored afterwards.
    """
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
    img, pooled, pooled_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        # Unsupported pooling rank: leave the node for other optimizers.
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    pooled = gpu_contiguous(as_gpuarray_variable(pooled, ctx_name))
    pooled_grad = gpu_contiguous(as_gpuarray_variable(pooled_grad, ctx_name))
    gpu_op = GpuMaxPoolGrad(op.ignore_border, op.mode, op.ndim)
    if img.ndim != nd + 2:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        ret = gpu_op(pad_dims(img, 2, nd),
                     pad_dims(pooled, 2, nd),
                     pad_dims(pooled_grad, 2, nd),
                     ws, stride, pad)
        return unpad_dims(ret, img, 2, nd)
    return gpu_op(img, pooled, pooled_grad, ws, stride, pad)
# gpuarray max-pool gradient fallback, at position=1 (after cuDNN).
lifter = op_lifter([pool.MaxPoolGrad])(local_gpu_max_pool_grad)
pool_db.register("local_gpu_max_pool_grad", lifter,
                 'gpuarray', 'fast_compile', 'fast_run',
                 position=1)
pool_db2.register("local_gpu_max_pool_grad",
                  local_optimizer([pool.MaxPoolGrad])(local_gpu_max_pool_grad),
                  'gpuarray', 'fast_compile', 'fast_run',
                  position=1)
def local_gpu_average_pool_grad(op, ctx_name, inputs, outputs):
    """Lift a CPU AveragePoolGrad op to GpuAveragePoolGrad.

    Only 2D and 3D pooling are supported.  Inputs with more than two
    non-pooling dimensions are reshaped to 4D/5D around the GPU op and
    restored afterwards.
    """
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
    img, pooled_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        # Unsupported pooling rank: leave the node for other optimizers.
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    pooled_grad = gpu_contiguous(as_gpuarray_variable(pooled_grad, ctx_name))
    gpu_op = GpuAveragePoolGrad(op.ignore_border, op.mode, op.ndim)
    if img.ndim != nd + 2:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        ret = gpu_op(pad_dims(img, 2, nd),
                     pad_dims(pooled_grad, 2, nd),
                     ws, stride, pad)
        return unpad_dims(ret, img, 2, nd)
    return gpu_op(img, pooled_grad, ws, stride, pad)
# gpuarray average-pool gradient fallback, at position=1 (after cuDNN).
lifter = op_lifter([pool.AveragePoolGrad])(local_gpu_average_pool_grad)
pool_db.register("local_gpu_average_pool_grad", lifter,
                 'gpuarray', 'fast_compile', 'fast_run',
                 position=1)
pool_db2.register("local_gpu_average_pool_grad",
                  local_optimizer([pool.AveragePoolGrad])(local_gpu_average_pool_grad),
                  'gpuarray', 'fast_compile', 'fast_run',
                  position=1)
@register_opt()
@op_lifter([pool.DownsampleFactorMaxGradGrad])
@register_opt2([pool.DownsampleFactorMaxGradGrad])
......
Diff collapsed.
Diff collapsed.
#section kernels
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, * :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Backward pass of 2D average/sum pooling.  One thread per input element:
// it accumulates into gx[index] the contribution of every pooled output
// cell whose window covers that element.
KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
                                   const ga_size num, const ga_size channels, const ga_size height,
                                   const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
                                   GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *gz,
                                   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
                                   const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode,
                                   GLOBAL_MEM DTYPE_o0 *gx)
{
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // decompose the flat index into (n, c, h, w)
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size c = (index / width / height) % channels;
    const ga_size n = (index / width / height / channels);
    // range of pooled cells whose window includes (h, w)
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
    const DTYPE_i1* gz_slice = gz + offset;
    DTYPE_o0 collector = 0;
    for (ga_size ph=phstart; ph < phend; ++ph) {
      for (ga_size pw=pwstart; pw < pwend; ++pw) {
        if (sum_mode) {
          // BUG FIX: read from gz_slice (the (n, c) slice) instead of gz,
          // which always addressed batch 0 / channel 0.
          collector += gz_slice[ph*pooled_width + pw];
        } else {
          // figure out the pooling window size to average over
          const ga_size hstart = ph * stride_h - pad_h;
          const ga_size wstart = pw * stride_w - pad_w;
          const ga_size hend = min(hstart + kernel_h, height + pad_h);
          const ga_size wend = min(wstart + kernel_w, width + pad_w);
          const ga_size pool_size = (hend - hstart) * (wend - wstart);
          collector += gz_slice[ph*pooled_width + pw] / pool_size;
        }
      }
    }
    gx[index] = collector;
  }
}
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, size, size, * :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Backward pass of 3D average/sum pooling.  One thread per input element:
// it accumulates into gx[index] the contribution of every pooled output
// cell whose window covers that element.
KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
                                   const ga_size num, const ga_size channels, const ga_size depth,
                                   const ga_size height, const ga_size width, const ga_size pooled_depth,
                                   const ga_size pooled_height, const ga_size pooled_width,
                                   GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *gz,
                                   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
                                   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
                                   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
                                   const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_o0 *gx)
{
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // decompose the flat index into (n, c, d, h, w)
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size d = (index / width / height) % depth;
    const ga_size c = (index / width / height / depth) % channels;
    const ga_size n = (index / width / height / depth / channels);
    // range of pooled cells whose window includes (d, h, w)
    const ga_size pdstart = (d + pad_d < kernel_d) ? 0 : (d + pad_d - kernel_d) / stride_d + 1;
    const ga_size pdend = min((d + pad_d) / stride_d + 1, pooled_depth);
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
    const DTYPE_i1* gz_slice = gz + offset;
    DTYPE_o0 collector = 0;
    for (ga_size pd=pdstart; pd < pdend; ++pd) {
      for (ga_size ph=phstart; ph < phend; ++ph) {
        for (ga_size pw=pwstart; pw < pwend; ++pw) {
          // BUG FIX: both branches previously read gz[ph*pooled_width + pw],
          // ignoring the (n, c) offset AND the depth coordinate pd.  Index
          // the (n, c) slice at (pd, ph, pw) like max_pool3d_grad_kernel.
          if (sum_mode) {
            collector += gz_slice[(pd * pooled_height + ph) * pooled_width + pw];
          } else {
            // figure out the pooling window size to average over
            const ga_size dstart = pd * stride_d - pad_d;
            const ga_size hstart = ph * stride_h - pad_h;
            const ga_size wstart = pw * stride_w - pad_w;
            const ga_size dend = min(dstart + kernel_d, depth + pad_d);
            const ga_size hend = min(hstart + kernel_h, height + pad_h);
            const ga_size wend = min(wstart + kernel_w, width + pad_w);
            const ga_size pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
            collector += gz_slice[(pd * pooled_height + ph) * pooled_width + pw] / pool_size;
          }
        }
      }
    }
    gx[index] = collector;
  }
}
#section support_code_struct
// Host-side driver for GpuAveragePoolGrad: validates inputs, allocates
// gx with the shape of x, and launches the 2D or 3D gradient kernel.
// Returns 0 on success, 1 on error (with a Python exception set).
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
                                  PyGpuArrayObject *gz,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **gx,
                                  PyGpuContextObject *ctx) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
    {
      // BUG FIX: error messages previously named "GpuMaxPoolGrad"
      // (copy-paste from the max-pool driver).
      PyErr_Format(PyExc_ValueError,
                   "GpuAveragePoolGrad: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2
      || PyGpuArray_NDIM(gz) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuAveragePoolGrad: rank error");
      return 1;
    }
  if (theano_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
                         x->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuAveragePoolGrad: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    size_t w[3];
    size_t s[3];
    size_t p[3];
    // copy window size / stride / padding from the numpy vectors
    for(int i = 0; i < ndims; i++) {
      w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
      s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
      p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
    }
    int err;
    const size_t* z_dims = PyGpuArray_DIMS(gz);
    const size_t* x_dims = PyGpuArray_DIMS(x);
    if (ndims == 2) {
      // one thread per element of the input gradient gx
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
      err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3],
                                         z_dims[2], z_dims[3],
                                         x->ga.data, gz->ga.data,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
                                         INC_PAD, SUM_MODE, (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool2d_grad_kernel, err));
        return 1;
      }
    } else if (ndims == 3) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
      err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
                                         x->ga.data, gz->ga.data,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
                                         p[0], p[1], p[2], INC_PAD, SUM_MODE,
                                         (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool3d_grad_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
......@@ -18,11 +18,11 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
hstart = max(hstart, 0);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
wstart = max(wstart, 0);
const ga_size wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width;
......@@ -63,14 +63,14 @@ KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
dstart = max(dstart, 0);
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
hstart = max(hstart, 0);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
wstart = max(wstart, 0);
const ga_size wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width;
......@@ -137,24 +137,13 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
}
size_t max_threads_dim;
int err;
const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x);
// Get the max threads per blocks
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
err = max_pool2d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, z->ga.data, gx->ga.data,
......@@ -169,9 +158,7 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
}
else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
err = max_pool3d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, z->ga.data, gx->ga.data,
......
#section kernels
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, * :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Backward pass of 2D max pooling.  One thread per input element: the
// incoming gradient of a pooled cell is routed to the input element(s)
// whose value equals the pooled maximum.
KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
                                   const ga_size num, const ga_size channels, const ga_size height,
                                   const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
                                   GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gz,
                                   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
                                   const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_o0 *gx)
{
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // decompose the flat index into (n, c, h, w)
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size c = (index / width / height) % channels;
    const ga_size n = (index / width / height / channels);
    // range of pooled cells whose window includes (h, w)
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    // start of the (n, c) slice of the pooled output / its gradient
    const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
    const DTYPE_i1* z_slice = z + offset;
    const DTYPE_i2* gz_slice = gz + offset;
    DTYPE_o0 gradient = 0;
    for (ga_size ph=phstart; ph < phend; ++ph) {
      for (ga_size pw=pwstart; pw < pwend; ++pw) {
        // this input element was the max of that window: take the gradient
        if (x[index] == z_slice[ph * pooled_width + pw]) {
          gradient += gz_slice[ph * pooled_width + pw];
        }
      }
    }
    gx[index] = gradient;
  }
}
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, size, size, size, * :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Backward pass of 3D max pooling.  One thread per input element: the
// incoming gradient of a pooled cell is routed to the input element(s)
// whose value equals the pooled maximum.
KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
                                   const ga_size num, const ga_size channels, const ga_size depth,
                                   const ga_size height, const ga_size width, const ga_size pooled_depth,
                                   const ga_size pooled_height, const ga_size pooled_width,
                                   GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gz,
                                   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
                                   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
                                   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
                                   GLOBAL_MEM DTYPE_o0 *gx)
{
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // decompose the flat index into (n, c, d, h, w)
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size d = (index / width / height) % depth;
    const ga_size c = (index / width / height / depth) % channels;
    const ga_size n = (index / width / height / depth / channels);
    // range of pooled cells whose window includes (d, h, w)
    const ga_size pdstart = (d + pad_d < kernel_d) ? 0 : (d + pad_d - kernel_d) / stride_d + 1;
    const ga_size pdend = min((d + pad_d) / stride_d + 1, pooled_depth);
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    // start of the (n, c) slice of the pooled output / its gradient
    const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
    const DTYPE_i1* z_slice = z + offset;
    const DTYPE_i2* gz_slice = gz + offset;
    DTYPE_o0 gradient = 0;
    for (ga_size pd=pdstart; pd < pdend; ++pd) {
      for (ga_size ph=phstart; ph < phend; ++ph) {
        for (ga_size pw=pwstart; pw < pwend; ++pw) {
          // this input element was the max of that window: take the gradient
          if (x[index] == z_slice[(pd * pooled_height + ph) * pooled_width + pw]) {
            gradient += gz_slice[(pd * pooled_height + ph) * pooled_width + pw];
          }
        }
      }
    }
    gx[index] = gradient;
  }
}
#section support_code_struct
// Host-side driver for GpuMaxPoolGrad: validates inputs, allocates gx
// with the shape of x, and launches the 2D or 3D gradient kernel.
// Returns 0 on success, 1 on error (with a Python exception set).
int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
                                  PyGpuArrayObject *z,
                                  PyGpuArrayObject *gz,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **gx,
                                  PyGpuContextObject *ctx) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&z->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuMaxPoolGrad: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2
      || PyGpuArray_NDIM(z) != ndims + 2
      || PyGpuArray_NDIM(gz) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuMaxPoolGrad: rank error");
      return 1;
    }
  if (theano_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
                         x->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolGrad: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    size_t w[3];
    size_t s[3];
    size_t p[3];
    // copy window size / stride / padding from the numpy vectors
    for(int i = 0; i < ndims; i++) {
      w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
      s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
      p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
    }
    int err;
    const size_t* z_dims = PyGpuArray_DIMS(z);
    const size_t* x_dims = PyGpuArray_DIMS(x);
    if (ndims == 2) {
      // one thread per element of the input gradient gx
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
      err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3],
                                         z_dims[2], z_dims[3],
                                         x->ga.data, z->ga.data, gz->ga.data,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
                                         (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolGrad: max_pool2d_grad_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_grad_kernel, err));
        return 1;
      }
    } else if (ndims == 3) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
      err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
                                         x->ga.data, z->ga.data, gz->ga.data,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
                                         p[0], p[1], p[2], (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolGrad: max_pool3d_grad_kernel %s.",
                     GpuKernel_error(&k_max_pool3d_grad_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
......@@ -2,26 +2,21 @@ from __future__ import absolute_import, print_function, division
from unittest import TestCase
from nose.plugins.skip import SkipTest
import itertools
import copy
import numpy
import theano
from theano import gradient
from theano import tensor
from theano.tests import unittest_tools as utt
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22, batched_dot
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from theano.tensor.signal.pool import Pool, DownsampleFactorMaxGradGrad
from .. import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu
from .config import mode_with_gpu
from .test_basic_ops import makeTester, rand
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemmbatch_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22, GpuDownsampleFactorMaxGradGrad)
GpuGer, gpu_dot22)
GpuGemvTester = makeTester(
......@@ -133,129 +128,3 @@ GpuDot22Tester = makeTester(
# test9=[rand(0, 0), rand(0, 0)],
)
)
def test_max_pool2d_grad_grad():
    """Check GpuDownsampleFactorMaxGradGrad against the CPU
    DownsampleFactorMaxGradGrad over many shapes, window sizes,
    strides and the two (ignore_border, pad) combinations tested.
    """
    shps = [(1, 12),
            (1, 1, 12),
            (1, 1, 1, 12),
            (1, 1, 2, 2),
            (1, 1, 1, 1),
            (1, 1, 4, 4),
            (1, 1, 10, 11),
            (1, 2, 2, 2),
            (3, 5, 4, 4),
            (25, 1, 7, 7),
            (1, 1, 12, 12),
            (1, 1, 2, 14),
            (1, 1, 12, 14),
            (1, 1, 14, 14),
            (1, 1, 16, 16),
            (1, 1, 18, 18),
            (1, 1, 24, 24),
            (1, 6, 24, 24),
            (10, 1, 24, 24),
            (10, 6, 24, 24),
            (30, 6, 12, 12),
            (30, 2, 24, 24),
            (30, 6, 24, 24),
            (10, 10, 10, 11),
            (1, 1, 10, 1025),
            (1, 1, 10, 1023),
            (1, 1, 1025, 10),
            (1, 1, 1023, 10), ]
    # Shuffle deterministically so runs cover shapes in varied order.
    numpy.random.RandomState(utt.fetch_seed()).shuffle(shps)
    test_ds = (2, 2), (3, 2), (1, 1)
    test_st = (2, 2), (3, 2), (1, 1)
    for shp in shps:
        for ds, st in itertools.product(test_ds, test_st):
            if ds[0] > shp[-2] or ds[1] > shp[-1]:
                continue
            # Only two combinations: (ignore_border=True, pad=(1, 1)) and
            # (ignore_border=False, pad=(0, 0)).
            for ignore_border, pad in zip((True, False), [(1, 1), (0, 0)]):
                if pad[0] >= ds[0] or pad[1] >= ds[1]:
                    continue
                # print('test_downsample', shp, ds, st, pad, ignore_border)
                ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)
                a = theano.shared(rand(*shp), 'a')
                # Lop of the pooling gradient: exercises the grad-grad op.
                ggf = gradient.Lop(tensor.grad((ds_op(
                    tensor.as_tensor_variable(a), ds, st, pad)**2).sum(), a), a, a)
                ref_mode = copy.copy(mode_without_gpu)
                ref_mode.check_py_code = False
                gpu_mode = copy.copy(mode_with_gpu)
                gpu_mode.check_py_code = False
                gg = theano.function([], ggf, mode=gpu_mode)
                gg2 = theano.function([], ggf, mode=ref_mode)
                # The GPU graph must actually use the GPU op, and the
                # reference graph the CPU op.
                assert any([
                    isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                    for node in gg.maker.fgraph.toposort()
                ])
                assert any([
                    isinstance(node.op, DownsampleFactorMaxGradGrad)
                    for node in gg2.maker.fgraph.toposort()
                ])
                assert numpy.allclose(gg(), gg2()), (shp, ds, st,
                                                     ignore_border)
def test_max_pool3d_grad_grad():
    """Same as test_max_pool2d_grad_grad but for 3D (5D-tensor) pooling."""
    shps = [(1, 1, 12),
            (1, 1, 1, 1, 1),
            (1, 1, 1, 1, 1025),
            (1, 1, 2, 2, 2),
            (1, 1, 7, 7, 7),
            (1, 1, 9, 10, 11),
            (1, 6, 18, 18, 18),
            (1, 1, 6, 24, 24),
            (1, 10, 1, 24, 24),
            (1, 10, 6, 24, 24),
            (1, 30, 6, 12, 12),
            (1, 30, 2, 24, 24),
            (1, 30, 6, 24, 24),
            (1, 10, 10, 10, 11),
            (1, 1, 10, 10, 1025),
            (1, 1, 10, 10, 1023),
            (1, 1, 10, 1025, 10),
            (1, 1, 10, 1023, 10), ]
    # Shuffle deterministically so runs cover shapes in varied order.
    numpy.random.RandomState(utt.fetch_seed()).shuffle(shps)
    test_ds = (2, 2, 2), (3, 2, 3), (1, 1, 1)
    test_st = (2, 2, 2), (2, 3, 2), (1, 1, 1)
    for shp in shps:
        for ds, st in itertools.product(test_ds, test_st):
            if ds[0] > shp[-3] or ds[1] > shp[-2] or ds[2] > shp[-1]:
                continue
            # Only two combinations: (ignore_border=True, pad=(1, 1, 1))
            # and (ignore_border=False, pad=(0, 0, 0)).
            for ignore_border, pad in zip((True, False), [(1, 1, 1), (0, 0, 0)]):
                if pad[0] >= ds[0] or pad[1] >= ds[1] or pad[2] >= ds[2]:
                    continue
                # print('test_downsample', shp, ds, st, pad, ignore_border)
                ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)
                a = theano.shared(rand(*shp), 'a')
                # Lop of the pooling gradient: exercises the grad-grad op.
                ggf = gradient.Lop(tensor.grad((ds_op(
                    tensor.as_tensor_variable(a), ds, st, pad)**2).sum(), a), a, a)
                ref_mode = copy.copy(mode_without_gpu)
                ref_mode.check_py_code = False
                gpu_mode = copy.copy(mode_with_gpu)
                gpu_mode.check_py_code = False
                gg = theano.function([], ggf, mode=gpu_mode)
                gg2 = theano.function([], ggf, mode=ref_mode)
                # The GPU graph must actually use the GPU op, and the
                # reference graph the CPU op.
                assert any([
                    isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                    for node in gg.maker.fgraph.toposort()
                ])
                assert any([
                    isinstance(node.op, DownsampleFactorMaxGradGrad)
                    for node in gg2.maker.fgraph.toposort()
                ])
                assert numpy.allclose(gg(), gg2()), (shp, ds, st,
                                                     ignore_border)
from __future__ import absolute_import, print_function, division
import unittest
import copy
import itertools
import numpy
import theano
from theano import gradient
from theano import tensor
from theano.tensor.signal.pool import (Pool, MaxPoolGrad, AveragePoolGrad,
DownsampleFactorMaxGradGrad)
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand
from ..pool import (GpuPool, GpuMaxPoolGrad, GpuAveragePoolGrad,
GpuDownsampleFactorMaxGradGrad)
class TestPool(unittest.TestCase):
    """Parameter-validation tests for the GpuPool op."""

    def test_pool_py_interface(self):
        """Invalid ws/pad combinations must raise when the op is applied."""
        x = theano.shared(rand(2, 2, 2, 2), 'a')
        x = tensor.as_tensor_variable(x)
        with self.assertRaises(ValueError):
            # pad >= ws is rejected even with ignore_border=True
            pool_op = GpuPool(ignore_border=True, ndim=2)
            pool_op(x, [2, 2], pad=[3, 3])
        with self.assertRaises(ValueError):
            # non-zero pad is rejected when ignore_border=False
            pool_op = GpuPool(ignore_border=False, ndim=2)
            pool_op(x, [2, 2], pad=[1, 1])

    def test_pool_c_interface(self):
        """Same validation with a symbolic pad, checked via the C code path."""
        run_mode = copy.copy(mode_with_gpu).excluding("cudnn")
        run_mode.check_py_code = False
        x = theano.shared(rand(2, 2, 2, 2), 'a')
        x = tensor.as_tensor_variable(x)
        with self.assertRaises(ValueError):
            # non-zero pad is rejected when ignore_border=False
            pool_op = GpuPool(ignore_border=False, ndim=2)
            sym_pad = tensor.as_tensor_variable([1, 1])
            fn = theano.function([], pool_op(x, [2, 2], pad=sym_pad),
                                 mode=run_mode)
            fn()
def test_pool2d():
    """Compare the 2D gpuarray pooling ops with the CPU reference ops.

    For every shape/mode/window/stride/padding combination this checks
    the forward pass (GpuPool vs. Pool), the gradient (GpuMaxPoolGrad or
    GpuAveragePoolGrad vs. their CPU counterparts) and, for max pooling
    only, the gradient of the gradient (GpuDownsampleFactorMaxGradGrad
    vs. DownsampleFactorMaxGradGrad).
    """
    def has_op(fn, op_cls):
        # True when the compiled function's graph contains op_cls.
        return any(isinstance(node.op, op_cls)
                   for node in fn.maker.fgraph.toposort())

    shps = [(1, 12),
            (1, 1, 12),
            (1, 1, 1, 12),
            (1, 1, 2, 2),
            (1, 1, 1, 1),
            (1, 1, 4, 4),
            (1, 1, 10, 11),
            (1, 2, 2, 2),
            (3, 5, 4, 4),
            (25, 1, 7, 7),
            (1, 1, 12, 12),
            (1, 1, 2, 14),
            (1, 1, 12, 14),
            (1, 1, 14, 14),
            (1, 1, 16, 16),
            (1, 1, 18, 18),
            (1, 1, 24, 24),
            (1, 6, 24, 24),
            (10, 1, 24, 24),
            (10, 6, 24, 24),
            (30, 6, 12, 12),
            (30, 2, 24, 24),
            (30, 6, 24, 24),
            (10, 10, 10, 11),
            (1, 1, 10, 1025),
            (1, 1, 10, 1023),
            (1, 1, 1025, 10),
            (1, 1, 1023, 10),
            (3, 2, 16, 16, 16),
            (3, 2, 6, 6, 6, 5),
            (3, 2, 6, 6, 6, 5, 7), ]
    # Seeded shuffle: test order varies per seed but stays reproducible.
    numpy.random.RandomState(utt.fetch_seed()).shuffle(shps)
    test_ws = (2, 2), (3, 2), (1, 1)
    test_st = (2, 2), (3, 2), (1, 1)
    test_mode = ['max', 'sum', 'average_inc_pad', 'average_exc_pad']
    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = copy.copy(mode_with_gpu).excluding("cudnn")
    gpu_mode.check_py_code = False
    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Windows larger than the pooled trailing dims are invalid.
            if ws[0] > shp[-2] or ws[1] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1), (0, 0)]):
                # Padding must stay strictly below the window size.
                if pad[0] >= ws[0] or pad[1] >= ws[1]:
                    continue
                # 'average_exc_pad' does not support non-zero padding.
                if mode == 'average_exc_pad' and (pad[0] > 0 or pad[1] > 0):
                    continue
                # print('test_pool2d', shp, ws, st, pad, mode, ignore_border)
                ds_op = Pool(ndim=len(ws), mode=mode,
                             ignore_border=ignore_border)
                a = theano.shared(rand(*shp), 'a')
                a_pooled = ds_op(tensor.as_tensor_variable(a), ws, st, pad)

                # Forward pass.
                fwd_gpu = theano.function([], a_pooled, mode=gpu_mode)
                fwd_ref = theano.function([], a_pooled, mode=ref_mode)
                assert has_op(fwd_gpu, GpuPool)
                assert has_op(fwd_ref, Pool)
                assert numpy.allclose(fwd_gpu(), fwd_ref()), (
                    shp, ws, st, pad, mode, ignore_border)

                # Gradient.
                a_pooled_grad = tensor.grad(a_pooled.sum(), a)
                grad_gpu = theano.function([], a_pooled_grad, mode=gpu_mode)
                grad_ref = theano.function([], a_pooled_grad, mode=ref_mode)
                if mode == 'max':
                    grad_op_gpu, grad_op_ref = GpuMaxPoolGrad, MaxPoolGrad
                else:
                    grad_op_gpu, grad_op_ref = (GpuAveragePoolGrad,
                                                AveragePoolGrad)
                assert has_op(grad_gpu, grad_op_gpu)
                assert has_op(grad_ref, grad_op_ref)
                assert numpy.allclose(grad_gpu(), grad_ref()), (
                    shp, ws, st, pad, mode, ignore_border)

                # Grad-of-grad has a dedicated op only for max pooling;
                # for average pooling it is just the average pooling grad.
                if mode != 'max':
                    continue
                ggf = gradient.Lop(tensor.grad((a_pooled**2).sum(), a), a, a)
                gg_gpu = theano.function([], ggf, mode=gpu_mode)
                gg_ref = theano.function([], ggf, mode=ref_mode)
                assert has_op(gg_gpu, GpuDownsampleFactorMaxGradGrad)
                assert has_op(gg_ref, DownsampleFactorMaxGradGrad)
                assert numpy.allclose(gg_gpu(), gg_ref()), (
                    shp, ws, st, pad, mode, ignore_border)
def test_pool3d():
    """Compare the 3D gpuarray pooling ops with the CPU reference ops.

    Same structure as ``test_pool2d`` but with 3D pooling windows:
    forward pass, gradient and (for max pooling) gradient-of-gradient
    are each compiled with and without the GPU and compared.
    """
    def has_op(fn, op_cls):
        # True when the compiled function's graph contains op_cls.
        return any(isinstance(node.op, op_cls)
                   for node in fn.maker.fgraph.toposort())

    shps = [(1, 1, 12),
            (1, 1, 1, 1, 1),
            (1, 1, 1, 1, 1025),
            (1, 1, 2, 2, 2),
            (1, 1, 7, 7, 7),
            (1, 1, 9, 10, 11),
            (1, 6, 18, 18, 18),
            (1, 1, 6, 24, 24),
            (1, 10, 1, 24, 24),
            (1, 10, 6, 24, 24),
            (1, 30, 6, 12, 12),
            (1, 30, 2, 24, 24),
            (1, 30, 6, 24, 24),
            (1, 10, 10, 10, 11),
            (1, 1, 10, 10, 1025),
            (1, 1, 10, 10, 1023),
            (1, 1, 10, 1025, 10),
            (1, 1, 10, 1023, 10),
            (3, 2, 6, 6, 6, 5),
            (3, 2, 6, 6, 6, 5, 7), ]
    # Seeded shuffle: test order varies per seed but stays reproducible.
    numpy.random.RandomState(utt.fetch_seed()).shuffle(shps)
    test_ws = (2, 2, 2), (3, 2, 3), (1, 1, 1)
    test_st = (2, 2, 2), (2, 3, 2), (1, 1, 1)
    test_mode = ['max', 'sum', 'average_inc_pad', 'average_exc_pad']
    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = copy.copy(mode_with_gpu).excluding("cudnn")
    gpu_mode.check_py_code = False
    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Windows larger than the pooled trailing dims are invalid.
            if ws[0] > shp[-3] or ws[1] > shp[-2] or ws[2] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1, 1), (0, 0, 0)]):
                # Padding must stay strictly below the window size.
                if pad[0] >= ws[0] or pad[1] >= ws[1] or pad[2] >= ws[2]:
                    continue
                # 'average_exc_pad' does not support non-zero padding.
                if mode == 'average_exc_pad' and (pad[0] > 0 or
                                                  pad[1] > 0 or
                                                  pad[2] > 0):
                    continue
                # print('test_pool3d', shp, ws, st, pad, mode, ignore_border)
                ds_op = Pool(ndim=len(ws), mode=mode,
                             ignore_border=ignore_border)
                a = theano.shared(rand(*shp), 'a')
                a_pooled = ds_op(tensor.as_tensor_variable(a), ws, st, pad)

                # Forward pass.
                fwd_gpu = theano.function([], a_pooled, mode=gpu_mode)
                fwd_ref = theano.function([], a_pooled, mode=ref_mode)
                assert has_op(fwd_gpu, GpuPool)
                assert has_op(fwd_ref, Pool)
                assert numpy.allclose(fwd_gpu(), fwd_ref()), (
                    shp, ws, st, pad, mode, ignore_border)

                # Gradient.
                a_pooled_grad = tensor.grad(a_pooled.sum(), a)
                grad_gpu = theano.function([], a_pooled_grad, mode=gpu_mode)
                grad_ref = theano.function([], a_pooled_grad, mode=ref_mode)
                if mode == 'max':
                    grad_op_gpu, grad_op_ref = GpuMaxPoolGrad, MaxPoolGrad
                else:
                    grad_op_gpu, grad_op_ref = (GpuAveragePoolGrad,
                                                AveragePoolGrad)
                assert has_op(grad_gpu, grad_op_gpu)
                assert has_op(grad_ref, grad_op_ref)
                assert numpy.allclose(grad_gpu(), grad_ref()), (
                    shp, ws, st, pad, mode, ignore_border)

                # Grad-of-grad has a dedicated op only for max pooling;
                # for average pooling it is just the average pooling grad.
                if mode != 'max':
                    continue
                ggf = gradient.Lop(tensor.grad((a_pooled**2).sum(), a), a, a)
                gg_gpu = theano.function([], ggf, mode=gpu_mode)
                gg_ref = theano.function([], ggf, mode=ref_mode)
                assert has_op(gg_gpu, GpuDownsampleFactorMaxGradGrad)
                assert has_op(gg_ref, DownsampleFactorMaxGradGrad)
                assert numpy.allclose(gg_gpu(), gg_ref()), (
                    shp, ws, st, pad, mode, ignore_border)
# NOTE: stray web-page footer text (non-code residue from the HTML export
# of the code-review page) was removed here; it was not part of the module.