提交 e514f4d3 作者: Vikram

GPU code and some more tests

上级 cd10a53c
...@@ -468,16 +468,33 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -468,16 +468,33 @@ class BaseGpuCorrMM(CGpuKernelBase):
def __init__(self, border_mode="valid", subsample=(1, 1), def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1, unshared=False): filter_dilation=(1, 1), num_groups=1, unshared=False):
if isinstance(border_mode, integer_types): if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode) if border_mode < 0:
if isinstance(border_mode, tuple): raise ValueError(
pad_h, pad_w = map(int, border_mode) 'invalid border_mode {}, which must be a '
border_mode = (pad_h, pad_w) 'non-negative integer'.format(border_mode))
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or border_mode = ((border_mode, border_mode),) * 2
border_mode in ('valid', 'full', 'half')): elif isinstance(border_mode, tuple):
if len(border_mode) != 2:
raise ValueError(
'invalid border_mode {} which must be a '
'tuple of length 2'.format(border_mode))
border = ()
for mode in border_mode:
if isinstance(mode, integer_types) and mode >= 0:
border += ((mode, mode),)
elif isinstance(mode, tuple) and len(mode) == 2 and \
min(mode) >= 0:
border += ((int(mode[0]), int(mode[1])),)
else:
raise ValueError(
'invalid border mode {}. The tuple can only contain '
'integers or tuples of length 2'.format(border_mode))
border_mode = border
elif border_mode not in ('valid', 'full', 'half'):
raise ValueError( raise ValueError(
'invalid border_mode {}, which must be either ' 'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a pair of' '"valid", "full", "half", an integer or a tuple '
' integers'.format(border_mode)) 'of length 2'.format(border_mode))
self.border_mode = border_mode self.border_mode = border_mode
if len(subsample) != 2: if len(subsample) != 2:
raise ValueError("subsample must have two elements") raise ValueError("subsample must have two elements")
...@@ -495,7 +512,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -495,7 +512,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def pad(self): def pad(self):
if self.border_mode != 'valid': if self.border_mode != 'valid':
return self.border_mode return self.border_mode
return (0, 0) return ((0, 0),) * 2
def __str__(self): def __str__(self):
return '%s{%s, %s, %s, %s, %s}' % ( return '%s{%s, %s, %s, %s, %s}' % (
...@@ -537,7 +554,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -537,7 +554,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def c_code_cache_version(self): def c_code_cache_version(self):
# Raise this whenever modifying the C code (including the file). # Raise this whenever modifying the C code (including the file).
return (11,) return (12,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None): def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
""" """
...@@ -587,14 +604,14 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -587,14 +604,14 @@ class BaseGpuCorrMM(CGpuKernelBase):
numgroups = self.num_groups numgroups = self.num_groups
unshared = int(self.unshared) unshared = int(self.unshared)
if self.border_mode == "half": if self.border_mode == "half":
padH = padW = -1 padH_l = padH_r = padW_l = padW_r = -1
elif self.border_mode == "full": elif self.border_mode == "full":
padH = padW = -2 padH_l = padH_r = padW_l = padW_r = -2
elif isinstance(self.border_mode, tuple): elif isinstance(self.border_mode, tuple):
padH, padW = self.border_mode (padH_l, padH_r), (padW_l, padW_r) = self.border_mode
else: else:
assert self.border_mode == "valid" assert self.border_mode == "valid"
padH = padW = 0 padH_l = padH_r = padW_l = padW_r = 0
if direction == "forward": if direction == "forward":
direction = 0 direction = 0
out = top out = top
...@@ -613,13 +630,13 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -613,13 +630,13 @@ class BaseGpuCorrMM(CGpuKernelBase):
if height: if height:
height = '(*(npy_int*)(PyArray_DATA(%s)))' % height height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
else: else:
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)): if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH_l == -1)):
raise ValueError("height must be given for backprop with vertical sampling or pad='half'") raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
height = '-1' height = '-1'
if width: if width:
width = '(*(npy_int*)(PyArray_DATA(%s)))' % width width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
else: else:
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)): if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW_l == -1)):
raise ValueError("width must be given for backprop with horizontal sampling or pad='half'") raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
width = '-1' width = '-1'
...@@ -635,8 +652,10 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -635,8 +652,10 @@ class BaseGpuCorrMM(CGpuKernelBase):
size_t dW = %(dW)s; size_t dW = %(dW)s;
size_t dilH = %(dilH)s; size_t dilH = %(dilH)s;
size_t dilW = %(dilW)s; size_t dilW = %(dilW)s;
int padH = %(padH)s; int padH_l = %(padH_l)s;
int padW = %(padW)s; int padH_r = %(padH_r)s;
int padW_l = %(padW_l)s;
int padW_r = %(padW_r)s;
int numgroups = %(numgroups)s; int numgroups = %(numgroups)s;
int unshared = %(unshared)s; int unshared = %(unshared)s;
...@@ -662,22 +681,22 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -662,22 +681,22 @@ class BaseGpuCorrMM(CGpuKernelBase):
// kernel height is specified (perhaps vertical subsampling or half padding) // kernel height is specified (perhaps vertical subsampling or half padding)
kH = %(height)s; kH = %(height)s;
} }
else if (padH == -2) { else if (padH_l == -2 || padH_r == -2) {
// vertical full padding, we can infer the kernel height // vertical full padding, we can infer the kernel height
kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1; kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
} }
else { else {
// explicit padding, we can infer the kernel height // explicit padding, we can infer the kernel height
kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ; kH = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
} }
if (%(width)s != -1) { if (%(width)s != -1) {
kW = %(width)s; kW = %(width)s;
} }
else if (padW == -2) { else if (padW_l == -2 || padW_r == -2) {
kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
} }
else { else {
kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; kW = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
} }
} }
...@@ -686,23 +705,23 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -686,23 +705,23 @@ class BaseGpuCorrMM(CGpuKernelBase):
dil_kW = (kW - 1) * dilW + 1; dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested // Auto-padding if requested
if (padH == -1) { // vertical half padding if (padH_l == -1 || padH_r == -1) { // vertical half padding
padH = dil_kH / 2; padH_l = padH_r = dil_kH / 2;
} }
else if (padH == -2) { // vertical full padding else if (padH_l == -2 || padH_r == -2) { // vertical full padding
padH = dil_kH - 1; padH_l = padH_r = dil_kH - 1;
} }
else if (padH < 0) { else if (padH_l < 0 || padH_r < 0) {
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2"); PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
%(fail)s %(fail)s
} }
if (padW == -1) { // horizontal half padding if (padW_l == -1 || padW_r == -1) { // horizontal half padding
padW = dil_kW / 2; padW_l = padW_r = dil_kW / 2;
} }
else if (padW == -2) { // horizontal full padding else if (padW_l == -2 || padW_r == -2) { // horizontal full padding
padW = dil_kW - 1; padW_l = padW_r = dil_kW - 1;
} }
else if (padW < 0) { else if (padW_l < 0 || padW_r < 0) {
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2"); PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
%(fail)s %(fail)s
} }
...@@ -718,11 +737,11 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -718,11 +737,11 @@ class BaseGpuCorrMM(CGpuKernelBase):
switch(direction) { switch(direction) {
case 0: // forward pass case 0: // forward pass
// output is top: (batchsize, num_filters, height, width) // output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1 // height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = PyGpuArray_DIMS(bottom)[0]; out_dim[0] = PyGpuArray_DIMS(bottom)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[0]; out_dim[1] = PyGpuArray_DIMS(weights)[0];
out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1; out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1; out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
out_typecode = bottom->ga.typecode; out_typecode = bottom->ga.typecode;
out_context = bottom->context; out_context = bottom->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
...@@ -810,8 +829,8 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -810,8 +829,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - pad_l - pad_r
out_dim[0] = PyGpuArray_DIMS(top)[0]; out_dim[0] = PyGpuArray_DIMS(top)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups; out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups;
out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - 2*padH; out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r;
out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - 2*padW; out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r;
out_typecode = top->ga.typecode; out_typecode = top->ga.typecode;
out_context = top->context; out_context = top->context;
if (unshared) { if (unshared) {
...@@ -884,7 +903,8 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -884,7 +903,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
} }
// Call GPU code // Call GPU code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared); out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW,
padH_l, padH_r, padW_l, padW_r, numgroups, unshared);
if (out2==NULL){ if (out2==NULL){
%(fail)s %(fail)s
} }
......
...@@ -42,7 +42,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n, ...@@ -42,7 +42,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const ga_size height, const ga_size width, const ga_size height, const ga_size width,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_h, const ga_size dilation_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_hl, const ga_size pad_wl,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_col, GLOBAL_MEM DTYPE_INPUT_0 * data_col,
...@@ -57,8 +57,8 @@ KERNEL void dilated_im2col_kernel(const ga_size n, ...@@ -57,8 +57,8 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const ga_size w_col = index % width_col; const ga_size w_col = index % width_col;
const ga_size c_im = h_index / height_col; const ga_size c_im = h_index / height_col;
const ga_size c_col = c_im * kernel_h * kernel_w; const ga_size c_col = c_im * kernel_h * kernel_w;
const ga_size h_offset = h_col * stride_h - pad_h; const ga_size h_offset = h_col * stride_h - pad_hl;
const ga_size w_offset = w_col * stride_w - pad_w; const ga_size w_offset = w_col * stride_w - pad_wl;
GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col; GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset; GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
...@@ -86,7 +86,7 @@ KERNEL void im2col_kernel(const ga_size n, ...@@ -86,7 +86,7 @@ KERNEL void im2col_kernel(const ga_size n,
// data_im_offset is an offset of elements in the array // data_im_offset is an offset of elements in the array
const ga_size height, const ga_size width, const ga_size height, const ga_size width,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_hl, const ga_size pad_wl,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_col, GLOBAL_MEM DTYPE_INPUT_0 * data_col,
...@@ -101,8 +101,8 @@ KERNEL void im2col_kernel(const ga_size n, ...@@ -101,8 +101,8 @@ KERNEL void im2col_kernel(const ga_size n,
const ga_size w_col = index % width_col; const ga_size w_col = index % width_col;
const ga_size c_im = h_index / height_col; const ga_size c_im = h_index / height_col;
const ga_size c_col = c_im * kernel_h * kernel_w; const ga_size c_col = c_im * kernel_h * kernel_w;
const ga_size h_offset = h_col * stride_h - pad_h; const ga_size h_offset = h_col * stride_h - pad_hl;
const ga_size w_offset = w_col * stride_w - pad_w; const ga_size w_offset = w_col * stride_w - pad_wl;
GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col; GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset; GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
...@@ -127,7 +127,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n, ...@@ -127,7 +127,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
const ga_size height, const ga_size width, const ga_size channels, const ga_size height, const ga_size width, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_h, const ga_size dilation_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_hl, const ga_size pad_wl,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_im, GLOBAL_MEM DTYPE_INPUT_0 * data_im,
...@@ -141,8 +141,8 @@ KERNEL void dilated_col2im_kernel(const ga_size n, ...@@ -141,8 +141,8 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
DTYPE_INPUT_0 val = 0; DTYPE_INPUT_0 val = 0;
const ga_size w_im = index % width + pad_w; const ga_size w_im = index % width + pad_wl;
const ga_size h_im = (index / width) % height + pad_h; const ga_size h_im = (index / width) % height + pad_hl;
const ga_size c_im = index / (width * height); const ga_size c_im = index / (width * height);
ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1; ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1; ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
...@@ -177,7 +177,7 @@ KERNEL void col2im_kernel(const ga_size n, ...@@ -177,7 +177,7 @@ KERNEL void col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size channels, const ga_size height, const ga_size width, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_hl, const ga_size pad_wl,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_im, GLOBAL_MEM DTYPE_INPUT_0 * data_im,
...@@ -191,8 +191,8 @@ KERNEL void col2im_kernel(const ga_size n, ...@@ -191,8 +191,8 @@ KERNEL void col2im_kernel(const ga_size n,
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
DTYPE_INPUT_0 val = 0; DTYPE_INPUT_0 val = 0;
const ga_size w_im = index % width + pad_w; const ga_size w_im = index % width + pad_wl;
const ga_size h_im = (index / width) % height + pad_h; const ga_size h_im = (index / width) % height + pad_hl;
const ga_size c_im = index / (width * height); const ga_size c_im = index / (width * height);
// compute the start and end of the output // compute the start and end of the output
const ga_size w_col_start = const ga_size w_col_start =
...@@ -254,15 +254,16 @@ int rgemm(cb_order o, cb_transpose tA, cb_transpose tB, ...@@ -254,15 +254,16 @@ int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels, int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels,
const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
const size_t dilation_h, const size_t dilation_w, const size_t dilation_h, const size_t dilation_w,
const size_t pad_h, const size_t pad_w, const size_t pad_hl, const size_t pad_hr,
const size_t pad_wl, const size_t pad_wr,
const size_t stride_h, const size_t stride_w, const size_t stride_h, const size_t stride_w,
GpuArray *data_col) { GpuArray *data_col) {
// We are going to launch channels * height_col * width_col kernels, each // We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid. // kernel responsible for copying a single-channel grid.
size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1; size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1; size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1; size_t height_col = (height + pad_hl + pad_hr - dil_kernel_h) / stride_h + 1;
size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1; size_t width_col = (width + pad_wl + pad_wr - dil_kernel_w) / stride_w + 1;
size_t num_kernels = channels * height_col * width_col; size_t num_kernels = channels * height_col * width_col;
int err; int err;
if (dilation_h != 1 || dilation_w != 1) { if (dilation_h != 1 || dilation_w != 1) {
...@@ -270,7 +271,7 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels ...@@ -270,7 +271,7 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_im->data, data_im->offset, data_im_offset, num_kernels, data_im->data, data_im->offset, data_im_offset,
height, width, kernel_h, kernel_w, height, width, kernel_h, kernel_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col, dilation_h, dilation_w, pad_hl, pad_wl, stride_h, stride_w, height_col,
width_col, data_col->data, data_col->offset); width_col, data_col->data, data_col->offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -282,7 +283,7 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels ...@@ -282,7 +283,7 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_im->data, data_im->offset, data_im_offset, num_kernels, data_im->data, data_im->offset, data_im_offset,
height, width, kernel_h, kernel_w, height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, height_col, pad_hl, pad_wl, stride_h, stride_w, height_col,
width_col, data_col->data, data_col->offset); width_col, data_col->data, data_col->offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -296,12 +297,12 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels ...@@ -296,12 +297,12 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels
int col2im(GpuArray *data_col, const size_t channels, int col2im(GpuArray *data_col, const size_t channels,
const size_t height, const size_t width, const size_t patch_h, const size_t patch_w, const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
const size_t dilation_h, const size_t dilation_w, const size_t dilation_h, const size_t dilation_w,
const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t pad_hl, const size_t pad_hr, const size_t pad_wl, const size_t pad_wr,
const size_t stride_w, GpuArray *data_im, const size_t data_im_offset) { const size_t stride_h, const size_t stride_w, GpuArray *data_im, const size_t data_im_offset) {
size_t dil_patch_h = (patch_h - 1) * dilation_h + 1; size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
size_t dil_patch_w = (patch_w - 1) * dilation_w + 1; size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1; size_t height_col = (height + pad_hl + pad_hr - dil_patch_h) / stride_h + 1;
size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1; size_t width_col = (width + pad_wl + pad_wr - dil_patch_w) / stride_w + 1;
size_t num_kernels = channels * height * width; size_t num_kernels = channels * height * width;
// To avoid involving atomic operations, we will launch one kernel per // To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions. // bottom dimension, and then in the kernel add up the top dimensions.
...@@ -311,7 +312,7 @@ int col2im(GpuArray *data_col, const size_t channels, ...@@ -311,7 +312,7 @@ int col2im(GpuArray *data_col, const size_t channels,
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_col->data, data_col->offset, num_kernels, data_col->data, data_col->offset,
height, width, channels, patch_h, patch_w, height, width, channels, patch_h, patch_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, pad_hl, pad_wl, stride_h, stride_w,
height_col, width_col, data_im->data, data_im->offset, data_im_offset); height_col, width_col, data_im->data, data_im->offset, data_im_offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -323,7 +324,7 @@ int col2im(GpuArray *data_col, const size_t channels, ...@@ -323,7 +324,7 @@ int col2im(GpuArray *data_col, const size_t channels,
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_col->data, data_col->offset, num_kernels, data_col->data, data_col->offset,
height, width, channels, patch_h, patch_w, height, width, channels, patch_h, patch_w,
pad_h, pad_w, stride_h, stride_w, pad_hl, pad_wl, stride_h, stride_w,
height_col, width_col, data_im->data, data_im->offset, data_im_offset); height_col, width_col, data_im->data, data_im->offset, data_im_offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -347,8 +348,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -347,8 +348,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dW = 1, const size_t dW = 1,
const size_t dilH = 1, const size_t dilH = 1,
const size_t dilW = 1, const size_t dilW = 1,
const size_t padH = 0, const size_t padH_l = 0,
const size_t padW = 0, const size_t padH_r = 0,
const size_t padW_l = 0,
const size_t padW_r = 0,
const size_t numgroups = 1, const size_t numgroups = 1,
const size_t unshared = 0) const size_t unshared = 0)
{ {
...@@ -443,8 +446,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -443,8 +446,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dil_kH = (kH - 1) * dilH + 1; const size_t dil_kH = (kH - 1) * dilH + 1;
const size_t dil_kW = (kW - 1) * dilW + 1; const size_t dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth) // top: (batchSize, nFilters, topHeight, topWidth)
const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH); const size_t topHeightNoDH = (bottomHeight + padH_l + padH_r - dil_kH);
const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW); const size_t topWidthNoDW = (bottomWidth + padW_l + padW_r - dil_kW);
// the above values might be negative so we need to use Python-like // the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output. // flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only // note: this macro implements Python's // for negative x only
...@@ -558,7 +561,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -558,7 +561,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
err = im2col(&bottom->ga, n * batch_bottom_stride, err = im2col(&bottom->ga, n * batch_bottom_stride,
nChannels, bottomHeight, nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW, bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, &col->ga); padH_l, padH_r, padW_l, padW_r, dH, dW, &col->ga);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
...@@ -618,7 +621,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -618,7 +621,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
err = im2col(&bottom->ga, n * batch_bottom_stride, err = im2col(&bottom->ga, n * batch_bottom_stride,
nChannels, bottomHeight, nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW, bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, &col->ga); padH_l, padH_r, padW_l, padW_r, dH, dW, &col->ga);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
...@@ -712,7 +715,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -712,7 +715,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
} }
// col2im back to the data // col2im back to the data
err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth, err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
kH, kW, dilH, dilW, padH, padW, kH, kW, dilH, dilW, padH_l, padH_r, padW_l, padW_r,
dH, dW, &bottom->ga, n * batch_bottom_stride); dH, dW, &bottom->ga, n * batch_bottom_stride);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
......
...@@ -12,6 +12,7 @@ from ..type import gpuarray_shared_constructor ...@@ -12,6 +12,7 @@ from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu, ref_cast from .config import mode_with_gpu, mode_without_gpu, ref_cast
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
from theano.tensor.nnet.tests.test_abstract_conv import TestAsymmetricPadding
class TestCorrMM(unittest.TestCase): class TestCorrMM(unittest.TestCase):
...@@ -272,3 +273,10 @@ class TestUnsharedGpuCorr2d(TestUnsharedConv): ...@@ -272,3 +273,10 @@ class TestUnsharedGpuCorr2d(TestUnsharedConv):
conv2d_op = GpuCorrMM conv2d_op = GpuCorrMM
conv2d_gradw_op = GpuCorrMM_gradWeights conv2d_gradw_op = GpuCorrMM_gradWeights
conv2d_gradi_op = GpuCorrMM_gradInputs conv2d_gradi_op = GpuCorrMM_gradInputs
class TestAsymmetricGpu(TestAsymmetricPadding):
    """Bind the asymmetric-padding test suite to the GpuCorrMM op family.

    This subclass defines no test methods of its own; it only sets the
    class attributes below. How ``TestAsymmetricPadding`` consumes them is
    defined in ``theano.tensor.nnet.tests.test_abstract_conv`` and is not
    visible here.
    """

    # Forward op and its two gradient ops (w.r.t. weights and inputs).
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs

    # GPU compilation mode with the 'cudnn' tag excluded.
    # NOTE(review): presumably this keeps cuDNN-based optimizations from
    # replacing the GpuCorrMM ops under test -- confirm.
    mode = mode_with_gpu.excluding('cudnn')
...@@ -195,7 +195,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -195,7 +195,7 @@ class BaseCorrMM(gof.OpenMPOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (9, self.openmp, blas_header_version()) return (10, self.openmp, blas_header_version())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -439,7 +439,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -439,7 +439,7 @@ class BaseCorrMM(gof.OpenMPOp):
break; break;
case 1: // backprop wrt. weights case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width) // output is weights: (num_filters, num_channels, height, width)
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1 // height and width: weights = (bottom + pad_l + pad_r - (top - 1) * sample - 1) / dil + 1
out_dim[0] = (npy_intp)PyArray_DIMS(top)[1]; out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
if (unshared){ if (unshared){
odim = 6; odim = 6;
......
...@@ -11,6 +11,7 @@ import theano.tensor as T ...@@ -11,6 +11,7 @@ import theano.tensor as T
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, conv from theano.tensor.nnet import corr, conv
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
from theano.tensor.nnet.tests.test_abstract_conv import TestAsymmetricPadding
class TestCorr2D(utt.InferShapeTester): class TestCorr2D(utt.InferShapeTester):
...@@ -462,6 +463,16 @@ class TestUnsharedCorr2d(TestUnsharedConv): ...@@ -462,6 +463,16 @@ class TestUnsharedCorr2d(TestUnsharedConv):
conv2d_gradi_op = corr.CorrMM_gradInputs conv2d_gradi_op = corr.CorrMM_gradInputs
class TestAsymmetricCorr(TestAsymmetricPadding):
    """Bind the asymmetric-padding test suite to the CPU CorrMM op family.

    This subclass defines no test methods of its own; it only sets the
    compilation mode and the op classes as class attributes. How
    ``TestAsymmetricPadding`` consumes them is defined in
    ``theano.tensor.nnet.tests.test_abstract_conv`` and is not visible here.
    """

    # When the configured mode is FAST_COMPILE, use FAST_RUN with the
    # 'gpuarray' tag excluded; otherwise leave the mode as None.
    mode = (theano.compile.get_mode("FAST_RUN").excluding('gpuarray')
            if theano.config.mode == "FAST_COMPILE" else None)

    # Forward op and its two gradient ops (w.r.t. weights and inputs).
    conv2d_op = corr.CorrMM
    conv2d_gradw_op = corr.CorrMM_gradWeights
    conv2d_gradi_op = corr.CorrMM_gradInputs
if __name__ == '__main__': if __name__ == '__main__':
t = TestCorr2D('setUp') t = TestCorr2D('setUp')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论