提交 2dcf3753 authored 作者: Nizar Assaf's avatar Nizar Assaf

Implementation of 2D dilated convolution/correlation.

上级 c0b294ec
......@@ -1393,6 +1393,9 @@ def local_abstractconv_cudnn(node):
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (node.op.filter_dilation != (1, 1)):
return None
if not isinstance(inp1.type, GpuArrayType):
return None
......
......@@ -855,15 +855,17 @@ class BaseGpuCorrMM(GpuOp):
or a pair of integers
subsample
Perform subsampling of the output (default: (1, 1)).
filter_dilation
Perform dilation of the filters, i.e. correlate with filters that have
`dilation - 1` implicit zeros inserted between their elements (default: (1, 1)).
pad
*deprecated*, now you should always use border_mode.
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample')
__props__ = ('border_mode', 'subsample', 'filter_dilation')
def __init__(self, border_mode="valid", subsample=(1, 1), pad=(0, 0)):
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), pad=(0, 0)):
if pad != (0, 0):
_logger.warning(
'do not use pad for BaseGpuCorrMM; please set padding in '
......@@ -885,7 +887,10 @@ class BaseGpuCorrMM(GpuOp):
self.border_mode = border_mode
if len(subsample) != 2:
raise ValueError("subsample must have two elements")
self.subsample = subsample
if len(filter_dilation) != 2:
raise ValueError("filter_dilation must have two elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
@property
def pad(self):
......@@ -894,10 +899,11 @@ class BaseGpuCorrMM(GpuOp):
return (0, 0)
def __str__(self):
return '%s{%s, %s}' % (
return '%s{%s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample))
str(self.subsample),
str(self.filter_dilation))
def flops(self, inp, outp):
"""
......@@ -922,7 +928,7 @@ class BaseGpuCorrMM(GpuOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 24)
return (0, 26)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -976,6 +982,7 @@ class BaseGpuCorrMM(GpuOp):
"""
dH, dW = self.subsample
dilH, dilW = self.filter_dilation
if self.border_mode == "half":
padH = padW = -1
elif self.border_mode == "full":
......@@ -1022,6 +1029,8 @@ class BaseGpuCorrMM(GpuOp):
// Optional args
int dH = %(dH)s;
int dW = %(dW)s;
int dilH = %(dilH)s;
int dilW = %(dilW)s;
int padH = %(padH)s;
int padW = %(padW)s;
......@@ -1045,39 +1054,43 @@ class BaseGpuCorrMM(GpuOp):
}
else if (padH == -2) {
// vertical full padding, we can infer the kernel height
kH = 2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kH = (2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1;
}
else {
// explicit padding, we can infer the kernel height
kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ;
}
if ((dW != 1) || (padW == -1)) {
kW = %(width)s;
}
else if (padW == -2) {
kW = 2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
kW = (2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else {
kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
}
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
padH = kH / 2;
padH = dil_kH / 2;
}
else if (padH == -2) { // vertical full padding
padH = kH - 1;
padH = dil_kH - 1;
}
else if (padH < 0) {
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
%(fail)s
}
if (padW == -1) { // horizontal half padding
padW = kW / 2;
padW = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = kW - 1;
padW = dil_kW - 1;
}
else if (padW < 0) {
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
......@@ -1089,15 +1102,15 @@ class BaseGpuCorrMM(GpuOp):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - weight) / sample + 1
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = bottom + 2*pad - (top - 1) * sample
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = CudaNdarray_HOST_DIMS(top)[1];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
out_dim[2] = kH; // already inferred further above
......@@ -1105,11 +1118,11 @@ class BaseGpuCorrMM(GpuOp):
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + weights - 2*pad
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
......@@ -1137,7 +1150,7 @@ class BaseGpuCorrMM(GpuOp):
}
// Call CUDA code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, padH, padW);
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
if (out2==NULL){
%(fail)s
}
......@@ -1168,6 +1181,10 @@ class GpuCorrMM(BaseGpuCorrMM):
`(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`,
but faster.
Set to `(1, 1)` to disable subsampling.
filter_dilation
The dilation factors applied to the filters along each spatial dimension.
Should be a tuple with 2 elements.
Set to `(1, 1)` to disable filter dilation.
pad
Deprecated alias for `border_mode`.
......@@ -1198,8 +1215,10 @@ class GpuCorrMM(BaseGpuCorrMM):
"""
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
pad=(0, 0)):
super(GpuCorrMM, self).__init__(border_mode, subsample, pad)
super(GpuCorrMM, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, img, kern):
img = as_cuda_ndarray_variable(img)
......@@ -1223,9 +1242,13 @@ class GpuCorrMM(BaseGpuCorrMM):
bottom, weights = inp
top, = grads
top = gpu_contiguous(top)
d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample)(
d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(
weights, top, bottom.shape[-2:])
d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample)(
d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation)(
bottom, top, weights.shape[-2:])
return d_bottom, d_weights
......@@ -1243,8 +1266,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
pad=(0, 0)):
super(GpuCorrMM_gradWeights, self).__init__(border_mode, subsample, pad)
super(GpuCorrMM_gradWeights, self).__init__(border_mode,
subsample,
filter_dilation,
pad)
def make_node(self, img, topgrad, shape=None):
img = as_cuda_ndarray_variable(img)
......@@ -1278,12 +1305,13 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
bottom, top = inp[:2]
weights, = grads
weights = gpu_contiguous(weights)
d_bottom = GpuCorrMM_gradInputs(
self.border_mode, self.subsample)(weights,
d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(weights,
top,
bottom.shape[-2:])
d_top = GpuCorrMM(
self.border_mode, self.subsample)(bottom, weights)
self.border_mode, self.subsample, self.filter_dilation)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
......@@ -1309,8 +1337,10 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
pad=(0, 0)):
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, pad)
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, kern, topgrad, shape=None):
kern = as_cuda_ndarray_variable(kern)
......@@ -1341,11 +1371,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
weights, top = inp[:2]
bottom, = grads
bottom = gpu_contiguous(bottom)
d_weights = GpuCorrMM_gradWeights(
self.border_mode, self.subsample)(
bottom, top, weights.shape[-2:])
d_top = GpuCorrMM(
self.border_mode, self.subsample)(bottom, weights)
d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation)(bottom,
top,
weights.shape[-2:])
d_top = GpuCorrMM(self.border_mode,
self.subsample,
self.filter_dilation)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
......@@ -1871,12 +1904,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
weights, top = inp[:2]
bottom, = grads
bottom = gpu_contiguous(bottom)
d_weights = GpuCorr3dMM_gradWeights(
self.border_mode, self.subsample, self.pad)(
bottom, top, weights.shape[-3:])
d_top = GpuCorr3dMM(
self.border_mode, self.subsample, self.pad)(
bottom, weights)
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode,
self.subsample,
self.pad)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else ()
return (d_weights, d_top) + d_height_width_depth
......
......@@ -52,6 +52,39 @@ inline int GET_BLOCKS(const int N) {
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
__global__ void dilated_im2col_kernel(const int n, const float* data_im,
const int height, const int width, const int kernel_h, const int kernel_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int height_col, const int width_col,
float* data_col) {
CUDA_KERNEL_LOOP(index, n) {
const int h_index = index / width_col;
const int h_col = h_index % height_col;
const int w_col = index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
float* data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const float* data_im_ptr = data_im;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
int h_im = h_offset + i * dilation_h;
int w_im = w_offset + j * dilation_w;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
data_col_ptr += height_col * width_col;
}
}
}
}
__global__ void im2col_kernel(const int n, const float* data_im,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
......@@ -59,22 +92,23 @@ __global__ void im2col_kernel(const int n, const float* data_im,
const int height_col, const int width_col,
float* data_col) {
CUDA_KERNEL_LOOP(index, n) {
int w_out = index % width_col;
int h_index = index / width_col;
int h_out = h_index % height_col;
int channel_in = h_index / height_col;
int channel_out = channel_in * kernel_h * kernel_w;
int h_in = h_out * stride_h - pad_h;
int w_in = w_out * stride_w - pad_w;
const int h_index = index / width_col;
const int h_col = h_index % height_col;
const int w_col = index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
float* data_col_ptr = data_col;
data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const float* data_im_ptr = data_im;
data_im_ptr += (channel_in * height + h_in) * width + w_in;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
int h = h_in + i;
int w = w_in + j;
*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
int h_im = h_offset + i ;
int w_im = w_offset + j ;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
data_im_ptr[i * width + j] : 0;
data_col_ptr += height_col * width_col;
}
......@@ -84,52 +118,97 @@ __global__ void im2col_kernel(const int n, const float* data_im,
void im2col(const float* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
float* data_col) {
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int num_kernels = channels * height_col * width_col;
if(dilation_h != 1 || dilation_w != 1){
dilated_im2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(
num_kernels, data_im, height, width, kernel_h, kernel_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col);
}
else{
im2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(
num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
pad_w, stride_h, stride_w, height_col,
num_kernels, data_im, height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col);
}
}
__global__ void col2im_kernel(const int n, const float* data_col,
// CUDA kernel for the case of dilation
__global__ void dilated_col2im_kernel(const int n, const float* data_col,
const int height, const int width, const int channels,
const int patch_h, const int patch_w,
const int kernel_h, const int kernel_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int height_col, const int width_col,
float* data_im) {
CUDA_KERNEL_LOOP(index, n) {
float val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
const int w_im = index % width + pad_w;
const int h_im = (index / width) % height + pad_h;
const int c_im = index / (width * height);
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
// compute the start and end of the output
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col);
/*
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out]
int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * ksize
+ (w - w_col * stride_w);
val += data_col[(c_col * height_col + h_col) * width_col + w_col];
const int w_col_start =
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start =
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
height_col + h_col) * width_col + w_col;
val += data_col[data_col_index];
}
}
*/
// equivalent implementation
}
data_im[index] = val;
}
}
__global__ void col2im_kernel(const int n, const float* data_col,
const int height, const int width, const int channels,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int height_col, const int width_col,
float* data_im) {
CUDA_KERNEL_LOOP(index, n) {
float val = 0;
const int w_im = index % width + pad_w;
const int h_im = (index / width) % height + pad_h;
const int c_im = index / (width * height);
// compute the start and end of the output
const int w_col_start =
(w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start =
(h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// equivalent implementation, no dilation
int offset =
(c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
(c_im * kernel_h * kernel_w + h_im * kernel_w + w_im) * height_col * width_col;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
......@@ -142,18 +221,30 @@ __global__ void col2im_kernel(const int n, const float* data_col,
void col2im(const float* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, float* data_im) {
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
int dil_patch_h = (patch_h - 1) * dilation_h + 1;
int dil_patch_w = (patch_w - 1) * dilation_w + 1;
int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
int num_kernels = channels * height * width;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
if(dilation_h != 1 || dilation_w != 1){
dilated_col2im_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(
num_kernels, data_col, height, width, channels, patch_h, patch_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im);
}
else{
col2im_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(
num_kernels, data_col, height, width, channels, patch_h, patch_w,
pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im);
}
}
......@@ -167,6 +258,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
const int direction,
const int dH = 1,
const int dW = 1,
const int dilH = 1,
const int dilW = 1,
const int padH = 0,
const int padW = 0)
{
......@@ -236,9 +329,12 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
"GpuCorrMM images and kernel must have the same stack size\n");
return NULL;
}
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth)
const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - kW) / dW + 1;
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
......@@ -286,7 +382,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, col->devdata);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
......@@ -353,7 +450,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, col->devdata);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
......@@ -438,7 +536,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
}
// col2im back to the data
col2im(col->devdata, nChannels, bottomHeight, bottomWidth,
kH, kW, padH, padW, dH, dW, bottom->devdata + n * bottom_stride);
kH, kW, dilH, dilW, padH, padW,
dH, dW, bottom->devdata + n * bottom_stride);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
......
......@@ -2612,6 +2612,8 @@ def local_abstractconv_cudnn(node):
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))):
return None
if (node.op.filter_dilation != (1, 1)):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
......@@ -2619,6 +2621,7 @@ def local_abstractconv_cudnn(node):
if (not isinstance(inp1.type, CudaNdarrayType) or
not isinstance(inp2.type, CudaNdarrayType)):
return None
if not dnn_available():
return None
......
......@@ -1622,7 +1622,8 @@ def local_conv_gemm(node):
# because we are not allowed to replace a CudaNdarray with
# a DimShuffle instance in a graph optimization)
rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
GpuCorrMM_gradWeights(border_mode, subsample)(
GpuCorrMM_gradWeights(border_mode,
subsample)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
).dimshuffle(1, 0, 2, 3))
......@@ -2769,20 +2770,25 @@ def local_abstractconv_gemm(node):
border_mode = node.op.border_mode
subsample = node.op.subsample
if (border_mode == 'full') and (subsample == (1, 1)):
filter_dilation = node.op.filter_dilation
if ((border_mode == 'full') and (subsample == (1, 1))):
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid', subsample)(
rval = GpuCorrMM_gradInputs('valid',
subsample,
filter_dilation)(
gpu_contiguous(kern), gpu_contiguous(img))
else:
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
# By default use GpuCorrMM
rval = GpuCorrMM(border_mode, subsample)(gpu_contiguous(img),
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation)(gpu_contiguous(img),
gpu_contiguous(kern))
# call GpuCorrMM_gradWeights if good
......@@ -2790,7 +2796,7 @@ def local_abstractconv_gemm(node):
# is larger than inputChannels * outputHeight * outputWidth.
# GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.)
if ((subsample == (1, 1)) and
if ((subsample == (1, 1)) and (filter_dilation == (1, 1)) and
(node.op.imshp is not None) and
(None not in node.op.imshp[-2:]) and
(node.op.kshp is not None) and
......@@ -2810,7 +2816,9 @@ def local_abstractconv_gemm(node):
# because we are not allowed to replace a CudaNdarray with
# a DimShuffle instance in a graph optimization)
rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
GpuCorrMM_gradWeights(border_mode, subsample)(
GpuCorrMM_gradWeights(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
).dimshuffle(1, 0, 2, 3))
......@@ -2827,7 +2835,8 @@ def local_abstractconv_gradweight_gemm(node):
return None
rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample)(
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape)
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1]
......@@ -2849,7 +2858,8 @@ def local_abstractconv_gradinputs_gemm(node):
kern = kern[:, :, ::-1, ::-1]
rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample)(
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval]
......@@ -2870,10 +2880,12 @@ conv_groupopt.register('local_abstractconv_dnn',
conv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradweight_gemm',
local_abstractconv_gradweight_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradinputs_gemm',
local_abstractconv_gradinputs_gemm, 30,
'conv_gemm',
......
......@@ -29,25 +29,30 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
self.provide_shape = [False]
self.shared = gpu_shared
def tcase(self, i, f, s, b, flip, provide_shape):
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
if fd != (1, 1):
raise SkipTest("No dilation implementation for cuDNN ConvOp.")
if not dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
mode = mode_with_gpu
o = self.get_output_shape(i, f, s, b)
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConv)
filter_flip=flip, target_op=GpuDnnConv,
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradW)
filter_flip=flip, target_op=GpuDnnConvGradW,
filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI)
filter_flip=flip, target_op=GpuDnnConvGradI,
filter_dilation=fd)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
......@@ -56,28 +61,30 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
self.shared = gpu_shared
self.mode = mode_with_gpu.excluding('cudnn')
def tcase(self, i, f, s, b, flip, provide_shape):
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
mode = self.mode
o = self.get_output_shape(i, f, s, b)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode,
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=(GpuCorrMM,
filter_flip=flip, target_op=(GpuCorrMM,
GpuCorrMM_gradWeights,
GpuCorrMM_gradInputs))
GpuCorrMM_gradInputs),
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorrMM_gradWeights)
target_op=GpuCorrMM_gradWeights,
filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorrMM_gradInputs)
target_op=GpuCorrMM_gradInputs,
filter_dilation=fd)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
......
......@@ -35,7 +35,7 @@ from .abstract_conv import conv2d as abstract_conv2d
def conv2d(input, filters, input_shape=None, filter_shape=None,
border_mode='valid', subsample=(1, 1), filter_flip=True,
image_shape=None, **kwargs):
image_shape=None, filter_dilation=(1, 1), **kwargs):
"""
This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled
......@@ -95,6 +95,10 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
image_shape: None, tuple/list of len 4 of int or Constant variable
Deprecated alias for input_shape.
filter_dilation: tuple of len 2
Factor by which to dilate the filters: `dilation - 1` implicit
zeros are inserted between adjacent filter elements.
Set to (1, 1) for a standard (non-dilated) convolution.
kwargs: Any other keyword arguments are accepted for backwards
compatibility, but will be ignored.
......@@ -140,4 +144,5 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
" be provided at the same time.")
return abstract_conv2d(input, filters, input_shape, filter_shape,
border_mode, subsample, filter_flip)
border_mode, subsample, filter_flip,
filter_dilation)
......@@ -32,7 +32,8 @@ _logger = logging.getLogger("theano.tensor.nnet.abstract_conv")
def get_conv_output_shape(image_shape, kernel_shape,
border_mode, subsample):
border_mode, subsample,
filter_dilation=(1, 1)):
"""
This function compute the output shape of convolution operation.
......@@ -53,6 +54,8 @@ def get_conv_output_shape(image_shape, kernel_shape,
subsample: tuple of int (symbolic or numeric). Its two or three elements
respectively correspond to the subsampling on the height and width (and
possibly depth) axes.
filter_dilation: tuple of int (symbolic or numeric). Its two elements
correspond respectively to the dilation on height and width axis.
Returns
-------
......@@ -65,17 +68,19 @@ def get_conv_output_shape(image_shape, kernel_shape,
nkern, kshp = kernel_shape[0], kernel_shape[2:]
if isinstance(border_mode, tuple):
out_shp = tuple(get_conv_shape_1axis(
imshp[i], kshp[i], border_mode[i], subsample[i])
for i in range(len(subsample)))
imshp[i], kshp[i], border_mode[i],
subsample[i], filter_dilation[i]) for i in range(len(subsample)))
else:
out_shp = tuple(get_conv_shape_1axis(
imshp[i], kshp[i], border_mode, subsample[i])
for i in range(len(subsample)))
imshp[i], kshp[i], border_mode,
subsample[i], filter_dilation[i]) for i in range(len(subsample)))
return (bsize, nkern) + out_shp
def get_conv_shape_1axis(image_shape, kernel_shape,
border_mode, subsample):
# filter dilation set by default to 1
# for compatibility with other tests.
def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
subsample, dilation=1):
"""
This function compute the output shape of convolution operation.
......@@ -90,6 +95,8 @@ def get_conv_shape_1axis(image_shape, kernel_shape,
the padding on the considered axis.
subsample: int. It must correspond to the subsampling on the
considered axis.
dilation: int. It must correspond to the dilation on the
considered axis.
Returns
-------
......@@ -97,19 +104,22 @@ def get_conv_shape_1axis(image_shape, kernel_shape,
considered axis. None if undefined.
"""
if None in [image_shape, kernel_shape, border_mode, subsample]:
if None in [image_shape, kernel_shape, border_mode,
subsample, dilation]:
return None
# Implicit dilated kernel shape
dil_kernel_shape = (kernel_shape - 1) * dilation + 1
if border_mode == "half":
pad = kernel_shape // 2
pad = dil_kernel_shape // 2
elif border_mode == "full":
pad = kernel_shape - 1
pad = dil_kernel_shape - 1
elif border_mode == "valid":
pad = 0
else:
pad = border_mode
if pad < 0:
raise ValueError("border_mode must be >= 0")
out_shp = (image_shape + 2 * pad - kernel_shape) // subsample + 1
out_shp = (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
return out_shp
......@@ -120,7 +130,8 @@ def conv2d(input,
filter_shape=None,
border_mode='valid',
subsample=(1, 1),
filter_flip=True):
filter_flip=True,
filter_dilation=(1, 1)):
"""This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled
after Convolutional Neural Networks (CNN).
......@@ -134,7 +145,8 @@ def conv2d(input,
kshp=filter_shape,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip)
filter_flip=filter_flip,
filter_dilation=filter_dilation)
return conv_op(input, filters)
......@@ -144,7 +156,8 @@ def conv2d_grad_wrt_inputs(output_grad,
filter_shape=None,
border_mode='valid',
subsample=(1, 1),
filter_flip=True):
filter_flip=True,
filter_dilation=(1, 1)):
"""Compute conv output gradient w.r.t its inputs
This function builds the symbolic graph for getting the
......@@ -214,6 +227,9 @@ def conv2d_grad_wrt_inputs(output_grad,
referred to as a convolution, and this is the default. If
``False``, the filters are not flipped and the operation is
referred to as a cross-correlation.
filter_dilation : tuple of len 2
The filter dilation used in the forward pass.
Also known as input striding.
Returns
-------
......@@ -263,7 +279,8 @@ def conv2d_grad_wrt_inputs(output_grad,
kshp=filter_shape,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip)
filter_flip=filter_flip,
filter_dilation=filter_dilation)
return grad_input_op(filters, output_grad, input_shape[-2:])
......@@ -274,7 +291,8 @@ def conv2d_grad_wrt_weights(input,
input_shape=None,
border_mode='valid',
subsample=(1, 1),
filter_flip=True):
filter_flip=True,
filter_dilation=(1, 1)):
"""Compute conv output gradient w.r.t its weights
This function will build the symbolic graph for getting the
......@@ -327,7 +345,6 @@ def conv2d_grad_wrt_weights(input,
``(int1, int2)``
pad input with a symmetric border of ``int1`` rows and
``int2`` columns, then perform a valid convolution.
subsample : tuple of len 2
The subsampling used in the forward pass of the convolutional
operation. Also called strides elsewhere.
......@@ -337,6 +354,9 @@ def conv2d_grad_wrt_weights(input,
referred to as a convolution, and this is the default. If
``False``, the filters are not flipped and the operation is
referred to as a cross-correlation.
filter_dilation : tuple of len 2
The filter dilation used in the forward pass.
Also known as input striding.
Returns
-------
......@@ -386,7 +406,8 @@ def conv2d_grad_wrt_weights(input,
kshp=numerical_filter_shape,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip)
filter_flip=filter_flip,
filter_dilation=filter_dilation)
return gradWeight_op(input, output_grad, filter_shape[:-2])
......@@ -541,7 +562,8 @@ def bilinear_upsampling(input,
filter_shape=(1, 1, None, 1),
border_mode=(pad, 0),
subsample=(ratio, 1),
filter_flip=True)
filter_flip=True,
filter_dilation=(1, 1))
# upsampling cols
upsampled_mat = conv2d_grad_wrt_inputs(output_grad=upsampled_row,
filters=kern[np.newaxis,
......@@ -553,7 +575,8 @@ def bilinear_upsampling(input,
filter_shape=(1, 1, 1, None),
border_mode=(0, pad),
subsample=(1, ratio),
filter_flip=True)
filter_flip=True,
filter_dilation=(1, 1))
else:
kern = bilinear_kernel_2D(ratio=ratio, normalize=True)
upsampled_mat = conv2d_grad_wrt_inputs(output_grad=concat_mat,
......@@ -565,7 +588,8 @@ def bilinear_upsampling(input,
filter_shape=(1, 1, None, None),
border_mode=(pad, pad),
subsample=(ratio, ratio),
filter_flip=True)
filter_flip=True,
filter_dilation=(1, 1))
return upsampled_mat.reshape((input.shape[0], input.shape[1],
row * ratio, col * ratio))
......@@ -620,14 +644,18 @@ class BaseAbstractConv2d(Op):
are not flipped and the operation is referred to as a
cross-correlation.
filter_dilation: tuple of len 2
    Factor by which to dilate the filters, i.e. the spacing inserted
    between filter elements. Also called the dilation factor.
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_flip', 'imshp', 'kshp')
__props__ = ('border_mode', 'subsample', 'filter_flip',
'imshp', 'kshp', 'filter_dilation')
def __init__(self,
imshp=None, kshp=None,
border_mode="valid", subsample=(1, 1),
filter_flip=True):
imshp=None, kshp=None, border_mode="valid",
subsample=(1, 1), filter_flip=True,
filter_dilation=(1, 1)):
if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode)
......@@ -673,6 +701,9 @@ class BaseAbstractConv2d(Op):
if len(subsample) != 2:
raise ValueError("subsample must have two elements")
self.subsample = tuple(subsample)
if len(filter_dilation) != 2:
raise ValueError("filter_dilation must have two elements")
self.filter_dilation = tuple(filter_dilation)
def flops(self, inp, outp):
""" Useful with the hack in profilemode to print the MFlops"""
......@@ -694,7 +725,7 @@ class BaseAbstractConv2d(Op):
# This may change in the future.
return False
def conv2d(self, img, kern, mode="valid"):
def conv2d(self, img, kern, mode="valid", dilation=(1, 1)):
"""
Basic slow python implementatation for DebugMode
"""
......@@ -708,8 +739,16 @@ class BaseAbstractConv2d(Op):
'invalid mode {}, which must be either '
'"valid" or "full"'.format(mode))
out_shape = get_conv_output_shape(img.shape, kern.shape, mode, [1, 1])
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, [1, 1], dilation)
out = numpy.zeros(out_shape, dtype=img.dtype)
dil_kern_shp = kern.shape[:-2] + ((kern.shape[-2] - 1) * dilation[0] + 1,
(kern.shape[-1] - 1) * dilation[1] + 1)
dilated_kern = numpy.zeros(dil_kern_shp, dtype=kern.dtype)
dilated_kern[:, :,
::dilation[0],
::dilation[1]] = kern
val = _valfrommode(mode)
bval = _bvalfromboundary('fill')
......@@ -720,7 +759,7 @@ class BaseAbstractConv2d(Op):
for im0 in xrange(img.shape[1]):
# some cast generates a warning here
out[b, n, ...] += _convolve2d(img[b, im0, ...],
kern[n, im0, ...],
dilated_kern[n, im0, ...],
1, val, bval, 0)
return out
......@@ -736,10 +775,11 @@ class AbstractConv2d(BaseAbstractConv2d):
kshp=None,
border_mode="valid",
subsample=(1, 1),
filter_flip=True):
super(AbstractConv2d, self).__init__(imshp, kshp,
border_mode, subsample,
filter_flip)
filter_flip=True,
filter_dilation=(1, 1)):
super(AbstractConv2d, self).__init__(imshp, kshp, border_mode,
subsample, filter_flip,
filter_dilation)
def make_node(self, img, kern):
# Make sure both inputs are Variables with the same Type
......@@ -766,6 +806,8 @@ class AbstractConv2d(BaseAbstractConv2d):
img, kern = inp
img = numpy.asarray(img)
kern = numpy.asarray(kern)
dil_kernshp = ((kern.shape[2] - 1) * self.filter_dilation[0] + 1,
(kern.shape[3] - 1) * self.filter_dilation[1] + 1)
o, = out_
mode = self.border_mode
......@@ -777,9 +819,9 @@ class AbstractConv2d(BaseAbstractConv2d):
' integers'.format(mode))
if mode == "full":
mode = (kern.shape[2] - 1, kern.shape[3] - 1)
mode = (dil_kernshp[0] - 1, dil_kernshp[1] - 1)
elif mode == "half":
mode = (kern.shape[2] // 2, kern.shape[3] // 2)
mode = (dil_kernshp[0] // 2, dil_kernshp[1] // 2)
if isinstance(mode, tuple):
pad_h, pad_w = map(int, mode)
mode = "valid"
......@@ -790,7 +832,7 @@ class AbstractConv2d(BaseAbstractConv2d):
img = new_img
if not self.filter_flip:
kern = kern[:, :, ::-1, ::-1]
conv_out = self.conv2d(img, kern, mode="valid")
conv_out = self.conv2d(img, kern, mode="valid", dilation=self.filter_dilation)
conv_out = conv_out[:, :, ::self.subsample[0], ::self.subsample[1]]
o[0] = node.outputs[0].type.filter(conv_out)
......@@ -812,12 +854,14 @@ class AbstractConv2d(BaseAbstractConv2d):
d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip)(
self.filter_flip,
self.filter_dilation)(
weights, top, bottom.shape[-2:])
d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip)(
self.filter_flip,
self.filter_dilation)(
bottom, top, weights.shape[-2:])
......@@ -844,7 +888,7 @@ class AbstractConv2d(BaseAbstractConv2d):
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(4)]
res = get_conv_output_shape(imshp, kshp, self.border_mode,
self.subsample)
self.subsample, self.filter_dilation)
return [res]
......@@ -863,11 +907,13 @@ class AbstractConv2d_gradWeights(BaseAbstractConv2d):
kshp=None,
border_mode="valid",
subsample=(1, 1),
filter_flip=True):
filter_flip=True,
filter_dilation=(1, 1)):
super(AbstractConv2d_gradWeights, self).__init__(imshp, kshp,
border_mode,
subsample,
filter_flip)
filter_flip,
filter_dilation)
# Update shape/height_width
def make_node(self, img, topgrad, shape):
......@@ -943,15 +989,16 @@ class AbstractConv2d_gradWeights(BaseAbstractConv2d):
d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip)(
weights,
self.filter_flip,
self.filter_dilation)(weights,
top,
bottom.shape[-2:])
d_top = AbstractConv2d(self.imshp,
self.kshp,
self.border_mode,
self.subsample,
self.filter_flip)(bottom, weights)
self.filter_flip,
self.filter_dilation)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
......@@ -998,11 +1045,13 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
kshp=None,
border_mode="valid",
subsample=(1, 1),
filter_flip=True):
filter_flip=True,
filter_dilation=(1, 1)):
super(AbstractConv2d_gradInputs, self).__init__(imshp, kshp,
border_mode,
subsample,
filter_flip)
filter_flip,
filter_dilation)
# Update shape/height_width
def make_node(self, kern, topgrad, shape):
......@@ -1070,12 +1119,15 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
bottom, = grads
d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
self.border_mode,
self.subsample)(
bottom, top,
self.subsample,
self.filter_flip,
self.filter_dilation)(bottom, top,
weights.shape[-2:])
d_top = AbstractConv2d(self.imshp, self.kshp,
self.border_mode, self.subsample)(
bottom, weights)
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
......
......@@ -27,12 +27,14 @@ class BaseCorrMM(gof.Op):
or a pair of integers
subsample
Perform subsampling of the output (default: (1, 1)).
filter_dilation
    Perform dilated correlation (default: (1, 1)).
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample')
__props__ = ('border_mode', 'subsample', 'filter_dilation')
def __init__(self, border_mode="valid", subsample=(1, 1)):
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1)):
if isinstance(border_mode, integer_types):
if border_mode < 0:
raise ValueError(
......@@ -55,7 +57,10 @@ class BaseCorrMM(gof.Op):
self.border_mode = border_mode
if len(subsample) != 2:
raise ValueError("subsample must have two elements")
if len(filter_dilation) != 2:
raise ValueError("filter_dilation must have two elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
@property
def pad(self):
......@@ -64,10 +69,11 @@ class BaseCorrMM(gof.Op):
return (0, 0)
def __str__(self):
return '%s{%s, %s}' % (
return '%s{%s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample))
str(self.subsample),
str(self.filter_dilation))
def c_support_code(self):
return blas_header_text()
......@@ -89,7 +95,7 @@ class BaseCorrMM(gof.Op):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (1, 1)
return (1, 2)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -155,6 +161,7 @@ class BaseCorrMM(gof.Op):
if not theano.config.blas.ldflags:
raise NotImplementedError("C code for CorrMM* classes need a blas library.")
dH, dW = self.subsample
dilH, dilW = self.filter_dilation
if self.border_mode == "half":
padH = padW = -1
elif self.border_mode == "full":
......@@ -201,6 +208,8 @@ class BaseCorrMM(gof.Op):
// Optional args
int dH = %(dH)s;
int dW = %(dW)s;
int dilH = %(dilH)s;
int dilW = %(dilW)s;
int padH = %(padH)s;
int padW = %(padW)s;
......@@ -224,39 +233,43 @@ class BaseCorrMM(gof.Op):
}
else if (padH == -2) {
// vertical full padding, we can infer the kernel height
kH = 2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH;
kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
}
else {
// explicit padding, we can infer the kernel height
kH = PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH;
kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
}
if ((dW != 1) || (padW == -1)) {
kW = %(width)s;
}
else if (padW == -2) {
kW = 2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW;
kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else {
kW = PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW;
kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
}
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
padH = kH / 2;
padH = dil_kH / 2;
}
else if (padH == -2) { // vertical full padding
padH = kH - 1;
padH = dil_kH - 1;
}
else if (padH < 0) {
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH must be >= -2");
%(fail)s
}
if (padW == -1) { // horizontal half padding
padW = kW / 2;
padW = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = kW - 1;
padW = dil_kW - 1;
}
else if (padW < 0) {
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW must be >= -2");
......@@ -268,15 +281,15 @@ class BaseCorrMM(gof.Op):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - weight) / sample + 1
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - PyArray_DIMS(weights)[2]) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - PyArray_DIMS(weights)[3]) / dW + 1);
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = bottom + 2*pad - (top - 1) * sample
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1];
out_dim[2] = (npy_intp)kH; // already inferred further above
......@@ -284,11 +297,11 @@ class BaseCorrMM(gof.Op):
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + weights - 2*pad
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + PyArray_DIMS(weights)[2] - 2*padH);
out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + PyArray_DIMS(weights)[3] - 2*padW);
out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n");
......@@ -326,7 +339,7 @@ class BaseCorrMM(gof.Op):
}
// Call corrMM code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, padH, padW);
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
if (out2==NULL){
%(fail)s
}
......@@ -357,10 +370,15 @@ class CorrMM(BaseCorrMM):
`(sv, sh)` is equivalent to `CorrMM(...)(...)[:,:,::sv, ::sh]`,
but faster.
Set to `(1, 1)` to disable subsampling.
filter_dilation
    The dilation factor applied to each filter, not to the input images.
    Should be a tuple with 2 elements.
    Set to `(1, 1)` to disable filter dilation.
"""
def __init__(self, border_mode="valid", subsample=(1, 1)):
super(CorrMM, self).__init__(border_mode, subsample)
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1)):
super(CorrMM, self).__init__(border_mode, subsample, filter_dilation)
def make_node(self, img, kern):
img = as_tensor_variable(img)
......@@ -382,7 +400,8 @@ class CorrMM(BaseCorrMM):
imshp,
kshp,
self.border_mode,
self.subsample)
self.subsample,
self.filter_dilation)
return [res]
def c_code(self, node, nodename, inp, out_, sub):
......@@ -395,10 +414,12 @@ class CorrMM(BaseCorrMM):
bottom, weights = inp
top, = grads
d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample)(weights, top,
self.subsample,
self.filter_dilation)(weights, top,
bottom.shape[-2:])
d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample)(bottom, top,
self.subsample,
self.filter_dilation)(bottom, top,
weights.shape[-2:])
return d_bottom, d_weights
......@@ -415,8 +436,11 @@ class CorrMM_gradWeights(BaseCorrMM):
"""
def __init__(self, border_mode="valid", subsample=(1, 1)):
super(CorrMM_gradWeights, self).__init__(border_mode, subsample)
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1)):
super(CorrMM_gradWeights, self).__init__(border_mode,
subsample,
filter_dilation)
def make_node(self, img, topgrad, shape=None):
img = as_tensor_variable(img)
......@@ -485,10 +509,12 @@ class CorrMM_gradWeights(BaseCorrMM):
bottom, top = inp[:2]
weights, = grads
d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample)(weights, top,
self.subsample,
self.filter_dilation)(weights, top,
bottom.shape[-2:])
d_top = CorrMM(self.border_mode,
self.subsample)(bottom, weights)
self.subsample,
self.filter_dilation)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
if len(inp) == 4 else ())
return (d_bottom, d_top) + d_height_width
......@@ -512,8 +538,10 @@ class CorrMM_gradInputs(BaseCorrMM):
"""
def __init__(self, border_mode="valid", subsample=(1, 1)):
super(CorrMM_gradInputs, self).__init__(border_mode, subsample)
def __init__(self, border_mode="valid", subsample=(1, 1), filter_dilation=(1, 1)):
super(CorrMM_gradInputs, self).__init__(border_mode,
subsample,
filter_dilation)
def make_node(self, kern, topgrad, shape=None):
kern = as_tensor_variable(kern)
......@@ -586,11 +614,13 @@ class CorrMM_gradInputs(BaseCorrMM):
weights, top = inp[:2]
bottom, = grads
d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample)(bottom,
self.subsample,
self.filter_dilation)(bottom,
top,
weights.shape[-2:])
d_top = CorrMM(self.border_mode,
self.subsample)(bottom, weights)
self.subsample,
self.filter_dilation)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) *
2 if len(inp) == 4 else ())
return (d_weights, d_top) + d_height_width
......
......@@ -31,20 +31,24 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Loops for fast unfold + copy
void im2col(const %(float_type)s* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
%(float_type)s* data_col) {
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
// Implicit dilated kernel size
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
for (int c = 0; c < channels_col; ++c) {
int w_offset = c %% kernel_w;
int h_offset = (c / kernel_w) %% kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h) {
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
for (int w = 0; w < width_col; ++w) {
int h_pad = h * stride_h - pad_h + h_offset;
int w_pad = w * stride_w - pad_w + w_offset;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(npy_intp)(c * height_col + h) * width_col + w] =
data_im[(npy_intp)(c_im * height + h_pad) * width + w_pad];
......@@ -60,10 +64,14 @@ void im2col(const %(float_type)s* data_im, const int channels,
// accumulated into data_im.
void col2im(const %(float_type)s* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, %(float_type)s* data_im) {
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
// Implicit dilated patch
int dil_patch_h = (patch_h - 1) * dilation_h + 1;
int dil_patch_w = (patch_w - 1) * dilation_w + 1;
int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
int num_kernels = channels * height * width;
int channels_col = channels * patch_h * patch_w;
for (int c = 0; c < channels_col; ++c) {
......@@ -71,9 +79,9 @@ void col2im(const %(float_type)s* data_col, const int channels,
int h_offset = (c / patch_w) %% patch_h;
int c_im = c / patch_h / patch_w;
for (int h = 0; h < height_col; ++h) {
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
for (int w = 0; w < width_col; ++w) {
int h_pad = h * stride_h - pad_h + h_offset;
int w_pad = w * stride_w - pad_w + w_offset;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(npy_intp)(c_im * height + h_pad) * width + w_pad] +=
data_col[(npy_intp)(c * height_col + h) * width_col + w];
......@@ -96,6 +104,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int direction,
const int dH = 1,
const int dW = 1,
const int dilH = 1,
const int dilW = 1,
const int padH = 0,
const int padW = 0)
{
......@@ -151,9 +161,12 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
"CorrMM images and kernel must have the same stack size\n");
return NULL;
}
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth)
const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - kW) / dW + 1;
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
......@@ -206,7 +219,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
// Second, gemm
%(gemm)s(&NTrans, &NTrans,
&N_, &M_, &K_,
......@@ -255,7 +269,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
......@@ -314,7 +329,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
(%(float_type)s*)PyArray_DATA(col), &N_);
// col2im back to the data
col2im((%(float_type)s*)PyArray_DATA(col), nChannels, bottomHeight, bottomWidth,
kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride);
kH, kW, dilH, dilW, padH, padW,
dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride);
}
/*
// Original caffe code for comparison
......
......@@ -79,7 +79,8 @@ def local_abstractconv_gemm(node):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
rval = CorrMM(border_mode=node.op.border_mode,
subsample=node.op.subsample)(img, kern)
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(img, kern)
copy_stack_trace(node.outputs[0], rval)
return [rval]
......@@ -97,7 +98,8 @@ def local_abstractconv_gradweight_gemm(node):
return None
rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample)(img, topgrad, shape)
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(img, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
# need to flip the kernel if necessary
......@@ -124,7 +126,8 @@ def local_abstractconv_gradinputs_gemm(node):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample)(kern, topgrad,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(kern, topgrad,
shape)
copy_stack_trace(node.outputs[0], rval)
......@@ -221,7 +224,9 @@ def local_conv2d_gradweight_cpu(node):
assert len(op_imshp) == 4 and len(op_kshp) == 4
outshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, node.op.subsample)[2:]
node.op.border_mode,
node.op.subsample,
node.op.filter_dilation)[2:]
fulloutshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, (1, 1))[2:]
......@@ -334,7 +339,9 @@ def local_conv2d_gradinputs_cpu(node):
filters = filters[:, :, ::-1, ::-1]
outshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, node.op.subsample)[2:]
node.op.border_mode,
node.op.subsample,
node.op.filter_dilation)[2:]
fulloutshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, (1, 1))[2:]
......
......@@ -4,7 +4,6 @@ import numpy
import numpy as np
from nose.plugins.skip import SkipTest
from nose.tools import assert_raises
import theano
from theano import tensor
from theano.gof.opt import check_stack_trace
......@@ -24,46 +23,57 @@ from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
def conv_corr(inputs, filters, border_mode="valid", subsample=(1, 1),
conv_mode='conv'):
def conv_corr(inputs, filters, border_mode="valid",
subsample=(1, 1), conv_mode='conv',
filter_dilation=(1, 1)):
if conv_mode == 'conv':
filters = filters[:, :, ::-1, ::-1]
return corr.CorrMM(border_mode, subsample)(inputs, filters)
return corr.CorrMM(border_mode,
subsample,
filter_dilation)(inputs, filters)
def conv_corr_gw(inputs, topgrad, filters_shape, border_mode="valid",
subsample=(1, 1), conv_mode='conv'):
rval = corr.CorrMM_gradWeights(border_mode, subsample)(inputs, topgrad,
def conv_corr_gw(inputs, topgrad, filters_shape,
border_mode="valid", subsample=(1, 1),
conv_mode='conv', filter_dilation=(1, 1)):
rval = corr.CorrMM_gradWeights(border_mode,
subsample,
filter_dilation)(inputs, topgrad,
filters_shape[2:])
if conv_mode == 'conv':
rval = rval[:, :, ::-1, ::-1]
return rval
def conv_corr_gi(filters, topgrad, inputs_shape, border_mode="valid",
subsample=(1, 1), conv_mode='conv'):
def conv_corr_gi(filters, topgrad, inputs_shape,
border_mode="valid", subsample=(1, 1),
conv_mode='conv', filter_dilation=(1, 1)):
if conv_mode == 'conv':
filters = filters[:, :, ::-1, ::-1]
return corr.CorrMM_gradInputs(border_mode, subsample)(filters, topgrad,
return corr.CorrMM_gradInputs(border_mode,
subsample,
filter_dilation)(filters,
topgrad,
inputs_shape[2:])
class TestGetConvOutShape(unittest.TestCase):
def test_basic(self):
image_shape, kernel_shape = (3, 2, 8, 9), (4, 2, 5, 6)
image_shape, kernel_shape = (3, 2, 12, 9), (4, 2, 5, 6)
sub_sample = (1, 2)
filter_dilation = (2, 1)
test1_params = get_conv_output_shape(
image_shape, kernel_shape, 'valid', sub_sample)
image_shape, kernel_shape, 'valid', sub_sample, filter_dilation)
test2_params = get_conv_output_shape(
image_shape, kernel_shape, 'half', sub_sample)
image_shape, kernel_shape, 'half', sub_sample, filter_dilation)
test3_params = get_conv_output_shape(
image_shape, kernel_shape, 'full', sub_sample)
image_shape, kernel_shape, 'full', sub_sample, filter_dilation)
test4_params = get_conv_output_shape(
image_shape, kernel_shape, (1, 2), sub_sample)
image_shape, kernel_shape, (1, 2), sub_sample, filter_dilation)
self.assertTrue(test1_params == (3, 4, 4, 2))
self.assertTrue(test2_params == (3, 4, 8, 5))
self.assertTrue(test3_params == (3, 4, 12, 7))
self.assertTrue(test2_params == (3, 4, 12, 5))
self.assertTrue(test3_params == (3, 4, 20, 7))
self.assertTrue(test4_params == (3, 4, 6, 4))
......@@ -71,35 +81,41 @@ class BaseTestConv2d(unittest.TestCase):
def setUp(self):
if theano.config.blas.ldflags == '':
raise SkipTest("BLAS required for reference")
self.inputs_shapes = [(8, 1, 12, 12), (8, 1, 18, 18), (2, 1, 4, 4),
self.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
self.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
(1, 1, 2, 5), (4, 1, 2, 2), (4, 5, 2, 2)]
(1, 1, 2, 3), (4, 1, 1, 3), (4, 5, 3, 2)]
self.subsamples = [(1, 1), (2, 2), (2, 4)]
self.filters_dilations = [(1, 1), (1, 2), (2, 1)]
self.border_modes = ["valid", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
self.filter_flip = [True, False]
self.provide_shape = [True, False]
self.shared = theano.compile.shared
def get_output_shape(self, inputs_shape, filters_shape, subsample,
border_mode):
def get_output_shape(self, inputs_shape, filters_shape,
subsample, border_mode, filter_dilation):
dil_filters = ((filters_shape[2] - 1) * filter_dilation[0] + 1,
(filters_shape[3] - 1) * filter_dilation[1] + 1)
if border_mode == "valid":
border_mode = (0, 0)
if border_mode == "full":
border_mode = (filters_shape[2] - 1, filters_shape[3] - 1)
border_mode = (dil_filters[0] - 1,
dil_filters[1] - 1)
batch_size = inputs_shape[0]
num_filters = filters_shape[0]
return ((batch_size, num_filters,) +
tuple(None if i is None or k is None
else ((i + 2 * pad - k) // d + 1)
for i, k, d, pad in zip(inputs_shape[2:],
else ((i + 2 * pad - ((k - 1) * fd + 1)) // d + 1)
for i, k, d, pad, fd in zip(inputs_shape[2:],
filters_shape[2:],
subsample, border_mode)))
subsample, border_mode,
filter_dilation)))
def run_fwd(self, inputs_shape, filters_shape, ref=conv_corr,
subsample=(1, 1), verify_grad=True, mode=None,
border_mode='valid', filter_flip=True, provide_shape=False,
target_op=None, check_trace=False):
border_mode='valid', filter_flip=True,
provide_shape=False, target_op=None,
check_trace=False, filter_dilation=(1, 1)):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
......@@ -120,13 +136,15 @@ class BaseTestConv2d(unittest.TestCase):
c_ref = ref(inputs, filters,
border_mode=border_mode,
subsample=subsample,
conv_mode=conv_mode)
conv_mode=conv_mode,
filter_dilation=filter_dilation)
c = conv.conv2d(inputs, filters,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
input_shape=imshp,
filter_shape=kshp)
filter_shape=kshp,
filter_dilation=filter_dilation)
f_ref = theano.function([], c_ref, mode='FAST_RUN')
f = theano.function([], c, mode=mode)
......@@ -143,15 +161,17 @@ class BaseTestConv2d(unittest.TestCase):
if verify_grad:
utt.verify_grad(conv.AbstractConv2d(border_mode=border_mode,
imshp=imshp, kshp=kshp,
subsample=subsample),
subsample=subsample,
filter_dilation=filter_dilation),
[inputs_val, filters_val],
mode=mode)
def run_gradweight(self, inputs_shape, filters_shape, output_shape,
ref=conv_corr_gw, subsample=(1, 1), filter_flip=True,
verify_grad=True, mode=None, border_mode='valid',
provide_shape=False, target_op=None, check_trace=False):
ref=conv_corr_gw, subsample=(1, 1),
filter_flip=True, verify_grad=True, mode=None,
border_mode='valid', provide_shape=False,
target_op=None, check_trace=False,
filter_dilation=(1, 1)):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
output_val = numpy.random.random(output_shape).astype('float32')
......@@ -171,13 +191,15 @@ class BaseTestConv2d(unittest.TestCase):
c = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
filter_flip=filter_flip,
subsample=subsample,
imshp=imshp, kshp=kshp)
imshp=imshp, kshp=kshp,
filter_dilation=filter_dilation)
c = c(inputs, output, filters_shape[-2:])
c_ref = ref(inputs, output,
filters_shape,
border_mode=border_mode,
subsample=subsample,
conv_mode=conv_mode)
conv_mode=conv_mode,
filter_dilation=filter_dilation)
f = theano.function([], c, mode=mode)
f_ref = theano.function([], c_ref, mode='FAST_RUN')
......@@ -193,7 +215,8 @@ class BaseTestConv2d(unittest.TestCase):
def abstract_conv2d_gradweight(inputs_val, output_val):
conv_op = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
subsample=subsample)
subsample=subsample,
filter_dilation=filter_dilation)
return conv_op(inputs_val, output_val, filters_shape[-2:])
if verify_grad:
......@@ -204,8 +227,8 @@ class BaseTestConv2d(unittest.TestCase):
def run_gradinput(self, inputs_shape, filters_shape, output_shape,
ref=conv_corr_gi, subsample=(1, 1), filter_flip=True,
verify_grad=True, mode=None, border_mode='valid',
provide_shape=False, target_op=None, check_trace=False):
provide_shape=False, target_op=None,
check_trace=False, filter_dilation=(1, 1)):
output_val = numpy.random.random(output_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
output = self.shared(output_val)
......@@ -224,11 +247,12 @@ class BaseTestConv2d(unittest.TestCase):
c = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
imshp=imshp, kshp=kshp)
imshp=imshp, kshp=kshp,
filter_dilation=filter_dilation)
c = c(filters, output, inputs_shape[-2:])
c_ref = ref(filters, output, inputs_shape,
border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)
conv_mode=conv_mode, filter_dilation=filter_dilation)
f = theano.function([], c, mode=mode)
f_ref = theano.function([], c_ref, mode='FAST_RUN')
......@@ -244,7 +268,8 @@ class BaseTestConv2d(unittest.TestCase):
def abstract_conv2d_gradinputs(filters_val, output_val):
conv_op = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
subsample=subsample)
subsample=subsample,
filter_dilation=filter_dilation)
return conv_op(filters_val, output_val, inputs_shape[-2:])
if verify_grad:
......@@ -266,15 +291,18 @@ class BaseTestConv2d(unittest.TestCase):
self.tcase(i, f, ds, db, dflip, provide_shape)
except SkipTest as e:
skipped = e
for fd in self.filters_dilations:
for s in self.subsamples:
for b in self.border_modes:
try:
self.tcase(i, f, s, db, dflip, dprovide_shape)
self.tcase(i, f, s, db, dflip,
dprovide_shape, fd)
except SkipTest as e:
skipped = e
for flip in self.filter_flip:
try:
self.tcase(i, f, ds, db, flip, dprovide_shape)
self.tcase(i, f, ds, db, flip,
dprovide_shape)
except SkipTest as e:
skipped = e
if skipped:
......@@ -287,26 +315,27 @@ class TestCorrConv2d(BaseTestConv2d):
raise SkipTest()
return super(TestCorrConv2d, self).setUp()
def tcase(self, i, f, s, b, flip, provide_shape):
o = self.get_output_shape(i, f, s, b)
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
o = self.get_output_shape(i, f, s, b, fd)
if (not theano.config.blas.ldflags or
not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv2d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, provide_shape=provide_shape,
border_mode=b, filter_flip=flip, target_op=CorrMM,
check_trace=True)
border_mode=b, filter_flip=flip,
target_op=CorrMM, check_trace=True,
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=CorrMM_gradWeights,
check_trace=True)
check_trace=True, filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=CorrMM_gradInputs,
check_trace=True)
check_trace=True, filter_dilation=fd)
class TestCpuConv2d(BaseTestConv2d):
......@@ -319,9 +348,11 @@ class TestCpuConv2d(BaseTestConv2d):
def tearDown(self):
theano.config.on_opt_error = self.opt_err
def tcase(self, i, f, s, b, flip, provide_shape):
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
if fd != (1, 1):
raise SkipTest("No dilation implementation for basic cpu ConvOp.")
mode = self.mode
o = self.get_output_shape(i, f, s, b)
o = self.get_output_shape(i, f, s, b, fd)
fwd_OK = True
gradweight_OK = True
gradinput_OK = True
......@@ -347,11 +378,12 @@ class TestCpuConv2d(BaseTestConv2d):
if fwd_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv2d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=(gradweight_OK and gradinput_OK),
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape,
border_mode=b, filter_flip=flip, target_op=ConvOp,
check_trace=True)
check_trace=True, filter_dilation=fd)
else:
self.assertRaises(AssertionError,
self.run_fwd,
......@@ -363,7 +395,8 @@ class TestCpuConv2d(BaseTestConv2d):
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True)
check_trace=True,
filter_dilation=fd)
if gradweight_OK:
if not theano.config.blas.ldflags:
......@@ -374,7 +407,8 @@ class TestCpuConv2d(BaseTestConv2d):
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=(ConvOp, ConvGrad3D),
check_trace=True)
check_trace=True,
filter_dilation=fd)
else:
self.assertRaises(AssertionError,
self.run_gradweight,
......@@ -387,7 +421,8 @@ class TestCpuConv2d(BaseTestConv2d):
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True)
check_trace=True,
filter_dilation=fd)
if gradinput_OK:
if not theano.config.blas.ldflags:
......@@ -398,7 +433,8 @@ class TestCpuConv2d(BaseTestConv2d):
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=(ConvOp, ConvTransp3D),
check_trace=True)
check_trace=True,
filter_dilation=fd)
else:
self.assertRaises(AssertionError,
self.run_gradinput,
......@@ -411,7 +447,8 @@ class TestCpuConv2d(BaseTestConv2d):
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True)
check_trace=True,
filter_dilation=fd)
def test_constant_shapes():
......
......@@ -32,8 +32,8 @@ class TestCorr2D(utt.InferShapeTester):
def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1, 1),
input=None, filters=None,
verify_grad=True, non_contiguous=False):
input=None, filters=None, verify_grad=True,
non_contiguous=False, filter_dilation=(1, 1)):
"""
:param image_shape: The constant shape info passed to corrMM.
:param filter_shape: The constant shape info passed to corrMM.
......@@ -55,7 +55,8 @@ class TestCorr2D(utt.InferShapeTester):
# define theano graph and function
input.name = 'input'
filters.name = 'filters'
rval = corr.CorrMM(border_mode, subsample)(input, filters)
rval = corr.CorrMM(border_mode, subsample,
filter_dilation)(input, filters)
rval.name = 'corr_output'
return rval
......@@ -86,20 +87,22 @@ class TestCorr2D(utt.InferShapeTester):
orig_image_data = image_data
img_shape2d = numpy.array(N_image_shape[-2:])
fil_shape2d = numpy.array(N_filter_shape[-2:])
dil_shape2d = numpy.array(filter_dilation)
dil_fil_shape2d = (fil_shape2d - 1) * dil_shape2d + 1
subsample2d = numpy.array(subsample)
if border_mode == 'full':
padHW = (fil_shape2d - 1)
padHW = (dil_fil_shape2d - 1)
elif border_mode == 'valid':
padHW = numpy.array([0, 0])
elif border_mode == 'half':
padHW = numpy.floor(fil_shape2d / 2).astype('int32')
padHW = numpy.floor(dil_fil_shape2d / 2).astype('int32')
elif isinstance(border_mode, tuple):
padHW = numpy.array(border_mode)
elif isinstance(border_mode, integer_types):
padHW = numpy.array([border_mode, border_mode])
else:
raise NotImplementedError('Unsupported border_mode {}'.format(border_mode))
out_shape2d = numpy.floor((img_shape2d + 2 * (padHW) - fil_shape2d) / subsample2d) + 1
out_shape2d = numpy.floor((img_shape2d + 2 * (padHW) - dil_fil_shape2d) / subsample2d) + 1
# avoid numpy deprecation
out_shape2d = out_shape2d.astype('int32')
out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape2d)
......@@ -124,8 +127,8 @@ class TestCorr2D(utt.InferShapeTester):
for col in range(ref_output.shape[3]):
icol = col * subsample[1] # image col
ref_output[bb, nn, row, col] += (image2d[
irow:irow + N_filter_shape[2],
icol:icol + N_filter_shape[3]] * filter2d[::-1, ::-1]
irow:irow + dil_fil_shape2d[0]:filter_dilation[0],
icol:icol + dil_fil_shape2d[1]:filter_dilation[1]] * filter2d[::-1, ::-1]
).sum()
self.assertTrue(_allclose(theano_output, ref_output))
......@@ -186,6 +189,28 @@ class TestCorr2D(utt.InferShapeTester):
self.validate((1, 1, 6, 6), (1, 1, 3, 3), 1, subsample=(3, 3))
def test_filter_dilation(self):
    """Exercise correlation with filter dilations other than (1, 1).

    Covers the 'valid', 'full' and 'half' border modes as well as
    explicit (tuple and integer) padding, and finishes with one case
    that combines dilation with subsampling.
    """
    # Each entry: (image_shape, filter_shape, border_mode, filter_dilation)
    dilation_cases = [
        ((3, 2, 7, 5), (5, 2, 2, 3), 'valid', (2, 2)),
        ((3, 2, 14, 10), (5, 2, 2, 3), 'valid', (3, 1)),
        ((1, 1, 14, 14), (1, 1, 3, 3), 'valid', (2, 3)),
        ((3, 2, 7, 5), (5, 2, 2, 3), 'full', (2, 2)),
        ((3, 2, 7, 5), (5, 2, 2, 3), 'full', (3, 1)),
        ((1, 1, 6, 6), (1, 1, 3, 3), 'full', (2, 3)),
        ((3, 2, 7, 5), (5, 2, 2, 3), 'half', (2, 2)),
        ((3, 2, 7, 5), (5, 2, 2, 3), 'half', (3, 1)),
        ((1, 1, 6, 6), (1, 1, 3, 3), 'half', (2, 3)),
        ((3, 2, 7, 5), (5, 2, 2, 3), (1, 1), (2, 2)),
        ((3, 2, 7, 5), (5, 2, 2, 3), (2, 1), (2, 1)),
        ((1, 1, 6, 6), (1, 1, 3, 3), (1, 2), (1, 2)),
    ]
    for image_shape, filter_shape, border_mode, dilation in dilation_cases:
        self.validate(image_shape, filter_shape, border_mode,
                      filter_dilation=dilation)
    # Dilation combined with integer border mode and subsampling.
    self.validate((1, 1, 6, 6), (1, 1, 3, 3), 1,
                  subsample=(3, 3), filter_dilation=(2, 2))
@attr('slow')
def test_shape_Constant_tensor(self):
"""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论