Commit 1d2411c6 authored by Gijs van Tulder

Enable border_mode != valid and filter_dilation in GpuCorr3dMM.

This reuses the implementation of GpuCorr2dMM and its gradient ops.
Parent 1d9aff8a
......@@ -1396,29 +1396,64 @@ class BaseGpuCorr3dMM(GpuOp):
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
Parameters
----------
border_mode : {'valid', 'full', 'half'}
Additionally, the padding size could be directly specified by an integer
or a tuple of three integers
subsample
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
pad
*deprecated*, now you should always use border_mode.
"""
__props__ = ('border_mode', 'subsample', 'pad')
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation')
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
pad=(0, 0, 0)):
if border_mode != "valid":
raise ValueError("border_mode must be 'valid'")
if pad != (0, 0, 0):
_logger.warning(
'do not use pad for BaseGpuCorr3dMM; please set padding in '
'border_mode parameter, see the docstring for more details')
if border_mode != "valid":
raise ValueError("border_mode must be 'valid' if pad is given")
border_mode = pad
if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode, border_mode)
if isinstance(border_mode, tuple):
pad_h, pad_w, pad_d = map(int, border_mode)
border_mode = (pad_h, pad_w, pad_d)
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full', 'half')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of three'
' integers'.format(border_mode))
self.border_mode = border_mode
if len(subsample) != 3:
raise ValueError("subsample must have three elements")
self.subsample = subsample
if (pad not in ("half", "full")) and (len(pad) != 3):
raise ValueError("pad must be 'half', 'full', or have three elements")
self.pad = pad
if len(filter_dilation) != 3:
raise ValueError("filter_dilation must have three elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
@property
def pad(self):
if self.border_mode != 'valid':
return self.border_mode
return (0, 0, 0)
def __str__(self):
return '%s{%s, %s, pad=%r}' % (
return '%s{%s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
self.pad)
str(self.filter_dilation))
def flops(self, inp, outp):
""" Useful with the hack in profiling to print the MFlops"""
......@@ -1440,7 +1475,7 @@ class BaseGpuCorr3dMM(GpuOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 23)
return (0, 25)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -1503,15 +1538,17 @@ class BaseGpuCorr3dMM(GpuOp):
Ignored otherwise.
"""
if self.border_mode != "valid":
raise ValueError("mode must be 'valid'")
dH, dW, dD = self.subsample
if self.pad == "half":
dilH, dilW, dilD = self.filter_dilation
if self.border_mode == "half":
padH = padW = padD = -1
elif self.pad == "full":
elif self.border_mode == "full":
padH = padW = padD = -2
elif isinstance(self.border_mode, tuple):
padH, padW, padD = self.border_mode
else:
padH, padW, padD = self.pad
assert self.border_mode == "valid"
padH = padW = padD = 0
if direction == "forward":
direction = 0
out = top
......@@ -1556,6 +1593,9 @@ class BaseGpuCorr3dMM(GpuOp):
int dH = %(dH)s;
int dW = %(dW)s;
int dD = %(dD)s;
int dilH = %(dilH)s;
int dilW = %(dilW)s;
int dilD = %(dilD)s;
int padH = %(padH)s;
int padW = %(padW)s;
int padD = %(padD)s;
......@@ -1585,12 +1625,12 @@ class BaseGpuCorr3dMM(GpuOp):
else if (padH == -2)
{
// vertical full padding, we can infer the kernel height
kH = 2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kH = (2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1;
}
else
{
// explicit padding, we can infer the kernel height
kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ;
}
if ((dW != 1) || (padW == -1))
{
......@@ -1598,11 +1638,11 @@ class BaseGpuCorr3dMM(GpuOp):
}
else if (padW == -2)
{
kW = 2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
kW = (2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else
{
kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
if ((dD != 1) || (padD == -1))
{
......@@ -1610,22 +1650,27 @@ class BaseGpuCorr3dMM(GpuOp):
}
else if (padD == -2)
{
kD = 2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD;
kD = (2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
}
else
{
kD = CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD;
kD = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD+ 1;
}
}
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
int dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested
if (padH == -1)
{ // vertical half padding
padH = kH / 2;
padH = dil_kH / 2;
}
else if (padH == -2)
{ // vertical full padding
padH = kH - 1;
padH = dil_kH - 1;
}
else if (padH < 0)
{
......@@ -1633,10 +1678,10 @@ class BaseGpuCorr3dMM(GpuOp):
%(fail)s
}
if (padW == -1) { // horizontal half padding
padW = kW / 2;
padW = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = kW - 1;
padW = dil_kW - 1;
}
else if (padW < 0)
{
......@@ -1645,11 +1690,11 @@ class BaseGpuCorr3dMM(GpuOp):
}
if (padD == -1)
{ // horizontal half padding
padD = kD / 2;
padD = dil_kD / 2;
}
else if (padD == -2)
{ // horizontal full padding
padD = kD - 1;
padD = dil_kD - 1;
}
else if (padD < 0)
{
......@@ -1662,16 +1707,16 @@ class BaseGpuCorr3dMM(GpuOp):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width, depth)
// height and width: top = (bottom + 2*pad - weight) / sample + 1
// height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - CudaNdarray_HOST_DIMS(weights)[4]) / dD + 1;
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - ((CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width, depth)
// height, width and depth: weights = bottom + 2*pad - (top-1) * sample
// height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = CudaNdarray_HOST_DIMS(top)[1];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
out_dim[2] = kH; // already inferred further above
......@@ -1680,12 +1725,12 @@ class BaseGpuCorr3dMM(GpuOp):
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width, depth)
// height, width and depth: bottom = (top-1) * sample + weights - 2*pad
// height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + CudaNdarray_HOST_DIMS(weights)[4] - 2*padD;
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + (CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
......@@ -1716,7 +1761,8 @@ class BaseGpuCorr3dMM(GpuOp):
}
// Call CUDA code
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dD, padH, padW, padD);
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dD,
dilH, dilW, dilD, padH, padW, padD);
if (out2==NULL){
%(fail)s
}
......@@ -1734,19 +1780,28 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
Currently supports "valid" only; "full" can be simulated by setting
`pad="full"` (at the cost of performance), or by using
`GpuCorrMM_gradInputs`.
The width of a border of implicit zeros to pad the
input with. Must be a tuple with 3 elements giving the width of
the padding on each side, or a single integer to pad the same
on all sides, or a string shortcut setting the padding at runtime:
``'valid'`` for ``(0, 0, 0)`` (valid convolution, no padding), ``'full'``
for ``(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)``
(full convolution), ``'half'`` for ``(kernel_rows // 2,
kernel_columns // 2, kernel_depth // 2)`` (same convolution for
odd-sized kernels). Note that the three widths are each
applied twice, once per side (left and right, top and bottom, front
and back).
subsample
The subsample operation applied to each output image. Should be a tuple
with 3 elements. `(sv, sh, sl)` is equivalent to
`GpuCorrMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster.
Set to `(1, 1, 1)` to disable subsampling.
filter_dilation
The filter dilation operation applied to each input image.
Should be a tuple with 3 elements.
Set to `(1, 1, 1)` to disable filter dilation.
pad
The width of a border of implicit zeros to pad the input image with.
Should be a tuple with 3 elements giving the numbers of rows and columns
to pad on each side, or "half" to set the padding
to `(kernel_rows // 2, kernel_columns // 2, kernel_depth // 2)`,
or "full" to set the padding
to `(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)` at runtime.
Set to `(0, 0, 0)` to disable padding.
Deprecated alias for `border_mode`.
Notes
-----
......@@ -1765,8 +1820,10 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
batchsize or number of filters) may also work around the CUBLAS bug.
"""
def __init__(self, border_mode="valid", subsample=(1, 1, 1), pad=(0, 0, 0)):
super(GpuCorr3dMM, self).__init__(border_mode, subsample, pad)
def __init__(self, border_mode="valid", subsample=(1, 1, 1),
filter_dilation=(1, 1, 1), pad=(0, 0, 0)):
super(GpuCorr3dMM, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, img, kern):
img = as_cuda_ndarray_variable(img)
......@@ -1792,14 +1849,12 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
top = gpu_contiguous(top)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
self.subsample,
self.pad)(weights,
top,
bottom.shape[-3:])
self.filter_dilation)(
weights, top, bottom.shape[-3:])
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
self.filter_dilation)(
bottom, top, weights.shape[-3:])
return d_bottom, d_weights
......@@ -1815,8 +1870,10 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
pad=(0, 0, 0)):
super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, subsample, pad)
super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, img, topgrad, shape=None):
img = as_cuda_ndarray_variable(img)
......@@ -1828,10 +1885,14 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
raise TypeError('img must be 5D tensor')
if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) or self.pad == "half":
if self.subsample != (1, 1, 1) or self.border_mode == "half":
if shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1), or pad == "half"')
raise ValueError('shape must be given if subsample != (1, 1, 1)'
' or border_mode == "half"')
height_width_depth = [shape[0], shape[1], shape[2]]
assert shape[0].ndim == 0
assert shape[1].ndim == 0
assert shape[2].ndim == 0
else:
height_width_depth = []
......@@ -1850,9 +1911,13 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
bottom, top = inp[:2]
weights, = grads
weights = gpu_contiguous(weights)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(weights, top, bottom.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)(
bottom, weights)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(weights,
top,
bottom.shape[-3:])
d_top = GpuCorr3dMM(
self.border_mode, self.subsample, self.filter_dilation)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
return (d_bottom, d_top) + d_height_width_depth
......@@ -1875,8 +1940,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
pad=(0, 0, 0)):
super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample, pad)
super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, kern, topgrad, shape=None):
kern = as_cuda_ndarray_variable(kern)
......@@ -1888,6 +1955,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
if self.subsample != (1, 1, 1) and shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1)')
height_width_depth = [shape[0], shape[1], shape[2]] if self.subsample != (1, 1, 1) else []
if height_width_depth:
assert shape[0].ndim == 0
assert shape[1].ndim == 0
assert shape[2].ndim == 0
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False, False]
......@@ -1906,12 +1977,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
bottom = gpu_contiguous(bottom)
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
self.filter_dilation)(bottom,
top,
weights.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode,
self.subsample,
self.pad)(bottom, weights)
self.filter_dilation)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else ()
return (d_weights, d_top) + d_height_width_depth
......
......@@ -52,6 +52,54 @@ inline int GET_BLOCKS(const int N) {
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
// Dilated 3D im2col: each thread handles one (channel, h_col, w_col, d_col)
// output location and copies the kernel_h*kernel_w*kernel_d input taps of the
// corresponding window into one column of data_col, with taps spaced
// dilation_{h,w,d} apart in the input. Taps that fall outside the input
// (implicit zero padding) are written as 0.
__global__ void dilated_im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_col) {
CUDA_KERNEL_LOOP(index, n) {
// Decode index as ((c_im*height_col + h_col)*width_col + w_col)*depth_col + d_col.
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
// First row of data_col written by this thread (one row per kernel tap).
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
// Top-left-front corner of the receptive field in input coordinates;
// may be negative when padding is in effect.
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
// Walk the kernel grid; consecutive taps for this output location are
// height_col*width_col*depth_col elements apart in data_col.
for (int i = 0; i < kernel_h; ++i)
{
int h_im = h_offset + i * dilation_h;
for (int j = 0; j < kernel_w; ++j)
{
int w_im = w_offset + j * dilation_w;
for (int k = 0; k < kernel_d; ++k)
{
int d_im = d_offset + k * dilation_d;
// Out-of-bounds taps contribute zero (implicit padding).
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * dilation_h * (width * depth) +
j * dilation_w * depth +
k * dilation_d] : 0;
data_col_ptr += height_col * width_col * depth_col;
}
}
}
}
}
__global__ void im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
......@@ -62,41 +110,35 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
{
CUDA_KERNEL_LOOP(index, n)
{
int d_out = index % depth_col;
int w_index = index / depth_col;
int w_out = w_index % width_col;
int h_index = w_index / width_col;
int h_out = h_index % height_col;
int channel_in = h_index / height_col;
//channel_in = 1;
int channel_out = channel_in * kernel_h * kernel_w * kernel_d;
int h_in = h_out * stride_h - pad_h;
int w_in = w_out * stride_w - pad_w;
int d_in = d_out * stride_d - pad_d;
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += channel_out * (height_col * width_col * depth_col) +
h_out * (width_col * depth_col) + w_out * depth_col + d_out;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += channel_in * (height * width * depth) +
h_in * (width * depth) + w_in * depth + d_in;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i)
{
int h = h_in + i;
int h_im = h_offset + i;
for (int j = 0; j < kernel_w; ++j)
{
int w = w_in + j;
int w_im = w_offset + j;
for (int k = 0; k < kernel_d; ++k)
{
int d = d_in + k;
*data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
h < height && w < width && d < depth) ?
data_im_ptr[i * (width * depth) + j *depth + k] : 0;
int d_im = d_offset + k;
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * (width * depth) + j * depth + k] : 0;
data_col_ptr += height_col * width_col * depth_col;
}
}
......@@ -107,31 +149,105 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
void im3d2col(const float* data_im, const int channels,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
float* data_col)
{
// We are going to launch channels * height_col * width_col * depth_col kernels, each
// kernel responsible for copying a single-channel grid.
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
int num_kernels = channels * height_col * width_col * depth_col;
im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
dilated_im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
}
else{
im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
}
}
// CUDA kernel for the case of dilation
// Dilated 3D col2im: inverse of dilated_im3d2col_kernel. One thread per input
// element (c_im, h_im, w_im, d_im); it sums every data_col entry whose dilated
// kernel window covers that element and writes the total to data_im[index].
// Launching one thread per *input* element avoids atomic adds.
__global__ void dilated_col2im3d_kernel(
const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_im)
{
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
// Decode index into padded input coordinates (pad offsets added so the
// window-range math below can stay non-negative).
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
// Effective (dilated) kernel sizes.
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
int kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
// compute the start and end of the output
// (range of output positions whose window can contain this input element)
const int d_col_start = (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
int d_k = (d_im - d_col * stride_d);
// Only offsets that land exactly on the dilation grid correspond
// to a real kernel tap.
if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
d_k /= dilation_d;
// Row (c_im, h_k, w_k, d_k) of data_col, column (h_col, w_col, d_col).
int data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
w_k * kernel_d * height_col * width_col * depth_col +
d_k * height_col * width_col * depth_col +
h_col * width_col * depth_col +
w_col * depth_col +
d_col;
val += data_col[data_col_index];
}
}
}
}
data_im[index] = val;
}
}
__global__ void col2im3d_kernel(const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int patch_h, const int patch_w, const int patch_d,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
......@@ -140,59 +256,78 @@ __global__ void col2im3d_kernel(const int n, const float* data_col,
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
int d = index % depth + pad_d;
int w_index = index / depth;
int w = w_index % width + pad_w;
int h_index = w_index / width;
int h = h_index % height + pad_h;
int c = h_index / height;
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
// compute the start and end of the output
int d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
int d_col_end = min(d / stride_d + 1, depth_col);
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col);
const int d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
int offset =
(c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col;
(c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
w_im * kernel_d + d_im) * height_col * width_col * depth_col;
int coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
int coeff_h_col = (1 - stride_h * kernel_w * kernel_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w * kernel_d * height_col * width_col) * depth_col;
int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
}
}
}
data_im[index] = val;
data_im[index] = val;
}
}
void col2im3d(const float* data_col, const int channels,
const int height, const int width, const int depth,
const int patch_h, const int patch_w, const int patch_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
float* data_im)
{
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
int dil_patch_h = (patch_h - 1) * dilation_h + 1;
int dil_patch_w = (patch_w - 1) * dilation_w + 1;
int dil_patch_d = (patch_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
int num_kernels = channels * height * width * depth;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
dilated_col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
}
else{
col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
}
}
......@@ -210,6 +345,9 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
const int dH = 1,
const int dW = 1,
const int dD = 1,
const int dilH = 1,
const int dilW = 1,
const int dilD = 1,
const int padH = 0,
const int padW = 0,
const int padD = 0)
......@@ -286,10 +424,14 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
"GpuCorr3dMM images and kernel must have the same stack size\n");
return 0;
}
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
const int dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const int topHeight = int((bottomHeight + 2*padH - kH) / dH) + 1;
const int topWidth = int((bottomWidth + 2*padW - kW) / dW) + 1;
const int topDepth = int((bottomDepth + 2*padD - kD) / dD) + 1;
const int topHeight = int((bottomHeight + 2*padH - dil_kH) / dH) + 1;
const int topWidth = int((bottomWidth + 2*padW - dil_kW) / dW) + 1;
const int topDepth = int((bottomDepth + 2*padD - dil_kD) / dD) + 1;
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
......@@ -345,6 +487,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD,
col->devdata);
......@@ -392,6 +535,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
im3d2col(bottom->devdata + n * bottom_stride, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD,
col->devdata);
......@@ -461,6 +605,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
col2im3d(col->devdata, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD, bottom->devdata + n * bottom_stride);
cudaError_t err = cudaGetLastError();
......
from __future__ import absolute_import, print_function, division
import unittest
import numpy
from six.moves import xrange
try:
from scipy import ndimage
except ImportError:
ndimage = None
import theano
from theano.tests import unittest_tools as utt
......@@ -21,31 +26,127 @@ else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
# python reference implementation of a 3D convolution
# see also: theano.tensor.nnet.tests.test_conv3d2d
# expects: (batch, 0, channels, 1, 2)
def pyconv3d(signals, filters, border_mode='valid', dilation=(1, 1, 1)):
    """Pure NumPy/SciPy reference implementation of 3D convolution.

    Expects `signals` as (batch, time, channels, height, width) and
    `filters` as (nfilters, time, channels, height, width).
    `border_mode` is 'valid', 'full', 'half', or a 3-tuple of pad widths;
    `dilation` gives the per-axis filter dilation factors.
    Returns an array of shape (batch, t_out, nfilters, h_out, w_out).
    """
    Ns, Ts, C, Hs, Ws = signals.shape
    Nf, Tf, C, Hf, Wf = filters.shape
    Tdil, Hdil, Wdil = dilation
    # Effective kernel sizes once dilation is applied.
    Tfdil = (Tf - 1) * Tdil + 1
    Hfdil = (Hf - 1) * Hdil + 1
    Wfdil = (Wf - 1) * Wdil + 1
    # Translate border_mode into explicit per-axis pad widths.
    if border_mode == 'full':
        Tpad, Hpad, Wpad = Tfdil - 1, Hfdil - 1, Wfdil - 1
    elif border_mode == 'half':
        Tpad, Hpad, Wpad = Tfdil // 2, Hfdil // 2, Wfdil // 2
    elif isinstance(border_mode, tuple):
        Tpad, Hpad, Wpad = [int(p) for p in border_mode]
    else:
        Tpad, Hpad, Wpad = 0, 0, 0
    if Tpad > 0 or Hpad > 0 or Wpad > 0:
        # Zero-pad the signals (float32, matching the GPU op's dtype).
        padded = numpy.zeros((Ns, Ts + 2 * Tpad, C,
                              Hs + 2 * Hpad, Ws + 2 * Wpad), 'float32')
        padded[:, Tpad:(Ts + Tpad), :, Hpad:(Hs + Hpad),
               Wpad:(Ws + Wpad)] = signals
        signals = padded
        Ns, Ts, C, Hs, Ws = signals.shape
    Tfdil2 = Tfdil // 2
    Hfdil2 = Hfdil // 2
    Wfdil2 = Wfdil // 2
    # Materialize the dilated filters by scattering taps on a strided grid.
    dilated_filters = numpy.zeros((Nf, Tfdil, C, Hfdil, Wfdil),
                                  dtype=filters.dtype)
    dilated_filters[:, ::Tdil, :, ::Hdil, ::Wdil] = filters
    # Perform valid convolution on the (possibly padded) signals.
    rval = numpy.zeros((Ns, Ts - Tfdil + 1, Nf,
                        Hs - Hfdil + 1, Ws - Wfdil + 1))
    for ns in range(Ns):
        for nf in range(Nf):
            for c in range(C):
                s_i = signals[ns, :, c, :, :]
                f_i = dilated_filters[nf, :, c, :, :]
                r_i = rval[ns, :, nf, :, :]
                # scipy.signal.convolve does valid convolution but is quite
                # slow; scipy.ndimage.convolve is faster but only supports
                # 'same' convolution, so its output is cropped below.
                # origin must be -1 for even filters, 0 for odd filters.
                o_i = ndimage.convolve(
                    s_i, f_i, mode='constant', cval=1,
                    origin=tuple(s % 2 - 1 for s in f_i.shape))
                # Crop the 'same' result down to the 'valid' region; this
                # should equal signal.convolve(s_i, f_i, mode='valid').
                o_i = o_i[Tfdil2:(r_i.shape[0] + Tfdil2),
                          Hfdil2:(r_i.shape[1] + Hfdil2),
                          Wfdil2:(r_i.shape[2] + Wfdil2)]
                r_i += o_i
    return rval
class TestCorr3DMM(unittest.TestCase):
def run_conv_valid(self, inputs_shape, filters_shape,
                   border_mode='valid',
                   filter_dilation=(1, 1, 1),
                   subsample=(1, 1, 1),
                   verify_grad=False):
    """Compare GpuCorr3dMM against a reference implementation.

    Parameters
    ----------
    inputs_shape : 5-sequence, (batch, time, height, width, channels)
    filters_shape : 5-sequence, (nfilters, time, height, width, channels)
    border_mode : passed through to GpuCorr3dMM and the reference
    filter_dilation, subsample : 3-tuples passed through to GpuCorr3dMM
    verify_grad : if True, additionally check gradients numerically

    Raises SkipTest when no reference implementation covers the
    requested combination of border_mode, dilation and subsampling,
    or when SciPy is unavailable.
    """
    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')
    inputs = shared(inputs_val)
    filters = shared(filters_val)
    bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
    # Pick a reference implementation for the requested configuration.
    if filter_dilation == (1, 1, 1) and border_mode in ('valid', (0, 0, 0)):
        # conv3D supports subsampling but neither padding nor dilation.
        conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters,
                                             b=bias, d=subsample)
        f_ref = theano.function([], conv_ref)
        res_ref = f_ref()
    elif subsample == (1, 1, 1):
        if ndimage is None:
            raise SkipTest('This test needs SciPy.')
        # input = b012c
        # pyconv3d wants = b0c12 = (0, 1, 4, 2, 3)
        # pyconv3d outputs = b0c12, back to b012c = (0, 1, 3, 4, 2)
        # pyconv3d convolves (flips the filters), so pre-flip them
        # to obtain the correlation that GpuCorr3dMM computes.
        res_ref = pyconv3d(signals=inputs_val.transpose(0, 1, 4, 2, 3),
                           filters=filters_val.transpose(0, 1, 4, 2, 3)[:, ::-1, :, ::-1, ::-1],
                           dilation=filter_dilation,
                           border_mode=border_mode).transpose(0, 1, 3, 4, 2)
    else:
        raise SkipTest('No reference implementation that combines '
                       'border_mode and subsampling.')
    # GpuCorr3dMM wants bc012; dimshuffle there and back.
    conv = GpuCorr3dMM(border_mode=border_mode,
                       filter_dilation=filter_dilation,
                       subsample=subsample)(
        inputs.dimshuffle(0, 4, 1, 2, 3),
        filters.dimshuffle(0, 4, 1, 2, 3))
    conv = conv.dimshuffle(0, 2, 3, 4, 1)
    f = theano.function([], conv, mode=mode_with_gpu)
    res = f()
    utt.assert_allclose(res_ref, res)
    if verify_grad:
        utt.verify_grad(GpuCorr3dMM(border_mode=border_mode,
                                    filter_dilation=filter_dilation,
                                    subsample=subsample),
                        [inputs_val.transpose(0, 4, 1, 2, 3),
                         filters_val.transpose(0, 4, 1, 2, 3)])
def test_valid(self):
    """Smoke-test the default configuration (border_mode='valid')."""
    in_shape = (16, 20, 12, 16, 1)
    w_shape = (10, 6, 12, 4, 1)
    self.run_conv_valid(inputs_shape=in_shape, filters_shape=w_shape)
......@@ -68,6 +169,50 @@ class TestCorr3DMM(unittest.TestCase):
filters_shape=(10, 6, 12, 4, 1),
subsample=(1, 2, 3))
def test_border_mode(self):
    """Exercise every supported border_mode spelling: the three
    symbolic names plus explicit padding tuples (including zero
    padding, which must match 'valid')."""
    in_shape = (16, 20, 12, 15, 1)
    w_shape = (10, 6, 12, 4, 1)
    modes = ['valid', 'half', 'full',
             (0, 0, 0), (1, 2, 3), (3, 2, 1)]
    for mode in modes:
        self.run_conv_valid(inputs_shape=in_shape,
                            filters_shape=w_shape,
                            border_mode=mode)
def test_filter_dilation(self):
    """Dilate one axis at a time, under each symbolic border mode."""
    inputs_shape = [16, 20, 12, 15, 1]
    filters_shape = [10, 6, 5, 4, 1]
    for dilation in ((2, 1, 1), (1, 2, 1), (1, 1, 2)):
        for mode in ('valid', 'half', 'full'):
            self.run_conv_valid(inputs_shape=inputs_shape,
                                filters_shape=filters_shape,
                                filter_dilation=dilation,
                                border_mode=mode)
def test_verify_gradients(self):
    """Numerically check gradients on a deliberately small example
    (verify_grad is expensive) across dilations and border modes."""
    inputs_shape = [2, 7, 9, 6, 1]
    filters_shape = [1, 3, 3, 2, 1]
    for dilation in ((2, 1, 1), (1, 2, 1), (1, 1, 2)):
        for mode in ('valid', 'half', 'full', (2, 1, 3)):
            self.run_conv_valid(inputs_shape=inputs_shape,
                                filters_shape=filters_shape,
                                filter_dilation=dilation,
                                border_mode=mode,
                                verify_grad=True)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
subsample=(1, 1, 1)):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论