Commit 1d2411c6, authored by Gijs van Tulder

Enable border_mode != valid and filter_dilation in GpuCorr3dMM.

This reuses the implementation of GpuCorr2dMM and its gradient ops.
Parent commit: 1d9aff8a
...@@ -1396,29 +1396,64 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1396,29 +1396,64 @@ class BaseGpuCorr3dMM(GpuOp):
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly. `GpuCorr3dMM_gradInputs`. Cannot be used directly.
Parameters
----------
border_mode : {'valid', 'full', 'half'}
Additionally, the padding size could be directly specified by an integer
or a tuple of three integers
subsample
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
pad
*deprecated*, now you should always use border_mode.
""" """
__props__ = ('border_mode', 'subsample', 'pad') check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation')
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1, 1), subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
pad=(0, 0, 0)): pad=(0, 0, 0)):
if pad != (0, 0, 0):
_logger.warning(
'do not use pad for BaseGpuCorr3dMM; please set padding in '
'border_mode parameter, see the docstring for more details')
if border_mode != "valid": if border_mode != "valid":
raise ValueError("border_mode must be 'valid'") raise ValueError("border_mode must be 'valid' if pad is given")
border_mode = pad
if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode, border_mode)
if isinstance(border_mode, tuple):
pad_h, pad_w, pad_d = map(int, border_mode)
border_mode = (pad_h, pad_w, pad_d)
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full', 'half')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of three'
' integers'.format(border_mode))
self.border_mode = border_mode self.border_mode = border_mode
if len(subsample) != 3: if len(subsample) != 3:
raise ValueError("subsample must have three elements") raise ValueError("subsample must have three elements")
self.subsample = subsample if len(filter_dilation) != 3:
if (pad not in ("half", "full")) and (len(pad) != 3): raise ValueError("filter_dilation must have three elements")
raise ValueError("pad must be 'half', 'full', or have three elements") self.subsample = tuple(subsample)
self.pad = pad self.filter_dilation = tuple(filter_dilation)
@property
def pad(self):
if self.border_mode != 'valid':
return self.border_mode
return (0, 0, 0)
def __str__(self): def __str__(self):
return '%s{%s, %s, pad=%r}' % ( return '%s{%s, %s, %s}' % (
self.__class__.__name__, self.__class__.__name__,
self.border_mode, self.border_mode,
str(self.subsample), str(self.subsample),
self.pad) str(self.filter_dilation))
def flops(self, inp, outp): def flops(self, inp, outp):
""" Useful with the hack in profiling to print the MFlops""" """ Useful with the hack in profiling to print the MFlops"""
...@@ -1440,7 +1475,7 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1440,7 +1475,7 @@ class BaseGpuCorr3dMM(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 23) return (0, 25)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -1503,15 +1538,17 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1503,15 +1538,17 @@ class BaseGpuCorr3dMM(GpuOp):
Ignored otherwise. Ignored otherwise.
""" """
if self.border_mode != "valid":
raise ValueError("mode must be 'valid'")
dH, dW, dD = self.subsample dH, dW, dD = self.subsample
if self.pad == "half": dilH, dilW, dilD = self.filter_dilation
if self.border_mode == "half":
padH = padW = padD = -1 padH = padW = padD = -1
elif self.pad == "full": elif self.border_mode == "full":
padH = padW = padD = -2 padH = padW = padD = -2
elif isinstance(self.border_mode, tuple):
padH, padW, padD = self.border_mode
else: else:
padH, padW, padD = self.pad assert self.border_mode == "valid"
padH = padW = padD = 0
if direction == "forward": if direction == "forward":
direction = 0 direction = 0
out = top out = top
...@@ -1556,6 +1593,9 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1556,6 +1593,9 @@ class BaseGpuCorr3dMM(GpuOp):
int dH = %(dH)s; int dH = %(dH)s;
int dW = %(dW)s; int dW = %(dW)s;
int dD = %(dD)s; int dD = %(dD)s;
int dilH = %(dilH)s;
int dilW = %(dilW)s;
int dilD = %(dilD)s;
int padH = %(padH)s; int padH = %(padH)s;
int padW = %(padW)s; int padW = %(padW)s;
int padD = %(padD)s; int padD = %(padD)s;
...@@ -1585,12 +1625,12 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1585,12 +1625,12 @@ class BaseGpuCorr3dMM(GpuOp):
else if (padH == -2) else if (padH == -2)
{ {
// vertical full padding, we can infer the kernel height // vertical full padding, we can infer the kernel height
kH = 2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH; kH = (2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1;
} }
else else
{ {
// explicit padding, we can infer the kernel height // explicit padding, we can infer the kernel height
kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH; kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ;
} }
if ((dW != 1) || (padW == -1)) if ((dW != 1) || (padW == -1))
{ {
...@@ -1598,11 +1638,11 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1598,11 +1638,11 @@ class BaseGpuCorr3dMM(GpuOp):
} }
else if (padW == -2) else if (padW == -2)
{ {
kW = 2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW; kW = (2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
} }
else else
{ {
kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW; kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
} }
if ((dD != 1) || (padD == -1)) if ((dD != 1) || (padD == -1))
{ {
...@@ -1610,22 +1650,27 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1610,22 +1650,27 @@ class BaseGpuCorr3dMM(GpuOp):
} }
else if (padD == -2) else if (padD == -2)
{ {
kD = 2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD; kD = (2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
} }
else else
{ {
kD = CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD; kD = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD+ 1;
} }
} }
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
int dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested // Auto-padding if requested
if (padH == -1) if (padH == -1)
{ // vertical half padding { // vertical half padding
padH = kH / 2; padH = dil_kH / 2;
} }
else if (padH == -2) else if (padH == -2)
{ // vertical full padding { // vertical full padding
padH = kH - 1; padH = dil_kH - 1;
} }
else if (padH < 0) else if (padH < 0)
{ {
...@@ -1633,10 +1678,10 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1633,10 +1678,10 @@ class BaseGpuCorr3dMM(GpuOp):
%(fail)s %(fail)s
} }
if (padW == -1) { // horizontal half padding if (padW == -1) { // horizontal half padding
padW = kW / 2; padW = dil_kW / 2;
} }
else if (padW == -2) { // horizontal full padding else if (padW == -2) { // horizontal full padding
padW = kW - 1; padW = dil_kW - 1;
} }
else if (padW < 0) else if (padW < 0)
{ {
...@@ -1645,11 +1690,11 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1645,11 +1690,11 @@ class BaseGpuCorr3dMM(GpuOp):
} }
if (padD == -1) if (padD == -1)
{ // horizontal half padding { // horizontal half padding
padD = kD / 2; padD = dil_kD / 2;
} }
else if (padD == -2) else if (padD == -2)
{ // horizontal full padding { // horizontal full padding
padD = kD - 1; padD = dil_kD - 1;
} }
else if (padD < 0) else if (padD < 0)
{ {
...@@ -1662,16 +1707,16 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1662,16 +1707,16 @@ class BaseGpuCorr3dMM(GpuOp):
switch(direction) { switch(direction) {
case 0: // forward pass case 0: // forward pass
// output is top: (batchsize, num_filters, height, width, depth) // output is top: (batchsize, num_filters, height, width, depth)
// height and width: top = (bottom + 2*pad - weight) / sample + 1 // height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0]; out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0]; out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1; out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1; out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - CudaNdarray_HOST_DIMS(weights)[4]) / dD + 1; out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - ((CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
break; break;
case 1: // backprop wrt. weights case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width, depth) // output is weights: (num_filters, num_channels, height, width, depth)
// height, width and depth: weights = bottom + 2*pad - (top-1) * sample // height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = CudaNdarray_HOST_DIMS(top)[1]; out_dim[0] = CudaNdarray_HOST_DIMS(top)[1];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1]; out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
out_dim[2] = kH; // already inferred further above out_dim[2] = kH; // already inferred further above
...@@ -1680,12 +1725,12 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1680,12 +1725,12 @@ class BaseGpuCorr3dMM(GpuOp):
break; break;
case 2: // backprop wrt. inputs case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width, depth) // output is bottom: (batchsize, num_channels, height, width, depth)
// height, width and depth: bottom = (top-1) * sample + weights - 2*pad // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0]; out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1]; out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH; out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW; out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + CudaNdarray_HOST_DIMS(weights)[4] - 2*padD; out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + (CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
break; break;
default: default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n"); PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
...@@ -1716,7 +1761,8 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1716,7 +1761,8 @@ class BaseGpuCorr3dMM(GpuOp):
} }
// Call CUDA code // Call CUDA code
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dD, padH, padW, padD); out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dD,
dilH, dilW, dilD, padH, padW, padD);
if (out2==NULL){ if (out2==NULL){
%(fail)s %(fail)s
} }
...@@ -1734,19 +1780,28 @@ class GpuCorr3dMM(BaseGpuCorr3dMM): ...@@ -1734,19 +1780,28 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
Currently supports "valid" only; "full" can be simulated by setting Currently supports "valid" only; "full" can be simulated by setting
`pad="full"` (at the cost of performance), or by using `pad="full"` (at the cost of performance), or by using
`GpuCorrMM_gradInputs`. `GpuCorrMM_gradInputs`.
The width of a border of implicit zeros to pad the
input with. Must be a tuple with 3 elements giving the width of
the padding on each side, or a single integer to pad the same
on all sides, or a string shortcut setting the padding at runtime:
``'valid'`` for ``(0, 0, 0)`` (valid convolution, no padding), ``'full'``
for ``(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)``
(full convolution), ``'half'`` for ``(kernel_rows // 2,
kernel_columns // 2, kernel_depth // 2)`` (same convolution for
odd-sized kernels). Note that the three widths are each
applied twice, once per side (left and right, top and bottom, front
and back).
subsample subsample
The subsample operation applied to each output image. Should be a tuple The subsample operation applied to each output image. Should be a tuple
with 3 elements. `(sv, sh, sl)` is equivalent to with 3 elements. `(sv, sh, sl)` is equivalent to
`GpuCorrMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster. `GpuCorrMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster.
Set to `(1, 1, 1)` to disable subsampling. Set to `(1, 1, 1)` to disable subsampling.
filter_dilation
The filter dilation operation applied to each input image.
Should be a tuple with 3 elements.
Set to `(1, 1, 1)` to disable filter dilation.
pad pad
The width of a border of implicit zeros to pad the input image with. Deprecated alias for `border_mode`.
Should be a tuple with 3 elements giving the numbers of rows and columns
to pad on each side, or "half" to set the padding
to `(kernel_rows // 2, kernel_columns // 2, kernel_depth // 2)`,
or "full" to set the padding
to `(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)` at runtime.
Set to `(0, 0, 0)` to disable padding.
Notes Notes
----- -----
...@@ -1765,8 +1820,10 @@ class GpuCorr3dMM(BaseGpuCorr3dMM): ...@@ -1765,8 +1820,10 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
batchsize or number of filters) may also work around the CUBLAS bug. batchsize or number of filters) may also work around the CUBLAS bug.
""" """
def __init__(self, border_mode="valid", subsample=(1, 1, 1), pad=(0, 0, 0)): def __init__(self, border_mode="valid", subsample=(1, 1, 1),
super(GpuCorr3dMM, self).__init__(border_mode, subsample, pad) filter_dilation=(1, 1, 1), pad=(0, 0, 0)):
super(GpuCorr3dMM, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, img, kern): def make_node(self, img, kern):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
...@@ -1792,14 +1849,12 @@ class GpuCorr3dMM(BaseGpuCorr3dMM): ...@@ -1792,14 +1849,12 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
top = gpu_contiguous(top) top = gpu_contiguous(top)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
self.subsample, self.subsample,
self.pad)(weights, self.filter_dilation)(
top, weights, top, bottom.shape[-3:])
bottom.shape[-3:])
d_weights = GpuCorr3dMM_gradWeights(self.border_mode, d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample, self.subsample,
self.pad)(bottom, self.filter_dilation)(
top, bottom, top, weights.shape[-3:])
weights.shape[-3:])
return d_bottom, d_weights return d_bottom, d_weights
...@@ -1815,8 +1870,10 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM): ...@@ -1815,8 +1870,10 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1, 1), subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
pad=(0, 0, 0)): pad=(0, 0, 0)):
super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, subsample, pad) super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, img, topgrad, shape=None): def make_node(self, img, topgrad, shape=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
...@@ -1828,10 +1885,14 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM): ...@@ -1828,10 +1885,14 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
raise TypeError('img must be 5D tensor') raise TypeError('img must be 5D tensor')
if topgrad.type.ndim != 5: if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor') raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) or self.pad == "half": if self.subsample != (1, 1, 1) or self.border_mode == "half":
if shape is None: if shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1), or pad == "half"') raise ValueError('shape must be given if subsample != (1, 1, 1)'
' or border_mode == "half"')
height_width_depth = [shape[0], shape[1], shape[2]] height_width_depth = [shape[0], shape[1], shape[2]]
assert shape[0].ndim == 0
assert shape[1].ndim == 0
assert shape[2].ndim == 0
else: else:
height_width_depth = [] height_width_depth = []
...@@ -1850,9 +1911,13 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM): ...@@ -1850,9 +1911,13 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
bottom, top = inp[:2] bottom, top = inp[:2]
weights, = grads weights, = grads
weights = gpu_contiguous(weights) weights = gpu_contiguous(weights)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(weights, top, bottom.shape[-3:]) d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)( self.subsample,
bottom, weights) self.filter_dilation)(weights,
top,
bottom.shape[-3:])
d_top = GpuCorr3dMM(
self.border_mode, self.subsample, self.filter_dilation)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else () d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
return (d_bottom, d_top) + d_height_width_depth return (d_bottom, d_top) + d_height_width_depth
...@@ -1875,8 +1940,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM): ...@@ -1875,8 +1940,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1, 1), subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
pad=(0, 0, 0)): pad=(0, 0, 0)):
super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample, pad) super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, kern, topgrad, shape=None): def make_node(self, kern, topgrad, shape=None):
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
...@@ -1888,6 +1955,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM): ...@@ -1888,6 +1955,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
if self.subsample != (1, 1, 1) and shape is None: if self.subsample != (1, 1, 1) and shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1)') raise ValueError('shape must be given if subsample != (1, 1, 1)')
height_width_depth = [shape[0], shape[1], shape[2]] if self.subsample != (1, 1, 1) else [] height_width_depth = [shape[0], shape[1], shape[2]] if self.subsample != (1, 1, 1) else []
if height_width_depth:
assert shape[0].ndim == 0
assert shape[1].ndim == 0
assert shape[2].ndim == 0
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False, False] False, False, False]
...@@ -1906,12 +1977,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM): ...@@ -1906,12 +1977,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
bottom = gpu_contiguous(bottom) bottom = gpu_contiguous(bottom)
d_weights = GpuCorr3dMM_gradWeights(self.border_mode, d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample, self.subsample,
self.pad)(bottom, self.filter_dilation)(bottom,
top, top,
weights.shape[-3:]) weights.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode, d_top = GpuCorr3dMM(self.border_mode,
self.subsample, self.subsample,
self.pad)(bottom, weights) self.filter_dilation)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),)\ d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else () * 3 if len(inp) == 5 else ()
return (d_weights, d_top) + d_height_width_depth return (d_weights, d_top) + d_height_width_depth
......
...@@ -52,6 +52,54 @@ inline int GET_BLOCKS(const int N) { ...@@ -52,6 +52,54 @@ inline int GET_BLOCKS(const int N) {
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu) // (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy // Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
__global__ void dilated_im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_col) {
CUDA_KERNEL_LOOP(index, n) {
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i)
{
int h_im = h_offset + i * dilation_h;
for (int j = 0; j < kernel_w; ++j)
{
int w_im = w_offset + j * dilation_w;
for (int k = 0; k < kernel_d; ++k)
{
int d_im = d_offset + k * dilation_d;
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * dilation_h * (width * depth) +
j * dilation_w * depth +
k * dilation_d] : 0;
data_col_ptr += height_col * width_col * depth_col;
}
}
}
}
}
__global__ void im3d2col_kernel(const int n, const float* data_im, __global__ void im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth, const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d, const int kernel_h, const int kernel_w, const int kernel_d,
...@@ -62,41 +110,35 @@ __global__ void im3d2col_kernel(const int n, const float* data_im, ...@@ -62,41 +110,35 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
{ {
CUDA_KERNEL_LOOP(index, n) CUDA_KERNEL_LOOP(index, n)
{ {
int d_out = index % depth_col; const int w_index = index / depth_col;
int w_index = index / depth_col; const int h_index = w_index / width_col;
int w_out = w_index % width_col; const int d_col = index % depth_col;
int h_index = w_index / width_col; const int h_col = h_index % height_col;
int h_out = h_index % height_col; const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
int channel_in = h_index / height_col; const int c_col = c_im * kernel_h * kernel_w * kernel_d;
//channel_in = 1; const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
int channel_out = channel_in * kernel_h * kernel_w * kernel_d; const int d_offset = d_col * stride_d - pad_d;
int h_in = h_out * stride_h - pad_h;
int w_in = w_out * stride_w - pad_w;
int d_in = d_out * stride_d - pad_d;
float* data_col_ptr = data_col; float* data_col_ptr = data_col;
data_col_ptr += channel_out * (height_col * width_col * depth_col) + data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_out * (width_col * depth_col) + w_out * depth_col + d_out; h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im; const float* data_im_ptr = data_im;
data_im_ptr += channel_in * (height * width * depth) + data_im_ptr += c_im * (height * width * depth) +
h_in * (width * depth) + w_in * depth + d_in; h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i) for (int i = 0; i < kernel_h; ++i)
{ {
int h = h_in + i; int h_im = h_offset + i;
for (int j = 0; j < kernel_w; ++j) for (int j = 0; j < kernel_w; ++j)
{ {
int w = w_in + j; int w_im = w_offset + j;
for (int k = 0; k < kernel_d; ++k) for (int k = 0; k < kernel_d; ++k)
{ {
int d = d_in + k; int d_im = d_offset + k;
*data_col_ptr = (h >= 0 && w >= 0 && d >= 0 && *data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h < height && w < width && d < depth) ? h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * (width * depth) + j *depth + k] : 0; data_im_ptr[i * (width * depth) + j * depth + k] : 0;
data_col_ptr += height_col * width_col * depth_col; data_col_ptr += height_col * width_col * depth_col;
} }
} }
...@@ -107,16 +149,32 @@ __global__ void im3d2col_kernel(const int n, const float* data_im, ...@@ -107,16 +149,32 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
void im3d2col(const float* data_im, const int channels, void im3d2col(const float* data_im, const int channels,
const int height, const int width, const int depth, const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d, const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d, const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d, const int stride_h, const int stride_w, const int stride_d,
float* data_col) float* data_col)
{ {
// We are going to launch channels * height_col * width_col * depth_col kernels, each // We are going to launch channels * height_col * width_col * depth_col kernels, each
// kernel responsible for copying a single-channel grid. // kernel responsible for copying a single-channel grid.
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1; int dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
int num_kernels = channels * height_col * width_col * depth_col; int num_kernels = channels * height_col * width_col * depth_col;
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
dilated_im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
}
else{
im3d2col_kernel<<<GET_BLOCKS(num_kernels), im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im, CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth, height, width, depth,
...@@ -125,13 +183,71 @@ void im3d2col(const float* data_im, const int channels, ...@@ -125,13 +183,71 @@ void im3d2col(const float* data_im, const int channels,
stride_h, stride_w, stride_d, stride_h, stride_w, stride_d,
height_col, width_col, depth_col, height_col, width_col, depth_col,
data_col); data_col);
}
} }
// CUDA kernel for the case of dilation
__global__ void dilated_col2im3d_kernel(
const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_im)
{
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
int kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
// compute the start and end of the output
const int d_col_start = (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
int d_k = (d_im - d_col * stride_d);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
d_k /= dilation_d;
int data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
w_k * kernel_d * height_col * width_col * depth_col +
d_k * height_col * width_col * depth_col +
h_col * width_col * depth_col +
w_col * depth_col +
d_col;
val += data_col[data_col_index];
}
}
}
}
data_im[index] = val;
}
}
__global__ void col2im3d_kernel(const int n, const float* data_col, __global__ void col2im3d_kernel(const int n, const float* data_col,
const int height, const int width, const int depth, const int height, const int width, const int depth,
const int channels, const int channels,
const int patch_h, const int patch_w, const int patch_d, const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d, const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d, const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col, const int height_col, const int width_col, const int depth_col,
...@@ -140,33 +256,35 @@ __global__ void col2im3d_kernel(const int n, const float* data_col, ...@@ -140,33 +256,35 @@ __global__ void col2im3d_kernel(const int n, const float* data_col,
CUDA_KERNEL_LOOP(index, n) CUDA_KERNEL_LOOP(index, n)
{ {
float val = 0; float val = 0;
int d = index % depth + pad_d; const int d_im = index % depth + pad_d;
int w_index = index / depth; const int w_index = index / depth;
int w = w_index % width + pad_w; const int w_im = w_index % width + pad_w;
int h_index = w_index / width; const int h_index = w_index / width;
int h = h_index % height + pad_h; const int h_im = h_index % height + pad_h;
int c = h_index / height; const int c_im = h_index / height;
// compute the start and end of the output // compute the start and end of the output
int d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1; const int d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel_d) / stride_d + 1;
int d_col_end = min(d / stride_d + 1, depth_col); const int d_col_end = min(d_im / stride_d + 1, depth_col);
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; const int w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col); const int w_col_end = min(w_im / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1; const int h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col); const int h_col_end = min(h_im / stride_h + 1, height_col);
int offset = int offset =
(c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col; (c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
w_im * kernel_d + d_im) * height_col * width_col * depth_col;
int coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col; int coeff_h_col = (1 - stride_h * kernel_w * kernel_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col; int coeff_w_col = (1 - stride_w * kernel_d * height_col * width_col) * depth_col;
int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col); int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col]; val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
} }
} }
}
data_im[index] = val; data_im[index] = val;
} }
} }
...@@ -174,17 +292,33 @@ __global__ void col2im3d_kernel(const int n, const float* data_col, ...@@ -174,17 +292,33 @@ __global__ void col2im3d_kernel(const int n, const float* data_col,
void col2im3d(const float* data_col, const int channels, void col2im3d(const float* data_col, const int channels,
const int height, const int width, const int depth, const int height, const int width, const int depth,
const int patch_h, const int patch_w, const int patch_d, const int patch_h, const int patch_w, const int patch_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d, const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d, const int stride_h, const int stride_w, const int stride_d,
float* data_im) float* data_im)
{ {
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int dil_patch_h = (patch_h - 1) * dilation_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; int dil_patch_w = (patch_w - 1) * dilation_w + 1;
int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1; int dil_patch_d = (patch_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
int num_kernels = channels * height * width * depth; int num_kernels = channels * height * width * depth;
// To avoid involving atomic operations, we will launch one kernel per // To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions. // bottom dimension, and then in the kernel add up the top dimensions.
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
dilated_col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
}
else{
col2im3d_kernel<<<GET_BLOCKS(num_kernels), col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col, CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels, height, width, depth, channels,
...@@ -193,6 +327,7 @@ void col2im3d(const float* data_col, const int channels, ...@@ -193,6 +327,7 @@ void col2im3d(const float* data_col, const int channels,
stride_h, stride_w, stride_d, stride_h, stride_w, stride_d,
height_col, width_col, depth_col, height_col, width_col, depth_col,
data_im); data_im);
}
} }
...@@ -210,6 +345,9 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom, ...@@ -210,6 +345,9 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
const int dH = 1, const int dH = 1,
const int dW = 1, const int dW = 1,
const int dD = 1, const int dD = 1,
const int dilH = 1,
const int dilW = 1,
const int dilD = 1,
const int padH = 0, const int padH = 0,
const int padW = 0, const int padW = 0,
const int padD = 0) const int padD = 0)
...@@ -286,10 +424,14 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom, ...@@ -286,10 +424,14 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
"GpuCorr3dMM images and kernel must have the same stack size\n"); "GpuCorr3dMM images and kernel must have the same stack size\n");
return 0; return 0;
} }
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
const int dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth) // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const int topHeight = int((bottomHeight + 2*padH - kH) / dH) + 1; const int topHeight = int((bottomHeight + 2*padH - dil_kH) / dH) + 1;
const int topWidth = int((bottomWidth + 2*padW - kW) / dW) + 1; const int topWidth = int((bottomWidth + 2*padW - dil_kW) / dW) + 1;
const int topDepth = int((bottomDepth + 2*padD - kD) / dD) + 1; const int topDepth = int((bottomDepth + 2*padD - dil_kD) / dD) + 1;
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] || if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] || nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] || topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
...@@ -345,6 +487,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom, ...@@ -345,6 +487,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
nChannels, nChannels,
bottomHeight, bottomWidth, bottomDepth, bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD, kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD, padH, padW, padD,
dH, dW, dD, dH, dW, dD,
col->devdata); col->devdata);
...@@ -392,6 +535,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom, ...@@ -392,6 +535,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
im3d2col(bottom->devdata + n * bottom_stride, nChannels, im3d2col(bottom->devdata + n * bottom_stride, nChannels,
bottomHeight, bottomWidth, bottomDepth, bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD, kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD, padH, padW, padD,
dH, dW, dD, dH, dW, dD,
col->devdata); col->devdata);
...@@ -461,6 +605,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom, ...@@ -461,6 +605,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
col2im3d(col->devdata, nChannels, col2im3d(col->devdata, nChannels,
bottomHeight, bottomWidth, bottomDepth, bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD, kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD, padH, padW, padD,
dH, dW, dD, bottom->devdata + n * bottom_stride); dH, dW, dD, bottom->devdata + n * bottom_stride);
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import unittest import unittest
import numpy import numpy
from six.moves import xrange
try:
from scipy import ndimage
except ImportError:
ndimage = None
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
...@@ -21,31 +26,127 @@ else: ...@@ -21,31 +26,127 @@ else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
# python reference implementation of a 3D convolution
# see also: theano.tensor.nnet.tests.test_conv3d2d
# expects: (batch, 0, channels, 1, 2)
def pyconv3d(signals, filters, border_mode='valid', dilation=(1, 1, 1)):
Ns, Ts, C, Hs, Ws = signals.shape
Nf, Tf, C, Hf, Wf = filters.shape
Tdil, Hdil, Wdil = dilation
Tfdil = (Tf - 1) * Tdil + 1
Hfdil = (Hf - 1) * Hdil + 1
Wfdil = (Wf - 1) * Wdil + 1
# if border_mode is not 'valid', the signals need zero-padding
if border_mode == 'full':
Tpad = Tfdil - 1
Hpad = Hfdil - 1
Wpad = Wfdil - 1
elif border_mode == 'half':
Tpad = Tfdil // 2
Hpad = Hfdil // 2
Wpad = Wfdil // 2
elif isinstance(border_mode, tuple):
Tpad, Hpad, Wpad = map(int, border_mode)
else:
Tpad = 0
Hpad = 0
Wpad = 0
if Tpad > 0 or Hpad > 0 or Wpad > 0:
# zero-pad signals
signals_padded = numpy.zeros((Ns, Ts + 2 * Tpad, C,
Hs + 2 * Hpad, Ws + 2 * Wpad), 'float32')
signals_padded[:, Tpad:(Ts + Tpad), :, Hpad:(Hs + Hpad),
Wpad:(Ws + Wpad)] = signals
Ns, Ts, C, Hs, Ws = signals_padded.shape
signals = signals_padded
Tfdil2 = Tfdil // 2
Hfdil2 = Hfdil // 2
Wfdil2 = Wfdil // 2
dilated_filters = numpy.zeros((Nf, Tfdil, C, Hfdil, Wfdil), dtype=filters.dtype)
dilated_filters[:, ::Tdil, :, ::Hdil, ::Wdil] = filters
# perform valid convolution on the padded signals
rval = numpy.zeros((Ns, Ts - Tfdil + 1, Nf, Hs - Hfdil + 1, Ws - Wfdil + 1))
for ns in xrange(Ns):
for nf in xrange(Nf):
for c in xrange(C):
s_i = signals[ns, :, c, :, :]
f_i = dilated_filters[nf, :, c, :, :]
r_i = rval[ns, :, nf, :, :]
# scipy.signal.convolve performs valid convolution,
# but is quite slow. scipy.ndimage.convolve is faster
# only supports 'same' convolution.
# origin must be -1 for even filters, 0 for odd filters
o_i = ndimage.convolve(s_i, f_i, mode='constant', cval=1,
origin=(f_i.shape[0] % 2 - 1,
f_i.shape[1] % 2 - 1,
f_i.shape[2] % 2 - 1))
# crop to get the result of 'valid' convolution
o_i = o_i[Tfdil2:(r_i.shape[0] + Tfdil2),
Hfdil2:(r_i.shape[1] + Hfdil2),
Wfdil2:(r_i.shape[2] + Wfdil2)]
# the result should be equal to 'valid' convolution
# utt.assert_allclose(o_i, signal.convolve(s_i, f_i, mode='valid'))
r_i += o_i
return rval
class TestCorr3DMM(unittest.TestCase): class TestCorr3DMM(unittest.TestCase):
def run_conv_valid(self, inputs_shape, filters_shape, def run_conv_valid(self, inputs_shape, filters_shape,
subsample=(1, 1, 1)): border_mode='valid',
filter_dilation=(1, 1, 1),
subsample=(1, 1, 1),
verify_grad=False):
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32') filters_val = numpy.random.random(filters_shape).astype('float32')
inputs = shared(inputs_val) inputs = shared(inputs_val)
filters = shared(filters_val) filters = shared(filters_val)
bias = shared(numpy.zeros(filters_shape[0]).astype('float32')) bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
if filter_dilation == (1, 1, 1) and border_mode in ('valid', (0, 0, 0)):
conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters, conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters,
b=bias, d=subsample) b=bias, d=subsample)
conv = GpuCorr3dMM(border_mode="valid", f_ref = theano.function([], conv_ref)
res_ref = f_ref()
elif subsample == (1, 1, 1):
if ndimage is None:
raise SkipTest('This test needs SciPy.')
# input = b012c
# pyconv3d wants = b0c12 = (0, 1, 4, 2, 3)
# pyconv3d outputs = b0c12 = (0, 1, 3, 4, 2)
res_ref = pyconv3d(signals=inputs_val.transpose(0, 1, 4, 2, 3),
filters=filters_val.transpose(0, 1, 4, 2, 3)[:, ::-1, :, ::-1, ::-1],
dilation=filter_dilation,
border_mode=border_mode).transpose(0, 1, 3, 4, 2)
else:
raise SkipTest('No reference implementation that combines '
'border_mode and subsampling.')
conv = GpuCorr3dMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)( subsample=subsample)(
inputs.dimshuffle(0, 4, 1, 2, 3), inputs.dimshuffle(0, 4, 1, 2, 3),
filters.dimshuffle(0, 4, 1, 2, 3)) filters.dimshuffle(0, 4, 1, 2, 3))
conv = conv.dimshuffle(0, 2, 3, 4, 1) conv = conv.dimshuffle(0, 2, 3, 4, 1)
f_ref = theano.function([], conv_ref)
f = theano.function([], conv, mode=mode_with_gpu) f = theano.function([], conv, mode=mode_with_gpu)
res_ref = f_ref()
res = f() res = f()
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
if verify_grad:
utt.verify_grad(GpuCorr3dMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample),
[inputs_val.transpose(0, 4, 1, 2, 3),
filters_val.transpose(0, 4, 1, 2, 3)])
def test_valid(self): def test_valid(self):
self.run_conv_valid(inputs_shape=(16, 20, 12, 16, 1), self.run_conv_valid(inputs_shape=(16, 20, 12, 16, 1),
filters_shape=(10, 6, 12, 4, 1)) filters_shape=(10, 6, 12, 4, 1))
...@@ -68,6 +169,50 @@ class TestCorr3DMM(unittest.TestCase): ...@@ -68,6 +169,50 @@ class TestCorr3DMM(unittest.TestCase):
filters_shape=(10, 6, 12, 4, 1), filters_shape=(10, 6, 12, 4, 1),
subsample=(1, 2, 3)) subsample=(1, 2, 3))
def test_border_mode(self):
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode='valid')
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode='half')
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode='full')
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode=(0, 0, 0))
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode=(1, 2, 3))
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode=(3, 2, 1))
def test_filter_dilation(self):
inputs_shape = [16, 20, 12, 15, 1]
filters_shape = [10, 6, 5, 4, 1]
for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
for border_mode in ['valid', 'half', 'full']:
self.run_conv_valid(inputs_shape=inputs_shape,
filters_shape=filters_shape,
filter_dilation=filter_dilation,
border_mode=border_mode)
def test_verify_gradients(self):
# use a small example to check the gradients
inputs_shape = [2, 7, 9, 6, 1]
filters_shape = [1, 3, 3, 2, 1]
for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
for border_mode in ['valid', 'half', 'full', (2, 1, 3)]:
self.run_conv_valid(inputs_shape=inputs_shape,
filters_shape=filters_shape,
filter_dilation=filter_dilation,
border_mode=border_mode,
verify_grad=True)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape, def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
subsample=(1, 1, 1)): subsample=(1, 1, 1)):
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype('float32')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论