Commit 4acf0a40, authored by Nicolas Ballas

Add 3d correlation based on matrix multiplication

Parent: c022347b
......@@ -960,6 +960,503 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
return [[1], [1], [0], [0]] # no connection to height, width
class BaseGpuCorr3dMM(GpuOp):
    """Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
    `GpuCorr3dMM_gradInputs`. Cannot be used directly.

    Holds the parameters shared by the forward and backprop 3d correlation
    Ops (border mode, subsampling, padding) and generates the C code that
    dispatches to the ``corr3dMM`` CUDA routine defined in corr3d_gemm.cu.
    """
    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 pad=(0, 0, 0)):
        if border_mode != "valid":
            raise ValueError("border_mode must be 'valid'")
        self.border_mode = border_mode
        if len(subsample) != 3:
            raise ValueError("subsample must have three elements")
        self.subsample = subsample
        if (pad not in ("half", "full")) and (len(pad) != 3):
            raise ValueError("pad must be 'half', 'full', or have three elements")
        self.pad = pad

    def __eq__(self, other):
        return type(self) == type(other) \
            and self.border_mode == other.border_mode \
            and self.subsample == other.subsample \
            and self.pad == other.pad

    def __hash__(self):
        # don't use hash(self.version) as hash(-1)==-2 and
        # hash(-2)==-2 in python!
        return hash(type(self)) \
            ^ hash(self.border_mode) \
            ^ hash(self.subsample) \
            ^ hash(self.pad)

    def __str__(self):
        return '%s{%s, %s, pad=%r}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            self.pad)

    def flops(self, inp, outp):
        """ Useful with the hack in profilemode to print the MFlops"""
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
        outputs, = outp
        assert inputs[1] == filters[1]
        # nb mul and add by output voxel: the filter spans all three
        # spatial dimensions (rows x columns x depth), so filters[4]
        # must be included (it was missing in the 2d-derived version)
        flops = filters[2] * filters[3] * filters[4] * 2
        # nb flops by output image: all three spatial output dimensions
        flops *= outputs[2] * outputs[3] * outputs[4]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0]
        return flops

    def c_headers(self):
        return ['cuda_ndarray.cuh', '<stdio.h>']

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (0, 2)

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        files = ['corr3d_gemm.cu']
        codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
                 for f in files]
        return reduce(str.__add__, codes)

    def c_code_helper(self, bottom, weights,
                      top, direction,
                      sub,
                      height=None, width=None, depth=None):
        """
        This generates the C code for GpuCorr3dMM (direction="forward"),
        GpuCorr3dMM_gradWeights (direction="backprop weights"), and
        GpuCorr3dMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        :param bottom: Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        :param weights: Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        :param top: Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        :param direction: "forward" to correlate bottom with weights and store
            results in top,
            "backprop weights" to do a valid convolution of bottom with top
            (swapping the first two dimensions) and store results in weights,
            and "backprop inputs" to do a full convolution of top with weights
            (swapping the first two dimensions) and store results in bottom.
        :param sub: Dictionary of substitutions useable to help generating the
            C code.
        :param height: If self.subsample[0] != 1, a variable giving the height
            of the filters for direction="backprop weights" or the height of
            the input images for direction="backprop inputs".
            If self.pad == 'half', a variable giving the height of the filters
            for direction="backprop weights".
            Ignored otherwise.
        :param width: If self.subsample[1] != 1, a variable giving the width
            of the filters for direction="backprop weights" or the width of
            the input images for direction="backprop inputs".
            If self.pad == 'half', a variable giving the width of the filters
            for direction="backprop weights".
            Ignored otherwise.
        :param depth: If self.subsample[2] != 1, a variable giving the depth
            of the filters for direction="backprop weights" or the depth of
            the input images for direction="backprop inputs".
            If self.pad == 'half', a variable giving the depth of the filters
            for direction="backprop weights".
            Ignored otherwise.
        """
        if self.border_mode != "valid":
            raise ValueError("mode must be 'valid'")
        dH, dW, dD = self.subsample
        if self.pad == "half":
            padH = padW = padD = -1
        elif self.pad == "full":
            padH = padW = padD = -2
        else:
            padH, padW, padD = self.pad
        if direction == "forward":
            direction = 0
            out = top
        elif direction == "backprop weights":
            direction = 1
            out = weights
        elif direction == "backprop inputs":
            direction = 2
            out = bottom
        else:
            raise ValueError("direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
        # When subsampling, we cannot unambiguously infer the height, width
        # and depth of bottom and weights from top, so we require them to be
        # given. Similarly, when pad="half", we cannot infer the weight size.
        if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
            if not height:
                raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
            height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
        else:
            height = 'NULL'
        if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
            if not width:
                raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
            width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
        else:
            width = 'NULL'
        if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
            if not depth:
                # note: this is sampling along the depth axis, not horizontal
                raise ValueError("depth must be given for backprop with depth sampling or pad='half'")
            depth = '(*(npy_int*)(PyArray_DATA(%s)))' % depth
        else:
            depth = 'NULL'
        sub = sub.copy()
        sub.update(locals())
        return """
    // Mandatory args
    int direction = %(direction)s;  // forward, bprop weights, bprop inputs
    // Optional args
    int dH = %(dH)s;
    int dW = %(dW)s;
    int dD = %(dD)s;
    int padH = %(padH)s;
    int padW = %(padW)s;
    int padD = %(padD)s;

    CudaNdarray * bottom = %(bottom)s;
    CudaNdarray * weights = %(weights)s;
    CudaNdarray * top = %(top)s;
    CudaNdarray * out2 = NULL;

    // Obtain or infer kernel width, height and depth
    // (we need to know it early to be able to handle auto-padding)
    int kH, kW, kD;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = CudaNdarray_HOST_DIMS(weights)[2];
        kW = CudaNdarray_HOST_DIMS(weights)[3];
        kD = CudaNdarray_HOST_DIMS(weights)[4];
    }
    else {
        if ((dH != 1) || (padH == -1)) {
            // vertical subsampling or half padding, kernel height is specified
            kH = %(height)s;
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
            kH = 2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
        }
        if ((dW != 1) || (padW == -1)) {
            kW = %(width)s;
        }
        else if (padW == -2) {
            kW = 2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
        }
        else {
            kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
        }
        if ((dD != 1) || (padD == -1)) {
            kD = %(depth)s;
        }
        else if (padD == -2) {
            kD = 2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD;
        }
        else {
            kD = CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD;
        }
    }

    // Auto-padding if requested
    if (padH == -1) {  // vertical half padding
        padH = kH / 2;
    }
    else if (padH == -2) {  // vertical full padding
        padH = kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) {  // horizontal half padding
        padW = kW / 2;
    }
    else if (padW == -2) {  // horizontal full padding
        padW = kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padW must be >= -2");
        %(fail)s
    }
    if (padD == -1) {  // depth half padding
        padD = kD / 2;
    }
    else if (padD == -2) {  // depth full padding
        // bug fix: this branch used to clobber padW with kW - 1 instead
        // of setting padD, leaving padD == -2 for the kernel call
        padD = kD - 1;
    }
    else if (padD < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padD must be >= -2");
        %(fail)s
    }

    // Infer output shape
    int out_dim[5];
    switch(direction) {
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width, depth)
        // height, width and depth: top = (bottom + 2*pad - weight) / sample + 1
        out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
        out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
        out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
        out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
        out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - CudaNdarray_HOST_DIMS(weights)[4]) / dD + 1;
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width, depth)
        // height, width and depth: weights = bottom + 2*pad - (top-1) * sample
        out_dim[0] = CudaNdarray_HOST_DIMS(top)[1];
        out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
        out_dim[2] = kH;  // already inferred further above
        out_dim[3] = kW;  // how convenient
        out_dim[4] = kD;
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width, depth)
        // height, width and depth: bottom = (top-1) * sample + weights - 2*pad
        out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
        out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
        out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
        out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
        out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + CudaNdarray_HOST_DIMS(weights)[4] - 2*padD;
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array (reuse it only if it already has the right shape)
    if (!(%(out)s
          && %(out)s->nd == 5
          && CudaNdarray_is_c_contiguous(%(out)s)
          && CudaNdarray_HOST_DIMS(%(out)s)[0] == out_dim[0]
          && CudaNdarray_HOST_DIMS(%(out)s)[1] == out_dim[1]
          && CudaNdarray_HOST_DIMS(%(out)s)[2] == out_dim[2]
          && CudaNdarray_HOST_DIMS(%(out)s)[3] == out_dim[3]
          && CudaNdarray_HOST_DIMS(%(out)s)[4] == out_dim[4]))
    {
        Py_XDECREF(%(out)s);
        %(out)s = (CudaNdarray*)CudaNdarray_NewDims(5, out_dim);
        if (NULL == %(out)s)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "BaseGpuCorr3dM: Failed to allocate output of %%d x %%d x %%d x %%d x %%d",
                         out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
            %(fail)s
        }
    }

    // Call CUDA code
    out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dD, padH, padW, padD);
    if (out2==NULL){
        %(fail)s
    }
    assert (out2 == %(out)s);
""" % sub
class GpuCorr3dMM(BaseGpuCorr3dMM):
    """GPU 3d correlation implementation using Matrix Multiplication.

    :warning: For 700 series Nvidia GPUs of compute capability 3.5 and CUDA 5.0
        to 6.0, there is a bug in CUBLAS' matrix multiplication function that
        can make GpuCorrMM or its gradients crash for some input and filter
        shapes. So if you have a Tesla K20, Tesla K40, Quadro K6000, GeForce GT
        640 (DDR5), GeForce GTX 780 (or Ti), GeForce GTX TITAN (or Black or Z)
        and experience a crash, switching to CUDA 6.5 or CUDA 4.2 should fix it.
        If this is not possible, changing the input or filter shapes (e.g., the
        batchsize or number of filters) may also work around the CUBLAS bug.
    """
    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 pad=(0, 0, 0)):
        """
        :param border_mode: currently supports "valid" only; "full" can be
            simulated by setting `pad="full"` (at the cost of performance), or
            by using `GpuCorrMM_gradInputs`
        :param subsample: the subsample operation applied to each output image.
            Should be a tuple with 3 elements.
            `(sv, sh, sl)` is equivalent to
            `GpuCorrMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster.
            Set to `(1, 1, 1)` to disable subsampling.
        :param pad: the width of a border of implicit zeros to pad the input
            image with. Should be a tuple with 3 elements giving the numbers of
            rows and columns to pad on each side, or "half" to set the padding
            to `(kernel_rows // 2, kernel_columns // 2, kernel_depth // 2)`,
            or "full" to set the padding to `(kernel_rows - 1,
            kernel_columns - 1, kernel_depth - 1)` at runtime.
            Set to `(0, 0, 0)` to disable padding.

        :note: Currently, the Op requires the inputs, filters and outputs to be
            C-contiguous. Use :func:`gpu_contiguous
            <theano.sandbox.cuda.basic_ops.gpu_contiguous>` on these arguments
            if needed.
        """
        super(GpuCorr3dMM, self).__init__(border_mode, subsample, pad)

    def make_node(self, img, kern):
        img = as_cuda_ndarray_variable(img)
        kern = as_cuda_ndarray_variable(kern)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False, False]
        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        top, = out_
        direction = "forward"
        return super(GpuCorr3dMM, self).c_code_helper(bottom, weights, top, direction, sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
        # The gradient ops need the last *three* spatial dimensions
        # (height, width, depth); [-2:] would only pass two of them and
        # break make_node's shape unpacking when it is actually used.
        d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(
            weights, top, bottom.shape[-3:])
        d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)(
            bottom, top, weights.shape[-3:])
        return d_bottom, d_weights
class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
    """Gradient wrt. filters for `GpuCorr3dMM`.

    :note: You will not want to use this directly, but rely on Theano's
        automatic differentiation or graph optimization to use it as needed."""
    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 pad=(0, 0, 0)):
        super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, subsample, pad)

    def make_node(self, img, topgrad, shape=None):
        img = as_cuda_ndarray_variable(img)
        topgrad = as_cuda_ndarray_variable(topgrad)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if self.subsample != (1, 1, 1) or self.pad == "half":
            # the filter shape cannot be inferred from the outputs in these
            # cases, so it must be supplied explicitly (height, width, depth)
            if shape is None:
                raise ValueError('shape must be given if subsample != (1, 1, 1), or pad == "half"')
            height_width_depth = [shape[0], shape[1], shape[2]]
        else:
            height_width_depth = []
        broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
                         False, False, False]
        # (removed a leftover debug `print` statement here; it was also
        # Python-2-only syntax)
        return Apply(self, [img, topgrad] + height_width_depth, [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        weights, = out_
        direction = "backprop weights"
        return super(GpuCorr3dMM_gradWeights, self).c_code_helper(bottom, weights, top, direction, sub, height, width, depth)

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
        # pass all three spatial dimensions ([-2:] would only pass two)
        d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(
            weights, top, bottom.shape[-3:])
        d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)(
            bottom, weights)
        d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
        return (d_bottom, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
    """Gradient wrt. inputs for `GpuCorr3dMM`.

    :note: You will not want to use this directly, but rely on Theano's
        automatic differentiation or graph optimization to use it as needed."""
    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 pad=(0, 0, 0)):
        super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample, pad)

    def make_node(self, kern, topgrad, shape=None):
        kern = as_cuda_ndarray_variable(kern)
        topgrad = as_cuda_ndarray_variable(topgrad)
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if self.subsample != (1, 1, 1) and shape is None:
            # with subsampling, the input shape cannot be inferred from the
            # output shape, so it must be supplied (height, width, depth)
            raise ValueError('shape must be given if subsample != (1, 1, 1)')
        height_width_depth = [shape[0], shape[1], shape[2]] if self.subsample != (1, 1, 1) else []
        broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
                         False, False, False]
        return Apply(self, [kern, topgrad] + height_width_depth, [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        bottom, = out_
        direction = "backprop inputs"
        return super(GpuCorr3dMM_gradInputs, self).c_code_helper(bottom, weights, top, direction, sub, height, width, depth)

    def grad(self, inp, grads):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
        # pass all three spatial dimensions ([-2:] would only pass two)
        d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)(
            bottom, top, weights.shape[-3:])
        d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)(
            bottom, weights)
        d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
        return (d_weights, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        # connection_pattern must have exactly one entry per input;
        # a 2-input node (kern, topgrad) previously returned 3 entries.
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
##
# Not really a BLAS operation, but whatever.
#
......
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef _GLIBCXX_ATOMIC_BUILTINS
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/caffe_common.hpp)
// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// CUDA: thread number configuration.
// Use 1024 threads per block, which requires cuda sm_2x or above,
// or fall back to attempt compatibility (best of luck to you).
#if __CUDA_ARCH__ >= 200
const int CUDA_NUM_THREADS = 1024;
#else
const int CUDA_NUM_THREADS = 512;
#endif
// CUDA: number of blocks for threads.
// CUDA: number of blocks needed to cover N threads,
// i.e. ceil(N / CUDA_NUM_THREADS) via integer arithmetic.
inline int GET_BLOCKS(const int N) {
  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy.
// Unfolds a multi-channel 3d image into a column matrix: each thread
// handles one output position (channel_in, h_out, w_out, d_out) and copies
// the kernel_h x kernel_w x kernel_d patch beneath it into one column of
// data_col, writing zeros wherever the patch reaches into the padding.
// n is the total number of such positions (channels * height_col *
// width_col * depth_col); grid-stride looping covers all of them.
__global__ void im3d2col_kernel(const int n, const float* data_im,
                                const int height, const int width,
                                const int depth,
                                const int kernel_h, const int kernel_w,
                                const int kernel_d,
                                const int pad_h, const int pad_w,
                                const int pad_d,
                                const int stride_h, const int stride_w,
                                const int stride_d,
                                const int height_col, const int width_col,
                                const int depth_col,
                                float* data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // decompose the flat index into (channel_in, h_out, w_out, d_out),
    // with d_out varying fastest (C order)
    int d_out = index % depth_col;
    int w_index = index / depth_col;
    int w_out = w_index % width_col;
    int h_index = w_index / width_col;
    int h_out = h_index % height_col;
    int channel_in = h_index / height_col;
    // each input channel expands to kernel_h*kernel_w*kernel_d output rows
    int channel_out = channel_in * kernel_h * kernel_w * kernel_d;
    // top-left-front corner of the patch in (possibly negative) image coords
    int h_in = h_out * stride_h - pad_h;
    int w_in = w_out * stride_w - pad_w;
    int d_in = d_out * stride_d - pad_d;
    // point data_col_ptr at row channel_out, column (h_out, w_out, d_out);
    // successive patch elements are height_col*width_col*depth_col apart
    float* data_col_ptr = data_col;
    data_col_ptr += channel_out * (height_col * width_col * depth_col) +
                    h_out * (width_col * depth_col) + w_out * depth_col + d_out;
    const float* data_im_ptr = data_im;
    data_im_ptr += channel_in * (height * width * depth) +
                   h_in * (width * depth) + w_in * depth + d_in;
    for (int i = 0; i < kernel_h; ++i)
    {
      int h = h_in + i;
      for (int j = 0; j < kernel_w; ++j)
      {
        int w = w_in + j;
        for (int k = 0; k < kernel_d; ++k)
        {
          int d = d_in + k;
          // zero-fill where the patch extends past the image border
          *data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
                           h < height && w < width && d < depth) ?
                          data_im_ptr[i * (width * depth) + j *depth + k] : 0;
          data_col_ptr += height_col * width_col * depth_col;
        }
      }
    }
  }
}
// Host-side launcher for im3d2col_kernel: unfolds a single image of
// `channels` channels into data_col. Output spatial sizes follow the
// standard convolution formula (size + 2*pad - kernel) / stride + 1.
// Note: errors from the kernel launch must be checked by the caller
// via cudaGetLastError().
void im3d2col(const float* data_im, const int channels,
              const int height, const int width, const int depth,
              const int kernel_h, const int kernel_w, const int kernel_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              float* data_col)
{
  // We are going to launch channels * height_col * width_col * depth_col
  // kernels, each kernel responsible for copying a single-channel grid.
  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
  int num_kernels = channels * height_col * width_col * depth_col;
  im3d2col_kernel<<<GET_BLOCKS(num_kernels),
                    CUDA_NUM_THREADS>>>(num_kernels, data_im,
                                        height, width, depth,
                                        kernel_h, kernel_w, kernel_d,
                                        pad_h, pad_w, pad_d,
                                        stride_h, stride_w, stride_d,
                                        height_col, width_col, depth_col,
                                        data_col);
}
// Inverse of im3d2col: folds the column matrix data_col back into the
// image data_im, summing overlapping contributions. Each thread owns one
// image element (c, h, w, d) and accumulates every column entry that was
// copied from it, which avoids atomic adds entirely.
__global__ void col2im3d_kernel(const int n, const float* data_col,
                                const int height, const int width,
                                const int depth, const int channels,
                                const int patch_h, const int patch_w,
                                const int patch_d, const int pad_h,
                                const int pad_w, const int pad_d,
                                const int stride_h, const int stride_w,
                                const int stride_d, const int height_col,
                                const int width_col, const int depth_col,
                                float* data_im)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    float val = 0;
    // decompose flat index into (c, h, w, d); h/w/d are shifted by the
    // padding so they index the virtual padded image
    int d = index % depth + pad_d;
    int w_index = index / depth;
    int w = w_index % width + pad_w;
    int h_index = w_index / width;
    int h = h_index % height + pad_h;
    int c = h_index / height;
    // compute the range of output positions whose patches cover (h, w, d)
    int d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
    int d_col_end = min(d / stride_d + 1, depth_col);
    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
    int w_col_end = min(w / stride_w + 1, width_col);
    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
    int h_col_end = min(h / stride_h + 1, height_col);
    // offset is the data_col index at (h_col, w_col, d_col) = (0, 0, 0);
    // the coeff_* terms are the per-step deltas along each axis, folding
    // together the change of patch-offset row and of column position
    int offset =
        (c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col;
    int coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
    int coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
    int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
    for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
        }
      }
    data_im[index] = val;
  }
}
// Host-side launcher for col2im3d_kernel: folds data_col back into a
// single image data_im of `channels` channels, accumulating overlaps.
// Note: errors from the kernel launch must be checked by the caller
// via cudaGetLastError().
void col2im3d(const float* data_col, const int channels,
              const int height, const int width, const int depth,
              const int patch_h, const int patch_w, const int patch_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              float* data_im)
{
  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
  int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
  int num_kernels = channels * height * width * depth;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.
  col2im3d_kernel<<<GET_BLOCKS(num_kernels),
                    CUDA_NUM_THREADS>>>(num_kernels, data_col,
                                        height, width, depth, channels,
                                        patch_h, patch_w, patch_d,
                                        pad_h, pad_w, pad_d,
                                        stride_h, stride_w, stride_d,
                                        height_col, width_col, depth_col,
                                        data_im);
}
// Theano op code
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
// Theano op code
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
//   and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d.
//
// Performs a 3d correlation (direction 0), its gradient wrt. weights
// (direction 1) or wrt. inputs (direction 2) via im2col/col2im plus a
// CUBLAS sgemm per batch item. All three arrays must be 5d C-contiguous
// CudaNdarrays with consistent shapes. Returns the array that received
// the result (an alias of bottom, weight or top — no refcount change),
// or NULL/0 with a Python exception set on error.
CudaNdarray* corr3dMM(CudaNdarray *const bottom,
                      CudaNdarray *const weight,
                      CudaNdarray *const top,
                      const int direction,
                      const int dH = 1,
                      const int dW = 1,
                      const int dD = 1,
                      const int padH = 0,
                      const int padW = 0,
                      const int padD = 0)  // bug fix: default was 1, inconsistent with padH/padW
{
    // Validate ranks and contiguity of all three operands
    if (bottom->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(bottom))
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM requires bottom to be C-contiguous, "
                     "but strides are: %d %d %d %d %d\n",
                     CudaNdarray_HOST_STRIDES(bottom)[0],
                     CudaNdarray_HOST_STRIDES(bottom)[1],
                     CudaNdarray_HOST_STRIDES(bottom)[2],
                     CudaNdarray_HOST_STRIDES(bottom)[3],
                     CudaNdarray_HOST_STRIDES(bottom)[4]);
        return 0;
    }
    if (weight->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
        return 0;
    }
    if (!CudaNdarray_is_c_contiguous(weight))
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM requires weight to be C-contiguous, "
                     "but strides are: %d %d %d %d %d\n",
                     CudaNdarray_HOST_STRIDES(weight)[0],
                     CudaNdarray_HOST_STRIDES(weight)[1],
                     CudaNdarray_HOST_STRIDES(weight)[2],
                     CudaNdarray_HOST_STRIDES(weight)[3],
                     CudaNdarray_HOST_STRIDES(weight)[4]);
        return 0;
    }
    if (top->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
        return 0;
    }
    if (!CudaNdarray_is_c_contiguous(top))
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM requires top to be C-contiguous, "
                     "but strides are: %d %d %d %d %d\n",
                     CudaNdarray_HOST_STRIDES(top)[0],
                     CudaNdarray_HOST_STRIDES(top)[1],
                     CudaNdarray_HOST_STRIDES(top)[2],
                     CudaNdarray_HOST_STRIDES(top)[3],
                     CudaNdarray_HOST_STRIDES(top)[4]);
        return 0;
    }

    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
    const int batchSize = CudaNdarray_HOST_DIMS(bottom)[0];
    const int nChannels = CudaNdarray_HOST_DIMS(bottom)[1];
    const int bottomHeight = CudaNdarray_HOST_DIMS(bottom)[2];
    const int bottomWidth = CudaNdarray_HOST_DIMS(bottom)[3];
    const int bottomDepth = CudaNdarray_HOST_DIMS(bottom)[4];
    // weights: (nFilters, nChannels, rows, columns, depth)
    const int nFilters = CudaNdarray_HOST_DIMS(weight)[0];
    const int kH = CudaNdarray_HOST_DIMS(weight)[2];
    const int kW = CudaNdarray_HOST_DIMS(weight)[3];
    const int kD = CudaNdarray_HOST_DIMS(weight)[4];
    if (nChannels != CudaNdarray_HOST_DIMS(weight)[1])
    {
        PyErr_SetString(PyExc_ValueError,
                        "GpuCorr3dMM images and kernel must have the same stack size\n");
        return 0;
    }
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
    const int topHeight = int((bottomHeight + 2*padH - kH) / dH) + 1;
    const int topWidth = int((bottomWidth + 2*padW - kW) / dW) + 1;
    const int topDepth = int((bottomDepth + 2*padD - kD) / dD) + 1;
    if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
        nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
        topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
        topWidth != CudaNdarray_HOST_DIMS(top)[3] ||
        topDepth != CudaNdarray_HOST_DIMS(top)[4])
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM shape inconsistency:\n"
                     "  bottom shape: %d %d %d %d %d\n"
                     "  weight shape: %d %d %d %d %d\n"
                     "  top shape: %d %d %d %d %d (expected %d %d %d %d %d)\n",
                     batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                     nFilters, nChannels, kH, kW, kD,
                     CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
                     CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3],
                     CudaNdarray_HOST_DIMS(top)[4],
                     batchSize, nFilters, topHeight, topWidth, topDepth);
        return 0;
    }

    // Create temporary columns buffer (one batch item at a time)
    int col_dim[2];
    col_dim[0] = nChannels * kW * kH * kD;
    col_dim[1] = topHeight * topWidth * topDepth;
    CudaNdarray* col = (CudaNdarray*) CudaNdarray_NewDims(2, col_dim);
    if (0 == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM failed to allocate working memory of %d x %d\n",
                     col_dim[0], col_dim[1]);
        return 0;
    }

    // Define some useful variables
    const int bottom_stride = CudaNdarray_HOST_STRIDES(bottom)[0];
    const int top_stride = CudaNdarray_HOST_STRIDES(top)[0];
    const int K_ = col_dim[0];
    const int N_ = col_dim[1];
    const int M_ = nFilters;
    const float one = 1.0f;
    const float zero = 0.0f;
    CudaNdarray *output;
    if (direction == 0)
    {   // forward pass: valid correlation, i.e. im2col then gemm.
        output = top;
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // First, im3d2col
            im3d2col(bottom->devdata + n * bottom_stride,
                     nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     padH, padW, padD,
                     dH, dW, dD,
                     col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUDA error in im2col: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cudaGetErrorString(err));
                Py_DECREF(col);  // bug fix: do not leak the columns buffer
                return 0;
            }
            // Second, gemm
            cublasStatus_t status = cublasSgemm(handle,
                                                CUBLAS_OP_N, CUBLAS_OP_N,
                                                N_, M_, K_,
                                                &one,
                                                col->devdata, N_,
                                                weight->devdata, K_,
                                                &zero,
                                                top->devdata + n * top_stride, N_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cublasGetErrorString(status));
                Py_DECREF(col);
                return 0;
            }
        }
    }
    else if (direction == 1)
    {   // backprop wrt. weights: valid convolution, i.e. im2col then gemm.
        output = weight;
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // First, im2col
            im3d2col(bottom->devdata + n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     padH, padW, padD,
                     dH, dW, dD,
                     col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUDA error in im2col: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cudaGetErrorString(err));
                Py_DECREF(col);
                return 0;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            cublasStatus_t status = cublasSgemm(handle,
                                                CUBLAS_OP_T, CUBLAS_OP_N,
                                                K_, M_, N_,
                                                &one,
                                                col->devdata, N_,
                                                top->devdata + n * top_stride, N_,
                                                (n == 0) ? &zero : &one,
                                                weight->devdata, K_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cublasGetErrorString(status));
                Py_DECREF(col);
                return 0;
            }
        }
    }
    else if (direction == 2)
    {   // backprop wrt. inputs: full convolution, i.e. gemm then col2im3d.
        output = bottom;
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // gemm into columns
            cublasStatus_t status = cublasSgemm(handle,
                                                CUBLAS_OP_N, CUBLAS_OP_T,
                                                N_, K_, M_,
                                                &one,
                                                top->devdata + n * top_stride, N_,
                                                weight->devdata, K_,
                                                &zero,
                                                col->devdata, N_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cublasGetErrorString(status));
                Py_DECREF(col);
                return 0;
            }
            // col2im3d back to the data
            col2im3d(col->devdata, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     padH, padW, padD,
                     dH, dW, dD, bottom->devdata + n * bottom_stride);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUDA error in col2im: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cudaGetErrorString(err));
                Py_DECREF(col);
                return 0;
            }
        }
    }
    else
    {   // bug fix: `output` was left uninitialized for an invalid direction
        PyErr_SetString(PyExc_ValueError,
                        "GpuCorr3dMM: direction must be 0, 1, or 2\n");
        Py_DECREF(col);
        return 0;
    }
    // Free temporary columns
    Py_DECREF(col);
    // Note that we don't change the refcount of the output matrix here. Output
    // allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
import unittest

import numpy

import theano
from theano.tests import unittest_tools as utt
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
    raise SkipTest('Optional package cuda not available')
from theano.sandbox.cuda import float32_shared_constructor as shared
# NOTE: the original line imported GpuCorr3dMM_gradInputs twice; the
# duplicate has been removed.
from theano.sandbox.cuda.blas import (GpuCorr3dMM, GpuCorr3dMM_gradWeights,
                                      GpuCorr3dMM_gradInputs)
from theano.sandbox.cuda.basic_ops import gpu_contiguous

# Build a GPU-enabled compilation mode; FAST_COMPILE lacks the GPU
# optimizations, so fall back to FAST_RUN in that case.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
class TestCorr3DMM(unittest.TestCase):
    """Compare the GEMM-based GpuCorr3dMM ops against the reference
    ``theano.tensor.nnet`` 3D convolution ops (conv3D, convGrad3D,
    convTransp3D)."""

    def run_conv_valid(self, inputs_shape, filters_shape,
                       subsample=(1, 1, 1)):
        """Check the GpuCorr3dMM forward pass against conv3D.

        ``inputs_shape`` / ``filters_shape`` are given in conv3D layout
        (batch, h, w, d, channels); GpuCorr3dMM expects
        (batch, channels, h, w, d), hence the dimshuffles below.
        """
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = shared(inputs_val)
        filters = shared(filters_val)
        # conv3D requires a bias; zeros make it equal to plain correlation.
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
        conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters,
                                             b=bias, d=subsample)
        conv = GpuCorr3dMM(border_mode="valid",
                           subsample=subsample)(
            inputs.dimshuffle(0, 4, 1, 2, 3),
            filters.dimshuffle(0, 4, 1, 2, 3))
        # Shuffle the GpuCorr3dMM output back to conv3D layout for comparison.
        conv = conv.dimshuffle(0, 2, 3, 4, 1)
        f_ref = theano.function([], conv_ref)
        f = theano.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res, rtol=1e-05, atol=1e-05)

    def test_valid(self):
        # One case per distinct subsampling pattern (the original file
        # repeated the (2, 2, 2) and (3, 3, 3) cases verbatim; the exact
        # duplicates have been dropped).
        self.run_conv_valid(inputs_shape=(16, 20, 32, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1))
        self.run_conv_valid(inputs_shape=(16, 20, 32, 15, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            subsample=(2, 2, 2))
        self.run_conv_valid(inputs_shape=(16, 20, 32, 15, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            subsample=(3, 3, 3))
        self.run_conv_valid(inputs_shape=(16, 20, 32, 15, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            subsample=(3, 2, 1))
        self.run_conv_valid(inputs_shape=(16, 20, 32, 15, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            subsample=(1, 2, 3))

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1, 1)):
        """Check GpuCorr3dMM_gradWeights against convGrad3D.

        ``dCdH_shape`` must be the output-gradient shape matching
        ``inputs_shape``, ``filters_shape`` and ``subsample``.
        """
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = shared(inputs_val)
        dCdH = shared(dCdH_val)
        filters = shared(filters_val)
        conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
                                             WShape=filters_shape,
                                             d=subsample)
        img = gpu_contiguous(inputs.dimshuffle(0, 4, 1, 2, 3))
        topgrad = gpu_contiguous(dCdH.dimshuffle(0, 4, 1, 2, 3))
        if subsample == (1, 1, 1):
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(img,
                                                                     topgrad)
        else:
            # With subsampling the filter shape is ambiguous and must be
            # given explicitly (spatial dims h, w, d).
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                img, topgrad, shape=filters.shape[1:4])
        conv_gemm = conv_gemm.dimshuffle(0, 2, 3, 4, 1)
        f_ref = theano.function([], conv)
        # Consistency fix: compile with the GPU mode like run_conv_valid
        # (the original omitted mode= here).
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res, rtol=1e-04, atol=1e-04)

    def test_gradweight(self):
        self.run_gradweight(inputs_shape=(16, 20, 32, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 15, 21, 13, 10),
                            subsample=(1, 1, 1))
        self.run_gradweight(inputs_shape=(16, 20, 32, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 8, 11, 7, 10),
                            subsample=(2, 2, 2))
        self.run_gradweight(inputs_shape=(16, 20, 32, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 5, 7, 5, 10),
                            subsample=(3, 3, 3))
        self.run_gradweight(inputs_shape=(16, 20, 32, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 8, 21, 5, 10),
                            subsample=(2, 1, 3))

    def run_gradinput(self, inputs_shape, filters_shape,
                      subsample=(1, 1, 1)):
        """Check GpuCorr3dMM_gradInputs against convTransp3D."""
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
        conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=subsample,
                                               H=inputs)
        f_ref = theano.function([], conv)
        res_ref = f_ref()
        # Recover the bottom (input-gradient) shape from the reference
        # implementation; only its shape is used below.
        bottom_shape = res_ref.shape
        bottom_val = numpy.random.random(bottom_shape).astype('float32')
        bottom = shared(bottom_val)
        weight = gpu_contiguous(filters.dimshuffle(0, 4, 1, 2, 3))
        top = gpu_contiguous(inputs.dimshuffle(0, 4, 1, 2, 3))
        if subsample == (1, 1, 1):
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=weight, topgrad=top)
        else:
            # With subsampling the input shape is ambiguous and must be
            # given explicitly (spatial dims h, w, d).
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=weight, topgrad=top, shape=bottom.shape[1:4])
        conv_gemm = conv_gemm.dimshuffle(0, 2, 3, 4, 1)
        # Consistency fix: compile with the GPU mode like run_conv_valid
        # (the original omitted mode= here).
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res = f()
        utt.assert_allclose(res_ref, res, rtol=1e-04, atol=1e-04)

    def test_gradinput(self):
        self.run_gradinput(inputs_shape=(16, 15, 21, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1))
        self.run_gradinput(inputs_shape=(16, 15, 21, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1),
                           subsample=(2, 2, 2))
        self.run_gradinput(inputs_shape=(16, 15, 21, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1),
                           subsample=(3, 3, 3))
        self.run_gradinput(inputs_shape=(16, 15, 21, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1),
                           subsample=(3, 1, 2))
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论