提交 a725adf3 authored 作者: f0k's avatar f0k

Refactored GpuCorrMM to be split into separate ops for the forward pass and the two backward passes

上级 e76a29d9
......@@ -8,6 +8,7 @@ from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda import as_cuda_ndarray_variable
from theano.sandbox.cuda.basic_ops import gpu_contiguous
class GpuDot22(GpuOp):
......@@ -500,60 +501,22 @@ gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True)
class GpuCorrMM(GpuOp):
"""GPU correlation/convolution implementation using Matrix Multiplication.
class BaseGpuCorrMM(GpuOp):
"""Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
`GpuCorrMM_gradInputs`. Cannot be used directly."""
:note: It doesn't implement the grad. So you shouldn't use it directly, but
use :func:`conv2d <theano.tensor.nnet.conv.conv2d>` and then enable the
Theano flag ``optimizer_including=conv_gemm`` to automatically replace
all convolution operations with `GpuCorrMM`.
"""
def __init__(self, border_mode,
def __init__(self, border_mode="valid",
subsample=(1, 1),
pad=(0, 0)):
"""
:param border_mode: "valid" or "full"
:param subsample: the subsample operation applied to each output image.
Should be a tuple with 2 elements.
(sv, sh) is equivalent to GpuCorrMM(...)(...)[:,:,::sv, ::sh]
If border_mode="full", this is instead treated as an upsampling
operation applied to each input image.
Set to (1, 1) to disable downsampling/upsampling.
:param pad: the width of a border of implicit zeros to pad the input
image with. Should be a tuple with 2 elements giving the numbers of
rows and columns to pad on each side, or "auto" to set the padding
to (kernel_rows - 1, kernel_columns - 1) at runtime.
If border_mode="full", this is instead treated as the width of a
border to crop from the output image.
Set to (0, 0) to disable padding/cropping.
:note: The border_mode changes the meaning of several parameters.
If border_mode="valid", the Op does a valid correlation of a padded
input image and subsamples it. (To perform a convolution instead,
you will need to flip the kernels.)
If border_mode="full", the Op does a full convolution of an
upsampled input image and crops it. (This can be used as a backward
pass of the valid correlation done with border_mode="valid".)
Combined with pad="auto", you can use border_mode="valid" to
simulate a full correlation with subsampling, or border_mode="full"
to simulate a valid convolution with upsampling.
:note: Currently, the Op requires a very specific memory layout.
For border_mode="valid", inputs, filters and outputs must be
C-contiguous. For border_mode="full", the same applies, except that
the strides of the first two dimensions of the filters (output and
input channels) must be swapped compared to C-contiguity.
"""
if border_mode != "valid":
raise ValueError("border_mode must be 'valid'")
self.border_mode = border_mode
if len(subsample) != 2:
raise ValueError("subsample must have two elements")
self.subsample = subsample
#if (border_mode == "full") and (subsample != (1,1)):
# raise NotImplementedError(
# "GpuCorrMM doesn't support subsampling for border_mode='full'")
if (pad != "auto") and (len(pad) != 2):
raise ValueError("pad must be 'auto' or have two elements")
self.pad = pad
#if (border_mode == "full") and (pad != (0,0)):
# raise NotImplementedError(
# "GpuCorrMM doesn't support padding for border_mode='full'")
def __eq__(self, other):
return type(self) == type(other) \
......@@ -576,34 +539,19 @@ class GpuCorrMM(GpuOp):
str(self.subsample),
self.pad)
def make_node(self, img, kern):
img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def flops(self, inputs, outputs):
images, kerns = inputs
out, = outputs
assert images[1] == kerns[1]
flops = 0
if self.border_mode == "valid":
# nb mul and add by output pixel
flops = kerns[2] * kerns[3] * 2
# nb flops by output image
flops *= out[2] * out[3]
# nb patch multiplied
flops *= images[1] * kerns[0] * images[0]
else:
flops = (images[0] * kerns[0] * images[1] *
kerns[2] * kerns[3] *
images[2] * images[3] * 2)
def flops(self, inp, outp):
    """Approximate FLOP count for this Op, used by the profilemode hack
    to report MFlops.

    Provided the output shape is correct, the same count holds for any
    direction, subsampling, padding, and border mode.
    """
    (images, filters), (out,) = inp, outp
    # Input channels of the images must match those of the filters.
    assert images[1] == filters[1]
    # Two flops (one multiply, one add) per filter element, per output pixel.
    per_pixel = 2 * filters[2] * filters[3]
    # Scale by the number of output pixels per image...
    per_image = per_pixel * out[2] * out[3]
    # ...and by the number of (batch item, filter, channel) patches multiplied.
    return per_image * images[1] * filters[0] * images[0]
def c_headers(self):
......@@ -621,61 +569,98 @@ class GpuCorrMM(GpuOp):
for f in files]
return reduce(str.__add__, codes)
def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp
out, = out_
dx = self.subsample[0]
dy = self.subsample[1]
def c_code(self, bottom, weights, top, direction, sub):
# This is the shared code for GpuCorrMM (direction="forward"),
# GpuCorrMM_gradWeights (direction="backprop weights"), and
# GpuCorrMM_gradInputs (direction="backprop inputs").
# Depending on the direction, one of bottom, weights, top will
# receive the output, while the other two serve as inputs.
if self.border_mode != "valid":
raise ValueError("mode must be 'valid'")
dH, dW = self.subsample
if self.pad == "auto":
padH = padW = -1
else:
padH = self.pad[0]
padW = self.pad[1]
if self.border_mode == "valid":
bmode = 1
elif self.border_mode == "full":
bmode = 0
else:
raise ValueError("mode must be one of 'full' or 'valid'")
padH, padW = self.pad
if direction == "forward":
direction = 0
out = top
elif direction == "backprop weights":
direction = 1
out = weights
elif direction == "backprop inputs":
direction = 2
out = bottom
sub = sub.copy()
sub.update(locals())
return """
//Mandatory args
int mode = %(bmode)s;
// Mandatory args
int direction = %(direction)s; // forward, bprop weights, bprop inputs
//Optional args
int dx = %(dx)s;
int dy = %(dy)s;
// Optional args
int dH = %(dH)s;
int dW = %(dW)s;
int padH = %(padH)s;
int padW = %(padW)s;
CudaNdarray * img = %(img)s;
CudaNdarray * kern = %(kern)s;
CudaNdarray * bottom = %(bottom)s;
CudaNdarray * weights = %(weights)s;
CudaNdarray * top = %(top)s;
CudaNdarray * out2 = NULL;
//Auto-padding if requested
// Obtain or infer kernel width and height
int kH, kW;
if (direction != 1) {
kH = CudaNdarray_HOST_DIMS(weights)[2];
kW = CudaNdarray_HOST_DIMS(weights)[3];
}
else {
kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
}
// Auto-padding if requested
if (padH < 0) {
padH = CudaNdarray_HOST_DIMS(kern)[2] - 1;
padH = kH - 1;
}
if (padW < 0) {
padW = CudaNdarray_HOST_DIMS(kern)[3] - 1;
padW = kW - 1;
}
// Infer output shape
int out_dim[4];
out_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(kern)[0];
if (mode == 1) // valid correlation with padding and subsampling
{
out_dim[2] = ceil_intdiv(CudaNdarray_HOST_DIMS(img)[2] + 2*padH - CudaNdarray_HOST_DIMS(kern)[2] + 1, dx);
out_dim[3] = ceil_intdiv(CudaNdarray_HOST_DIMS(img)[3] + 2*padW - CudaNdarray_HOST_DIMS(kern)[3] + 1, dy);
}
else // full convolution with upsampling and cropping
{
out_dim[2] = (CudaNdarray_HOST_DIMS(img)[2] - 1) * dx + CudaNdarray_HOST_DIMS(kern)[2] - 2*padH;
out_dim[3] = (CudaNdarray_HOST_DIMS(img)[3] - 1) * dy + CudaNdarray_HOST_DIMS(kern)[3] - 2*padW;
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - weight) / sample + 1
out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = bottom + 2*pad - (top - 1) * sample
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[2] = kH; // already inferred further above
out_dim[3] = kW; // how convenient
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + weights - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
out_dim[3] = (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
%(fail)s
}
// Prepare output array
if ( !(%(out)s
&& %(out)s->nd==4
&& CudaNdarray_is_c_contiguous(%(out)s)
......@@ -688,7 +673,8 @@ class GpuCorrMM(GpuOp):
%(out)s = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
}
out2 = corrMM(%(img)s, %(kern)s, %(out)s, mode, dx, dy, padH, padW);
// Call CUDA code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, padH, padW);
if (out2==NULL){
%(fail)s
}
......@@ -697,6 +683,132 @@ class GpuCorrMM(GpuOp):
""" % sub
class GpuCorrMM(BaseGpuCorrMM):
    """GPU correlation implementation using Matrix Multiplication.

    :note: You can either enable the Theano flag `optimizer_including=conv_gemm`
        to automatically replace all convolution operations with `GpuCorrMM`
        or one of its gradients, or you can use it as a replacement for
        :func:`conv2d <theano.tensor.nnet.conv.conv2d>`, called as
        `GpuCorrMM(subsample=...)(image, filters)`. The latter is currently
        faster, but note that it computes a correlation -- if you need to
        compute a convolution, flip the filters as `filters[:,:,::-1,::-1]`.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 pad=(0, 0)):
        """
        :param border_mode: currently supports "valid" only; "full" can be
            simulated by setting `pad="auto"` (at the cost of performance), or
            by using `GpuCorrMM_gradInputs`
        :param subsample: the subsample operation applied to each output image.
            Should be a tuple with 2 elements.
            `(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`,
            but faster.
            Set to `(1, 1)` to disable subsampling.
        :param pad: the width of a border of implicit zeros to pad the input
            image with. Should be a tuple with 2 elements giving the numbers of
            rows and columns to pad on each side, or "auto" to set the padding
            to `(kernel_rows - 1, kernel_columns - 1)` at runtime.
            Set to `(0, 0)` to disable padding.

        :note: Currently, the Op requires the inputs, filters and outputs to be
            C-contiguous. Use :func:`gpu_contiguous
            <theano.sandbox.cuda.basic_ops.gpu_contiguous>` on these arguments
            if needed.
        """
        # Validation of the parameters happens in the base class.
        super(GpuCorrMM, self).__init__(border_mode, subsample, pad)

    def make_node(self, img, kern):
        # Both inputs must be 4D CudaNdarray variables on the GPU.
        img = as_cuda_ndarray_variable(img)
        kern = as_cuda_ndarray_variable(kern)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        # Output is (batch, num_filters, height, width): the batch axis
        # inherits broadcastability from the image, the filter axis from
        # the kernel; spatial axes are never broadcastable.
        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False]
        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        top, = out_
        # Delegate to the shared C code generator in BaseGpuCorrMM,
        # selecting the forward pass.
        direction = "forward"
        return super(GpuCorrMM, self).c_code(bottom, weights, top, direction, sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        # The gradient Ops require a C-contiguous output gradient.
        top = gpu_contiguous(top)
        # Each backward pass is its own Op, parameterized identically.
        d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample, self.pad)(
            weights, top)
        d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample, self.pad)(
            bottom, top)
        return d_bottom, d_weights
class GpuCorrMM_gradWeights(BaseGpuCorrMM):
    """Gradient wrt. filters for `GpuCorrMM`.

    :note: You will not want to use this directly, but rely on Theano's
        automatic differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 pad=(0, 0)):
        # Parameters mirror GpuCorrMM; validation happens in the base class.
        super(GpuCorrMM_gradWeights, self).__init__(border_mode, subsample, pad)

    def make_node(self, img, topgrad):
        # Inputs: the forward pass's image and the gradient wrt. its output,
        # both as 4D CudaNdarray variables on the GPU.
        img = as_cuda_ndarray_variable(img)
        topgrad = as_cuda_ndarray_variable(topgrad)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        # Output is the filter gradient (num_filters, num_channels, kH, kW):
        # the filter axis inherits broadcastability from topgrad's channel
        # axis, the channel axis from the image's channel axis.
        broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
                        False, False]
        return Apply(self, [img, topgrad], [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp
        weights, = out_
        # Delegate to the shared C code generator in BaseGpuCorrMM,
        # selecting the backward pass wrt. the filters.
        direction = "backprop weights"
        return super(GpuCorrMM_gradWeights, self).c_code(bottom, weights, top, direction, sub)
class GpuCorrMM_gradInputs(BaseGpuCorrMM):
    """Gradient wrt. inputs for `GpuCorrMM`.

    :note: You will not want to use this directly, but rely on Theano's
        automatic differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 pad=(0, 0)):
        # Parameters mirror GpuCorrMM; validation happens in the base class.
        super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, pad)

    def make_node(self, kern, topgrad):
        # Inputs: the forward pass's filters and the gradient wrt. its
        # output, both as 4D CudaNdarray variables on the GPU.
        kern = as_cuda_ndarray_variable(kern)
        topgrad = as_cuda_ndarray_variable(topgrad)
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        # Output is the input gradient (batch, num_channels, height, width):
        # the batch axis inherits broadcastability from topgrad, the channel
        # axis from the kernel's input-channel axis.
        broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
                        False, False]
        return Apply(self, [kern, topgrad], [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp
        bottom, = out_
        # Delegate to the shared C code generator in BaseGpuCorrMM,
        # selecting the backward pass wrt. the inputs.
        direction = "backprop inputs"
        return super(GpuCorrMM_gradInputs, self).c_code(bottom, weights, top, direction, sub)
##
# Not really a BLAS operation, but whatever.
#
......
......@@ -161,18 +161,18 @@ void col2im(const float* data_col, const int channels,
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
CudaNdarray* corrMM(const CudaNdarray *input,
CudaNdarray *weight,
CudaNdarray *output,
int mode,
int dH = 1,
int dW = 1,
int padH = 0,
int padW = 0)
CudaNdarray* corrMM(CudaNdarray *const bottom,
CudaNdarray *const weight,
CudaNdarray *const top,
const int direction,
const int dH = 1,
const int dW = 1,
const int padH = 0,
const int padW = 0)
{
if (input->nd != 4)
if (bottom->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires input of 4D");
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D");
}
if (weight->nd != 4)
......@@ -180,83 +180,75 @@ CudaNdarray* corrMM(const CudaNdarray *input,
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
}
if (output->nd != 4)
if (top->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires output of 4D");
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D");
}
// Extract some shape information for later and check shape consistency
// inputs: (batchSize, nInputPlane, inputHeight, inputWidth)
const int batchSize = CudaNdarray_HOST_DIMS(input)[0];
const int nInputPlane = CudaNdarray_HOST_DIMS(input)[1];
const int inputHeight = CudaNdarray_HOST_DIMS(input)[2];
const int inputWidth = CudaNdarray_HOST_DIMS(input)[3];
// filters: (nOutputPlane, nInputPlane, rows, columns)
const int nOutputPlane = CudaNdarray_HOST_DIMS(weight)[0];
// bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
const int batchSize = CudaNdarray_HOST_DIMS(bottom)[0];
const int nChannels = CudaNdarray_HOST_DIMS(bottom)[1];
const int bottomHeight = CudaNdarray_HOST_DIMS(bottom)[2];
const int bottomWidth = CudaNdarray_HOST_DIMS(bottom)[3];
// weights: (nFilters, nChannels, rows, columns)
const int nFilters = CudaNdarray_HOST_DIMS(weight)[0];
const int kH = CudaNdarray_HOST_DIMS(weight)[2];
const int kW = CudaNdarray_HOST_DIMS(weight)[3];
if (nInputPlane != CudaNdarray_HOST_DIMS(weight)[1]) {
if (nChannels != CudaNdarray_HOST_DIMS(weight)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuCorrMM images and kernel must have the same stack size\n");
return NULL;
}
// outputs: (batchSize, nOutputPlane, outputHeight, outputWidth)
int outputHeight, outputWidth;
if (mode == 1) { // valid correlation with padding and subsampling
outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
}
else if (mode == 0) { // full convolution with upsampling and cropping
// these would be the shapes for a standard full convolution:
//outputHeight = (inputHeight + 2*padH + kH - 2) / dH + 1;
//outputWidth = (inputWidth + 2*padW + kW - 2) / dW + 1;
// but here, dH and dW are *upsampling* factors, and padding is reversed
// (because the implementation was meant as a backward pass for a CNN)
outputHeight = (inputHeight - 1) * dH + kH - 2*padH;
outputWidth = (inputWidth - 1) * dW + kW - 2*padW;
}
if (batchSize != CudaNdarray_HOST_DIMS(output)[0] ||
nOutputPlane != CudaNdarray_HOST_DIMS(output)[1] ||
outputHeight != CudaNdarray_HOST_DIMS(output)[2] ||
outputWidth != CudaNdarray_HOST_DIMS(output)[3]) {
// top: (batchSize, nFilters, topHeight, topWidth)
const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - kW) / dW + 1;
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
topWidth != CudaNdarray_HOST_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM output parameter has wrong shape %d %d %d %d, expected %d %d %d %d\n",
CudaNdarray_HOST_DIMS(output)[0], CudaNdarray_HOST_DIMS(output)[1],
CudaNdarray_HOST_DIMS(output)[2], CudaNdarray_HOST_DIMS(output)[3],
batchSize, nOutputPlane, outputHeight, outputWidth);
"GpuCorrMM shape inconsistency: From bottom and weights, "
"top shape should be %d %d %d %d, but is %d %d %d %d.\n",
batchSize, nFilters, topHeight, topWidth,
CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3]);
return NULL;
}
if (mode == 1) { // valid correlation: im2col, then gemm
// Create temporary columns (col_data)
int col_dim[2];
col_dim[0] = nInputPlane * kW * kH;
col_dim[1] = outputHeight * outputWidth;
CudaNdarray* col_data = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
// Create temporary columns
int col_dim[2];
col_dim[0] = nChannels * kW * kH;
col_dim[1] = topHeight * topWidth;
CudaNdarray* col = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
// Define some useful variables
const int bottom_stride = CudaNdarray_HOST_STRIDES(bottom)[0];
const int top_stride = CudaNdarray_HOST_STRIDES(top)[0];
const int K_ = col_dim[0];
const int N_ = col_dim[1];
const int M_ = nFilters;
const float one = 1.0f;
const float zero = 0.0f;
// Define some useful variables
const int ip_stride = CudaNdarray_HOST_STRIDES(input)[0];
const int op_stride = CudaNdarray_HOST_STRIDES(output)[0];
const int K_ = col_dim[0];
const int N_ = col_dim[1];
const int M_ = nOutputPlane;
const float alpha = 1.0f;
const float beta = 0.0f;
CudaNdarray *output;
if (direction == 0) { // forward pass
output = top;
// valid correlation: im2col, then gemm
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col(input->devdata + n * ip_stride, nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, col_data->devdata);
im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
// Second, gemm
cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_N, CUBLAS_OP_N,
N_, M_, K_,
&alpha,
col_data->devdata, N_,
&one,
col->devdata, N_,
weight->devdata, K_,
&beta,
output->devdata + n * op_stride, N_);
&zero,
top->devdata + n * top_stride, N_);
if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n",
......@@ -264,17 +256,11 @@ CudaNdarray* corrMM(const CudaNdarray *input,
return NULL;
}
}
// Free temporary columns
Py_DECREF(col_data);
/*
// Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is for grouped convolution; we can ignore groups
const Dtype* bottom_data = bottom[i]->gpu_data();
Dtype* top_data = (*top)[i]->mutable_gpu_data();
Dtype* col_data = col_buffer_.mutable_gpu_data();
const Dtype* weight = this->blobs_[0]->gpu_data();
// Note that this is for grouped convolution; we can ignore groups here,
// but the group-related offsets help explain what M_, N_ and K_ are
int weight_offset = M_ * K_;
int col_offset = K_ * N_;
int top_offset = M_ * N_;
......@@ -300,33 +286,81 @@ CudaNdarray* corrMM(const CudaNdarray *input,
}
*/
}
else if (mode == 0) { // full convolution: gemm, then col2im
// Create temporary columns (col_diff)
int col_dim[2];
col_dim[0] = nOutputPlane * kW * kH;
col_dim[1] = inputHeight * inputWidth;
CudaNdarray* col_diff = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
// Define some useful variables
const int ip_stride = CudaNdarray_HOST_STRIDES(input)[0];
const int op_stride = CudaNdarray_HOST_STRIDES(output)[0];
const int K_ = col_dim[0];
const int N_ = col_dim[1];
const int M_ = nInputPlane;
const float alpha = 1.0f;
const float beta = 0.0f;
else if (direction == 1) { // backprop wrt. weights
output = weight;
// valid convolution: im2col, then gemm
// Initialize target with zeros as we will accumulate into it
// (all kernels run on the null stream, so we don't need to synchronize)
cudaError_t err = cudaMemsetAsync(weight->devdata, 0,
sizeof(float) * M_ * K_);
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUDA error: %s\n",
cudaGetErrorString(err));
return NULL;
}
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
// Second, gemm
cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_T, CUBLAS_OP_N,
K_, M_, N_,
&one,
col->devdata, N_,
top->devdata + n * top_stride, N_,
&one,
weight->devdata, K_);
if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n",
cublasGetErrorString(status));
return NULL;
}
}
/*
// Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is for grouped convolution; we can ignore groups
for (int n = 0; n < num_; ++n) {
// Since we saved memory in the forward pass by not storing all col
// data, we will need to recompute them.
im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, col_data);
// gradient w.r.t. weight. Note that we will accumulate diffs.
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
(Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
col_data + col_offset * g, (Dtype)1.,
weight_diff + weight_offset * g);
== (see https://github.com/BVLC/caffe/blob/master/src/caffe/util/math_functions.cu#L16)
cublasSgemm(CUBLAS_OP_T, CUBLAS_OP_N, K_, M_, N_,
1.0,
col_data + col_offset * g, N_,
top_diff + top[i]->offset(n) + top_offset * g, N_,
1.0,
weight_diff + weight_offset * g, K_);
}
}
*/
}
else if (direction == 2) { // backprop wrt. inputs
output = bottom;
// full convolution: gemm, then col2im
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
// gemm into columns
cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_N, CUBLAS_OP_T,
N_, K_, M_,
&alpha,
input->devdata + n * ip_stride, N_,
&one,
top->devdata + n * top_stride, N_,
weight->devdata, K_,
&beta,
col_diff->devdata, N_);
&zero,
col->devdata, N_);
if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n",
......@@ -334,22 +368,15 @@ CudaNdarray* corrMM(const CudaNdarray *input,
return NULL;
}
// col2im back to the data
col2im(col_diff->devdata, nOutputPlane, outputHeight, outputWidth,
kH, kW, padH, padW, dH, dW, output->devdata + n * op_stride);
col2im(col->devdata, nChannels, bottomHeight, bottomWidth,
kH, kW, padH, padW, dH, dW, bottom->devdata + n * bottom_stride);
}
// Free temporary columns
Py_DECREF(col_diff);
/*
// Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is the backward pass of a valid convolution, so
// top_diff is the input, bottom_diff is the output, weights are weights
Dtype* col_data = col_buffer_.mutable_gpu_data();
Dtype* col_diff = col_buffer_.mutable_gpu_diff();
Dtype* bottom_diff = (*bottom)[i]->mutable_gpu_diff();
for (int n = 0; n < num_; ++n) {
// gradient w.r.t. bottom data, if necessary
// gradient w.r.t. bottom data, if necessary
if (propagate_down[i]) {
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
(Dtype)1., weight + weight_offset * g,
......@@ -367,9 +394,13 @@ CudaNdarray* corrMM(const CudaNdarray *input,
col2im_gpu(col_diff, channels_, height_, width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
bottom_diff + (*bottom)[i]->offset(n));
}
}
*/
}
// Free temporary columns
Py_DECREF(col);
return output;
}
......@@ -25,7 +25,8 @@ from theano.sandbox.cuda.basic_ops import (
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape)
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, GpuCorrMM)
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
......@@ -1354,19 +1355,23 @@ def local_conv_gemm(node):
border_mode = node.op.border_mode
subsample = node.op.subsample
pad = (0,0)
if (border_mode == 'full') and ((subsample != (1,1)) or (pad != (0,0))):
if (border_mode == 'full') and (subsample != (1,1)):
# need to simulate this via a padded valid convolution
pad = 'auto'
border_mode = 'valid'
if (border_mode == 'valid'):
# need to flip the kernel for valid convolution
kern = gpu_contiguous(kern[:, :, ::-1, ::-1])
kern = kern[:, :, ::-1, ::-1]
# call GpuCorrMM
# TODO: call GpuCorrMM_gradWeights instead if appropriate
return [GpuCorrMM('valid', subsample, pad)(
gpu_contiguous(img), gpu_contiguous(kern))]
elif (border_mode == 'full'):
# need to bring kernel into correct memory layout for full convolution
kern = gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)).dimshuffle(1, 0, 2, 3)
# need C-contiguous inputs
img = gpu_contiguous(img)
return [GpuCorrMM(border_mode, subsample, pad)(img, kern)]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs
return [GpuCorrMM_gradInputs('valid', subsample, pad)(
gpu_contiguous(kern), gpu_contiguous(img))]
gpu_optimizer.register("conv_gemm", local_conv_gemm)
......
......@@ -186,7 +186,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
f = theano.function([i, k], op, mode=theano_mode)
if cls is not None:
assert any([isinstance(node.op, cls)
for node in f.maker.fgraph.toposort()]), f.maker.fgraph.toposort()
for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
gpuval = f(img, kern)
t2 = time.time()
for i in range(nb_iter):
......@@ -284,7 +284,7 @@ def exec_conv(version, shapes, verbose, random, mode,
cls=cls)
except Exception, e:
print ver, id, (ishape, kshape, subshape, istride, kstride)
print e
print "Exception", type(e), e
pass
if not ret:
failed_version.add(ver)
......@@ -634,7 +634,7 @@ def test_valid(conv_gemm=False):
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
# Add tests with strided inputs by still square images and filters.
......@@ -713,7 +713,7 @@ def test_full(conv_gemm=False):
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
else:
......@@ -753,7 +753,7 @@ def test_subsample(conv_gemm=False):
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version_valid = version_full = [-1]
else:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论