提交 a725adf3 authored 作者: f0k's avatar f0k

Refactored GpuCorrMM to be split into separate ops for the forward pass and the two backward passes

上级 e76a29d9
...@@ -8,6 +8,7 @@ from theano.compat.six import StringIO ...@@ -8,6 +8,7 @@ from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda import as_cuda_ndarray_variable from theano.sandbox.cuda import as_cuda_ndarray_variable
from theano.sandbox.cuda.basic_ops import gpu_contiguous
class GpuDot22(GpuOp): class GpuDot22(GpuOp):
...@@ -500,60 +501,22 @@ gpu_ger_no_inplace = GpuGer(inplace=False) ...@@ -500,60 +501,22 @@ gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True) gpu_ger_inplace = GpuGer(inplace=True)
class GpuCorrMM(GpuOp): class BaseGpuCorrMM(GpuOp):
"""GPU correlation/convolution implementation using Matrix Multiplication. """Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
`GpuCorrMM_gradInputs`. Cannot be used directly."""
:note: It doesn't implement the grad. So you shouldn't use it directly, but def __init__(self, border_mode="valid",
use :func:`conv2d <theano.tensor.nnet.conv.conv2d>` and then enable the
Theano flag ``optimizer_including=conv_gemm`` to automatically replace
all convolution operations with `GpuCorrMM`.
"""
def __init__(self, border_mode,
subsample=(1, 1), subsample=(1, 1),
pad=(0, 0)): pad=(0, 0)):
""" if border_mode != "valid":
:param border_mode: "valid" or "full" raise ValueError("border_mode must be 'valid'")
:param subsample: the subsample operation applied to each output image.
Should be a tuple with 2 elements.
(sv, sh) is equivalent to GpuCorrMM(...)(...)[:,:,::sv, ::sh]
If border_mode="full", this is instead treated as an upsampling
operation applied to each input image.
Set to (1, 1) to disable downsampling/upsampling.
:param pad: the width of a border of implicit zeros to pad the input
image with. Should be a tuple with 2 elements giving the numbers of
rows and columns to pad on each side, or "auto" to set the padding
to (kernel_rows - 1, kernel_columns - 1) at runtime.
If border_mode="full", this is instead treated as the width of a
border to crop from the output image.
Set to (0, 0) to disable padding/cropping.
:note: The border_mode changes the meaning of several parameters.
If border_mode="valid", the Op does a valid correlation of a padded
input image and subsamples it. (To perform a convolution instead,
you will need to flip the kernels.)
If border_mode="full", the Op does a full convolution of an
upsampled input image and crops it. (This can be used as a backward
pass of the valid correlation done with border_mode="valid".)
Combined with pad="auto", you can use border_mode="valid" to
simulate a full correlation with subsampling, or border_mode="full"
to simulate a valid convolution with upsampling.
:note: Currently, the Op requires a very specific memory layout.
For border_mode="valid", inputs, filters and outputs must be
C-contiguous. For border_mode="full", the same applies, except that
the strides of the first two dimensions of the filters (output and
input channels) must be swapped compared to C-contiguity.
"""
self.border_mode = border_mode self.border_mode = border_mode
if len(subsample) != 2:
raise ValueError("subsample must have two elements")
self.subsample = subsample self.subsample = subsample
#if (border_mode == "full") and (subsample != (1,1)): if (pad != "auto") and (len(pad) != 2):
# raise NotImplementedError( raise ValueError("pad must be 'auto' or have two elements")
# "GpuCorrMM doesn't support subsampling for border_mode='full'")
self.pad = pad self.pad = pad
#if (border_mode == "full") and (pad != (0,0)):
# raise NotImplementedError(
# "GpuCorrMM doesn't support padding for border_mode='full'")
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) \ return type(self) == type(other) \
...@@ -576,34 +539,19 @@ class GpuCorrMM(GpuOp): ...@@ -576,34 +539,19 @@ class GpuCorrMM(GpuOp):
str(self.subsample), str(self.subsample),
self.pad) self.pad)
def make_node(self, img, kern): def flops(self, inp, outp):
img = as_cuda_ndarray_variable(img) """ Useful with the hack in profilemode to print the MFlops"""
kern = as_cuda_ndarray_variable(kern) # if the output shape is correct, then this gives the correct
if img.type.ndim != 4: # flops for any direction, sampling, padding, and border mode
raise TypeError('img must be 4D tensor') inputs, filters = inp
if kern.type.ndim != 4: outputs, = outp
raise TypeError('kern must be 4D tensor') assert inputs[1] == filters[1]
# nb mul and add by output pixel
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0], flops = filters[2] * filters[3] * 2
False, False] # nb flops by output image
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()]) flops *= outputs[2] * outputs[3]
# nb patch multiplied
def flops(self, inputs, outputs): flops *= inputs[1] * filters[0] * inputs[0]
images, kerns = inputs
out, = outputs
assert images[1] == kerns[1]
flops = 0
if self.border_mode == "valid":
# nb mul and add by output pixel
flops = kerns[2] * kerns[3] * 2
# nb flops by output image
flops *= out[2] * out[3]
# nb patch multiplied
flops *= images[1] * kerns[0] * images[0]
else:
flops = (images[0] * kerns[0] * images[1] *
kerns[2] * kerns[3] *
images[2] * images[3] * 2)
return flops return flops
def c_headers(self): def c_headers(self):
...@@ -621,61 +569,98 @@ class GpuCorrMM(GpuOp): ...@@ -621,61 +569,98 @@ class GpuCorrMM(GpuOp):
for f in files] for f in files]
return reduce(str.__add__, codes) return reduce(str.__add__, codes)
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, bottom, weights, top, direction, sub):
img, kern = inp # This is the shared code for GpuCorrMM (direction="forward"),
out, = out_ # GpuCorrMM_gradWeights (direction="backprop weights"), and
dx = self.subsample[0] # GpuCorrMM_gradInputs (direction="backprop inputs").
dy = self.subsample[1] # Depending on the direction, one of bottom, weights, top will
# receive the output, while the other two serve as inputs.
if self.border_mode != "valid":
raise ValueError("mode must be 'valid'")
dH, dW = self.subsample
if self.pad == "auto": if self.pad == "auto":
padH = padW = -1 padH = padW = -1
else: else:
padH = self.pad[0] padH, padW = self.pad
padW = self.pad[1] if direction == "forward":
if self.border_mode == "valid": direction = 0
bmode = 1 out = top
elif self.border_mode == "full": elif direction == "backprop weights":
bmode = 0 direction = 1
else: out = weights
raise ValueError("mode must be one of 'full' or 'valid'") elif direction == "backprop inputs":
direction = 2
out = bottom
sub = sub.copy() sub = sub.copy()
sub.update(locals()) sub.update(locals())
return """ return """
//Mandatory args // Mandatory args
int mode = %(bmode)s; int direction = %(direction)s; // forward, bprop weights, bprop inputs
//Optional args // Optional args
int dx = %(dx)s; int dH = %(dH)s;
int dy = %(dy)s; int dW = %(dW)s;
int padH = %(padH)s; int padH = %(padH)s;
int padW = %(padW)s; int padW = %(padW)s;
CudaNdarray * img = %(img)s; CudaNdarray * bottom = %(bottom)s;
CudaNdarray * kern = %(kern)s; CudaNdarray * weights = %(weights)s;
CudaNdarray * top = %(top)s;
CudaNdarray * out2 = NULL; CudaNdarray * out2 = NULL;
//Auto-padding if requested // Obtain or infer kernel width and height
int kH, kW;
if (direction != 1) {
kH = CudaNdarray_HOST_DIMS(weights)[2];
kW = CudaNdarray_HOST_DIMS(weights)[3];
}
else {
kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
}
// Auto-padding if requested
if (padH < 0) { if (padH < 0) {
padH = CudaNdarray_HOST_DIMS(kern)[2] - 1; padH = kH - 1;
} }
if (padW < 0) { if (padW < 0) {
padW = CudaNdarray_HOST_DIMS(kern)[3] - 1; padW = kW - 1;
} }
// Infer output shape
int out_dim[4]; int out_dim[4];
out_dim[0] = CudaNdarray_HOST_DIMS(img)[0]; switch(direction) {
out_dim[1] = CudaNdarray_HOST_DIMS(kern)[0]; case 0: // forward pass
if (mode == 1) // valid correlation with padding and subsampling // output is top: (batchsize, num_filters, height, width)
{ // height and width: top = (bottom + 2*pad - weight) / sample + 1
out_dim[2] = ceil_intdiv(CudaNdarray_HOST_DIMS(img)[2] + 2*padH - CudaNdarray_HOST_DIMS(kern)[2] + 1, dx); out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[3] = ceil_intdiv(CudaNdarray_HOST_DIMS(img)[3] + 2*padW - CudaNdarray_HOST_DIMS(kern)[3] + 1, dy); out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
} out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
else // full convolution with upsampling and cropping out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
{ break;
out_dim[2] = (CudaNdarray_HOST_DIMS(img)[2] - 1) * dx + CudaNdarray_HOST_DIMS(kern)[2] - 2*padH; case 1: // backprop wrt. weights
out_dim[3] = (CudaNdarray_HOST_DIMS(img)[3] - 1) * dy + CudaNdarray_HOST_DIMS(kern)[3] - 2*padW; // output is weights: (num_filters, num_channels, height, width)
// height and width: weights = bottom + 2*pad - (top - 1) * sample
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[2] = kH; // already inferred further above
out_dim[3] = kW; // how convenient
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + weights - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
out_dim[3] = (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
%(fail)s
} }
// Prepare output array
if ( !(%(out)s if ( !(%(out)s
&& %(out)s->nd==4 && %(out)s->nd==4
&& CudaNdarray_is_c_contiguous(%(out)s) && CudaNdarray_is_c_contiguous(%(out)s)
...@@ -688,7 +673,8 @@ class GpuCorrMM(GpuOp): ...@@ -688,7 +673,8 @@ class GpuCorrMM(GpuOp):
%(out)s = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim); %(out)s = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
} }
out2 = corrMM(%(img)s, %(kern)s, %(out)s, mode, dx, dy, padH, padW); // Call CUDA code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, padH, padW);
if (out2==NULL){ if (out2==NULL){
%(fail)s %(fail)s
} }
...@@ -697,6 +683,132 @@ class GpuCorrMM(GpuOp): ...@@ -697,6 +683,132 @@ class GpuCorrMM(GpuOp):
""" % sub """ % sub
class GpuCorrMM(BaseGpuCorrMM):
"""GPU correlation implementation using Matrix Multiplication.
:note: You can either enable the Theano flag `optimizer_including=conv_gemm`
to automatically replace all convolution operations with `GpuCorrMM`
or one of its gradients, or you can use it as a replacement for
:func:`conv2d <theano.tensor.nnet.conv.conv2d>`, called as
`GpuCorrMM(subsample=...)(image, filters)`. The latter is currently
faster, but note that it computes a correlation -- if you need to
compute a convolution, flip the filters as `filters[:,:,::-1,::-1]`.
"""
def __init__(self, border_mode="valid",
subsample=(1, 1),
pad=(0, 0)):
"""
:param border_mode: currently supports "valid" only; "full" can be
simulated by setting `pad="auto"` (at the cost of performance), or
by using `GpuCorrMM_gradInputs`
:param subsample: the subsample operation applied to each output image.
Should be a tuple with 2 elements.
`(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`,
but faster.
Set to `(1, 1)` to disable subsampling.
:param pad: the width of a border of implicit zeros to pad the input
image with. Should be a tuple with 2 elements giving the numbers of
rows and columns to pad on each side, or "auto" to set the padding
to `(kernel_rows - 1, kernel_columns - 1)` at runtime.
Set to `(0, 0)` to disable padding.
:note: Currently, the Op requires the inputs, filters and outputs to be
C-contiguous. Use :func:`gpu_contiguous
<theano.sandbox.cuda.basic_ops.gpu_contiguous>` on these arguments
if needed.
"""
super(GpuCorrMM, self).__init__(border_mode, subsample, pad)
def make_node(self, img, kern):
img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def c_code(self, node, nodename, inp, out_, sub):
bottom, weights = inp
top, = out_
direction = "forward"
return super(GpuCorrMM, self).c_code(bottom, weights, top, direction, sub)
def grad(self, inp, grads):
bottom, weights = inp
top, = grads
top = gpu_contiguous(top)
d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample, self.pad)(
weights, top)
d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample, self.pad)(
bottom, top)
return d_bottom, d_weights
class GpuCorrMM_gradWeights(BaseGpuCorrMM):
"""Gradient wrt. filters for `GpuCorrMM`.
:note: You will not want to use this directly, but rely on Theano's
automatic differentiation or graph optimization to use it as needed."""
def __init__(self, border_mode="valid",
subsample=(1, 1),
pad=(0, 0)):
super(GpuCorrMM_gradWeights, self).__init__(border_mode, subsample, pad)
def make_node(self, img, topgrad):
img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False]
return Apply(self, [img, topgrad], [CudaNdarrayType(broadcastable)()])
def c_code(self, node, nodename, inp, out_, sub):
bottom, top = inp
weights, = out_
direction = "backprop weights"
return super(GpuCorrMM_gradWeights, self).c_code(bottom, weights, top, direction, sub)
class GpuCorrMM_gradInputs(BaseGpuCorrMM):
"""Gradient wrt. inputs for `GpuCorrMM`.
:note: You will not want to use this directly, but rely on Theano's
automatic differentiation or graph optimization to use it as needed."""
def __init__(self, border_mode="valid",
subsample=(1, 1),
pad=(0, 0)):
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, pad)
def make_node(self, kern, topgrad):
kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad)
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False]
return Apply(self, [kern, topgrad], [CudaNdarrayType(broadcastable)()])
def c_code(self, node, nodename, inp, out_, sub):
weights, top = inp
bottom, = out_
direction = "backprop inputs"
return super(GpuCorrMM_gradInputs, self).c_code(bottom, weights, top, direction, sub)
## ##
# Not really a BLAS operation, but whatever. # Not really a BLAS operation, but whatever.
# #
......
...@@ -161,18 +161,18 @@ void col2im(const float* data_col, const int channels, ...@@ -161,18 +161,18 @@ void col2im(const float* data_col, const int channels,
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter // Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu // Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu // and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
CudaNdarray* corrMM(const CudaNdarray *input, CudaNdarray* corrMM(CudaNdarray *const bottom,
CudaNdarray *weight, CudaNdarray *const weight,
CudaNdarray *output, CudaNdarray *const top,
int mode, const int direction,
int dH = 1, const int dH = 1,
int dW = 1, const int dW = 1,
int padH = 0, const int padH = 0,
int padW = 0) const int padW = 0)
{ {
if (input->nd != 4) if (bottom->nd != 4)
{ {
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires input of 4D"); PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D");
} }
if (weight->nd != 4) if (weight->nd != 4)
...@@ -180,83 +180,75 @@ CudaNdarray* corrMM(const CudaNdarray *input, ...@@ -180,83 +180,75 @@ CudaNdarray* corrMM(const CudaNdarray *input,
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D"); PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
} }
if (output->nd != 4) if (top->nd != 4)
{ {
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires output of 4D"); PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D");
} }
// Extract some shape information for later and check shape consistency // Extract some shape information for later and check shape consistency
// inputs: (batchSize, nInputPlane, inputHeight, inputWidth) // bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
const int batchSize = CudaNdarray_HOST_DIMS(input)[0]; const int batchSize = CudaNdarray_HOST_DIMS(bottom)[0];
const int nInputPlane = CudaNdarray_HOST_DIMS(input)[1]; const int nChannels = CudaNdarray_HOST_DIMS(bottom)[1];
const int inputHeight = CudaNdarray_HOST_DIMS(input)[2]; const int bottomHeight = CudaNdarray_HOST_DIMS(bottom)[2];
const int inputWidth = CudaNdarray_HOST_DIMS(input)[3]; const int bottomWidth = CudaNdarray_HOST_DIMS(bottom)[3];
// filters: (nOutputPlane, nInputPlane, rows, columns) // weights: (nFilters, nChannels, rows, columns)
const int nOutputPlane = CudaNdarray_HOST_DIMS(weight)[0]; const int nFilters = CudaNdarray_HOST_DIMS(weight)[0];
const int kH = CudaNdarray_HOST_DIMS(weight)[2]; const int kH = CudaNdarray_HOST_DIMS(weight)[2];
const int kW = CudaNdarray_HOST_DIMS(weight)[3]; const int kW = CudaNdarray_HOST_DIMS(weight)[3];
if (nInputPlane != CudaNdarray_HOST_DIMS(weight)[1]) { if (nChannels != CudaNdarray_HOST_DIMS(weight)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuCorrMM images and kernel must have the same stack size\n"); "GpuCorrMM images and kernel must have the same stack size\n");
return NULL; return NULL;
} }
// outputs: (batchSize, nOutputPlane, outputHeight, outputWidth) // top: (batchSize, nFilters, topHeight, topWidth)
int outputHeight, outputWidth; const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
if (mode == 1) { // valid correlation with padding and subsampling const int topWidth = (bottomWidth + 2*padW - kW) / dW + 1;
outputHeight = (inputHeight + 2*padH - kH) / dH + 1; if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
outputWidth = (inputWidth + 2*padW - kW) / dW + 1; nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
} topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
else if (mode == 0) { // full convolution with upsampling and cropping topWidth != CudaNdarray_HOST_DIMS(top)[3]) {
// these would be the shapes for a standard full convolution:
//outputHeight = (inputHeight + 2*padH + kH - 2) / dH + 1;
//outputWidth = (inputWidth + 2*padW + kW - 2) / dW + 1;
// but here, dH and dW are *upsampling* factors, and padding is reversed
// (because the implementation was meant as a backward pass for a CNN)
outputHeight = (inputHeight - 1) * dH + kH - 2*padH;
outputWidth = (inputWidth - 1) * dW + kW - 2*padW;
}
if (batchSize != CudaNdarray_HOST_DIMS(output)[0] ||
nOutputPlane != CudaNdarray_HOST_DIMS(output)[1] ||
outputHeight != CudaNdarray_HOST_DIMS(output)[2] ||
outputWidth != CudaNdarray_HOST_DIMS(output)[3]) {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"GpuCorrMM output parameter has wrong shape %d %d %d %d, expected %d %d %d %d\n", "GpuCorrMM shape inconsistency: From bottom and weights, "
CudaNdarray_HOST_DIMS(output)[0], CudaNdarray_HOST_DIMS(output)[1], "top shape should be %d %d %d %d, but is %d %d %d %d.\n",
CudaNdarray_HOST_DIMS(output)[2], CudaNdarray_HOST_DIMS(output)[3], batchSize, nFilters, topHeight, topWidth,
batchSize, nOutputPlane, outputHeight, outputWidth); CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3]);
return NULL; return NULL;
} }
if (mode == 1) { // valid correlation: im2col, then gemm // Create temporary columns
// Create temporary columns (col_data) int col_dim[2];
int col_dim[2]; col_dim[0] = nChannels * kW * kH;
col_dim[0] = nInputPlane * kW * kH; col_dim[1] = topHeight * topWidth;
col_dim[1] = outputHeight * outputWidth; CudaNdarray* col = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
CudaNdarray* col_data = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
// Define some useful variables
const int bottom_stride = CudaNdarray_HOST_STRIDES(bottom)[0];
const int top_stride = CudaNdarray_HOST_STRIDES(top)[0];
const int K_ = col_dim[0];
const int N_ = col_dim[1];
const int M_ = nFilters;
const float one = 1.0f;
const float zero = 0.0f;
// Define some useful variables CudaNdarray *output;
const int ip_stride = CudaNdarray_HOST_STRIDES(input)[0]; if (direction == 0) { // forward pass
const int op_stride = CudaNdarray_HOST_STRIDES(output)[0]; output = top;
const int K_ = col_dim[0]; // valid correlation: im2col, then gemm
const int N_ = col_dim[1];
const int M_ = nOutputPlane;
const float alpha = 1.0f;
const float beta = 0.0f;
// Iterate over batch // Iterate over batch
for (int n = 0; n < batchSize; n++) { for (int n = 0; n < batchSize; n++) {
// First, im2col // First, im2col
im2col(input->devdata + n * ip_stride, nInputPlane, inputHeight, im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
inputWidth, kH, kW, padH, padW, dH, dW, col_data->devdata); bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
// Second, gemm // Second, gemm
cublasStatus_t status = cublasSgemm(handle, cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_N, CUBLAS_OP_N, CUBLAS_OP_N, CUBLAS_OP_N,
N_, M_, K_, N_, M_, K_,
&alpha, &one,
col_data->devdata, N_, col->devdata, N_,
weight->devdata, K_, weight->devdata, K_,
&beta, &zero,
output->devdata + n * op_stride, N_); top->devdata + n * top_stride, N_);
if (status != CUBLAS_STATUS_SUCCESS) { if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n", "GpuCorrMM encountered a CUBLAS error: %s\n",
...@@ -264,17 +256,11 @@ CudaNdarray* corrMM(const CudaNdarray *input, ...@@ -264,17 +256,11 @@ CudaNdarray* corrMM(const CudaNdarray *input,
return NULL; return NULL;
} }
} }
// Free temporary columns
Py_DECREF(col_data);
/* /*
// Original caffe code for comparison // Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is for grouped convolution; we can ignore groups // Note that this is for grouped convolution; we can ignore groups here,
const Dtype* bottom_data = bottom[i]->gpu_data(); // but the group-related offsets help explain what M_, N_ and K_ are
Dtype* top_data = (*top)[i]->mutable_gpu_data();
Dtype* col_data = col_buffer_.mutable_gpu_data();
const Dtype* weight = this->blobs_[0]->gpu_data();
int weight_offset = M_ * K_; int weight_offset = M_ * K_;
int col_offset = K_ * N_; int col_offset = K_ * N_;
int top_offset = M_ * N_; int top_offset = M_ * N_;
...@@ -300,33 +286,81 @@ CudaNdarray* corrMM(const CudaNdarray *input, ...@@ -300,33 +286,81 @@ CudaNdarray* corrMM(const CudaNdarray *input,
} }
*/ */
} }
else if (mode == 0) { // full convolution: gemm, then col2im else if (direction == 1) { // backprop wrt. weights
// Create temporary columns (col_diff) output = weight;
int col_dim[2]; // valid convolution: im2col, then gemm
col_dim[0] = nOutputPlane * kW * kH; // Initialize target with zeros as we will accumulate into it
col_dim[1] = inputHeight * inputWidth; // (all kernels run on the null stream, so we don't need to synchronize)
CudaNdarray* col_diff = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim); cudaError_t err = cudaMemsetAsync(weight->devdata, 0,
sizeof(float) * M_ * K_);
// Define some useful variables if (err != cudaSuccess) {
const int ip_stride = CudaNdarray_HOST_STRIDES(input)[0]; PyErr_Format(PyExc_RuntimeError,
const int op_stride = CudaNdarray_HOST_STRIDES(output)[0]; "GpuCorrMM encountered a CUDA error: %s\n",
const int K_ = col_dim[0]; cudaGetErrorString(err));
const int N_ = col_dim[1]; return NULL;
const int M_ = nInputPlane; }
const float alpha = 1.0f; // Iterate over batch
const float beta = 0.0f; for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
// Second, gemm
cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_T, CUBLAS_OP_N,
K_, M_, N_,
&one,
col->devdata, N_,
top->devdata + n * top_stride, N_,
&one,
weight->devdata, K_);
if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n",
cublasGetErrorString(status));
return NULL;
}
}
/*
// Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is for grouped convolution; we can ignore groups
for (int n = 0; n < num_; ++n) {
// Since we saved memory in the forward pass by not storing all col
// data, we will need to recompute them.
im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, col_data);
// gradient w.r.t. weight. Note that we will accumulate diffs.
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
(Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
col_data + col_offset * g, (Dtype)1.,
weight_diff + weight_offset * g);
== (see https://github.com/BVLC/caffe/blob/master/src/caffe/util/math_functions.cu#L16)
cublasSgemm(CUBLAS_OP_T, CUBLAS_OP_N, K_, M_, N_,
1.0,
col_data + col_offset * g, N_,
top_diff + top[i]->offset(n) + top_offset * g, N_,
1.0,
weight_diff + weight_offset * g, K_);
}
}
*/
}
else if (direction == 2) { // backprop wrt. inputs
output = bottom;
// full convolution: gemm, then col2im
// Iterate over batch // Iterate over batch
for (int n = 0; n < batchSize; n++) { for (int n = 0; n < batchSize; n++) {
// gemm into columns // gemm into columns
cublasStatus_t status = cublasSgemm(handle, cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_N, CUBLAS_OP_T,
N_, K_, M_, N_, K_, M_,
&alpha, &one,
input->devdata + n * ip_stride, N_, top->devdata + n * top_stride, N_,
weight->devdata, K_, weight->devdata, K_,
&beta, &zero,
col_diff->devdata, N_); col->devdata, N_);
if (status != CUBLAS_STATUS_SUCCESS) { if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n", "GpuCorrMM encountered a CUBLAS error: %s\n",
...@@ -334,22 +368,15 @@ CudaNdarray* corrMM(const CudaNdarray *input, ...@@ -334,22 +368,15 @@ CudaNdarray* corrMM(const CudaNdarray *input,
return NULL; return NULL;
} }
// col2im back to the data // col2im back to the data
col2im(col_diff->devdata, nOutputPlane, outputHeight, outputWidth, col2im(col->devdata, nChannels, bottomHeight, bottomWidth,
kH, kW, padH, padW, dH, dW, output->devdata + n * op_stride); kH, kW, padH, padW, dH, dW, bottom->devdata + n * bottom_stride);
} }
// Free temporary columns
Py_DECREF(col_diff);
/* /*
// Original caffe code for comparison // Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is the backward pass of a valid convolution, so
// top_diff is the input, bottom_diff is the output, weights are weights
Dtype* col_data = col_buffer_.mutable_gpu_data();
Dtype* col_diff = col_buffer_.mutable_gpu_diff();
Dtype* bottom_diff = (*bottom)[i]->mutable_gpu_diff();
for (int n = 0; n < num_; ++n) { for (int n = 0; n < num_; ++n) {
// gradient w.r.t. bottom data, if necessary // gradient w.r.t. bottom data, if necessary
if (propagate_down[i]) {
for (int g = 0; g < group_; ++g) { for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_, caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
(Dtype)1., weight + weight_offset * g, (Dtype)1., weight + weight_offset * g,
...@@ -367,9 +394,13 @@ CudaNdarray* corrMM(const CudaNdarray *input, ...@@ -367,9 +394,13 @@ CudaNdarray* corrMM(const CudaNdarray *input,
col2im_gpu(col_diff, channels_, height_, width_, col2im_gpu(col_diff, channels_, height_, width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
bottom_diff + (*bottom)[i]->offset(n)); bottom_diff + (*bottom)[i]->offset(n));
}
} }
*/ */
} }
// Free temporary columns
Py_DECREF(col);
return output; return output;
} }
...@@ -25,7 +25,8 @@ from theano.sandbox.cuda.basic_ops import ( ...@@ -25,7 +25,8 @@ from theano.sandbox.cuda.basic_ops import (
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape) GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, GpuCorrMM) gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace
...@@ -1354,19 +1355,23 @@ def local_conv_gemm(node): ...@@ -1354,19 +1355,23 @@ def local_conv_gemm(node):
border_mode = node.op.border_mode border_mode = node.op.border_mode
subsample = node.op.subsample subsample = node.op.subsample
pad = (0,0) pad = (0,0)
if (border_mode == 'full') and ((subsample != (1,1)) or (pad != (0,0))): if (border_mode == 'full') and (subsample != (1,1)):
# need to simulate this via a padded valid convolution # need to simulate this via a padded valid convolution
pad = 'auto' pad = 'auto'
border_mode = 'valid' border_mode = 'valid'
if (border_mode == 'valid'): if (border_mode == 'valid'):
# need to flip the kernel for valid convolution # need to flip the kernel for valid convolution
kern = gpu_contiguous(kern[:, :, ::-1, ::-1]) kern = kern[:, :, ::-1, ::-1]
# call GpuCorrMM
# TODO: call GpuCorrMM_gradWeights instead if appropriate
return [GpuCorrMM('valid', subsample, pad)(
gpu_contiguous(img), gpu_contiguous(kern))]
elif (border_mode == 'full'): elif (border_mode == 'full'):
# need to bring kernel into correct memory layout for full convolution # need to dimshuffle the kernel for full convolution
kern = gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)).dimshuffle(1, 0, 2, 3) kern = kern.dimshuffle(1, 0, 2, 3)
# need C-contiguous inputs # call GpuCorrMM_gradInputs
img = gpu_contiguous(img) return [GpuCorrMM_gradInputs('valid', subsample, pad)(
return [GpuCorrMM(border_mode, subsample, pad)(img, kern)] gpu_contiguous(kern), gpu_contiguous(img))]
gpu_optimizer.register("conv_gemm", local_conv_gemm) gpu_optimizer.register("conv_gemm", local_conv_gemm)
......
...@@ -186,7 +186,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -186,7 +186,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
f = theano.function([i, k], op, mode=theano_mode) f = theano.function([i, k], op, mode=theano_mode)
if cls is not None: if cls is not None:
assert any([isinstance(node.op, cls) assert any([isinstance(node.op, cls)
for node in f.maker.fgraph.toposort()]), f.maker.fgraph.toposort() for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
gpuval = f(img, kern) gpuval = f(img, kern)
t2 = time.time() t2 = time.time()
for i in range(nb_iter): for i in range(nb_iter):
...@@ -284,7 +284,7 @@ def exec_conv(version, shapes, verbose, random, mode, ...@@ -284,7 +284,7 @@ def exec_conv(version, shapes, verbose, random, mode,
cls=cls) cls=cls)
except Exception, e: except Exception, e:
print ver, id, (ishape, kshape, subshape, istride, kstride) print ver, id, (ishape, kshape, subshape, istride, kstride)
print e print "Exception", type(e), e
pass pass
if not ret: if not ret:
failed_version.add(ver) failed_version.add(ver)
...@@ -634,7 +634,7 @@ def test_valid(conv_gemm=False): ...@@ -634,7 +634,7 @@ def test_valid(conv_gemm=False):
if conv_gemm: if conv_gemm:
# Test the GpuCorrMM version # Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm") mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough # dummy version; not used by GpuCorrMM so one version is enough
version = [-1] version = [-1]
# Add tests with strided inputs by still square images and filters. # Add tests with strided inputs by still square images and filters.
...@@ -713,7 +713,7 @@ def test_full(conv_gemm=False): ...@@ -713,7 +713,7 @@ def test_full(conv_gemm=False):
if conv_gemm: if conv_gemm:
# Test the GpuCorrMM version # Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm") mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough # dummy version; not used by GpuCorrMM so one version is enough
version = [-1] version = [-1]
else: else:
...@@ -753,7 +753,7 @@ def test_subsample(conv_gemm=False): ...@@ -753,7 +753,7 @@ def test_subsample(conv_gemm=False):
if conv_gemm: if conv_gemm:
# Test the GpuCorrMM version # Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm") mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough # dummy version; not used by GpuCorrMM so one version is enough
version_valid = version_full = [-1] version_valid = version_full = [-1]
else: else:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论