提交 1765cf40 作者: Ubuntu 提交者: Mohammed Affan

add support for grouped convolution in GpuCorrMM

上级 816cdaf6
...@@ -489,11 +489,11 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -489,11 +489,11 @@ class BaseGpuCorrMM(CGpuKernelBase):
Perform subsampling of the input, also known as dilation (default: (1, 1)). Perform subsampling of the input, also known as dilation (default: (1, 1)).
""" """
check_broadcast = False check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation') __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
_f16_ok = True _f16_ok = True
def __init__(self, border_mode="valid", subsample=(1, 1), def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1)): filter_dilation=(1, 1), num_groups=1):
if isinstance(border_mode, integer_types): if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode) border_mode = (border_mode, border_mode)
if isinstance(border_mode, tuple): if isinstance(border_mode, tuple):
...@@ -512,6 +512,9 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -512,6 +512,9 @@ class BaseGpuCorrMM(CGpuKernelBase):
raise ValueError("filter_dilation must have two elements") raise ValueError("filter_dilation must have two elements")
self.subsample = tuple(subsample) self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation) self.filter_dilation = tuple(filter_dilation)
if num_groups < 1:
raise ValueError("Number of groups should be greater than 0")
self.num_groups = num_groups
CGpuKernelBase.__init__(self, ['corr_gemm.c']) CGpuKernelBase.__init__(self, ['corr_gemm.c'])
@property @property
...@@ -521,11 +524,12 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -521,11 +524,12 @@ class BaseGpuCorrMM(CGpuKernelBase):
return (0, 0) return (0, 0)
def __str__(self): def __str__(self):
return '%s{%s, %s, %s}' % ( return '%s{%s, %s, %s, %s}' % (
self.__class__.__name__, self.__class__.__name__,
self.border_mode, self.border_mode,
str(self.subsample), str(self.subsample),
str(self.filter_dilation)) str(self.filter_dilation),
str(self.num_groups))
def flops(self, inp, outp): def flops(self, inp, outp):
""" """
...@@ -600,6 +604,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -600,6 +604,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
""" """
dH, dW = self.subsample dH, dW = self.subsample
dilH, dilW = self.filter_dilation dilH, dilW = self.filter_dilation
numgroups = self.num_groups
if self.border_mode == "half": if self.border_mode == "half":
padH = padW = -1 padH = padW = -1
elif self.border_mode == "full": elif self.border_mode == "full":
...@@ -660,6 +665,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -660,6 +665,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
size_t dilW = %(dilW)s; size_t dilW = %(dilW)s;
int padH = %(padH)s; int padH = %(padH)s;
int padW = %(padW)s; int padW = %(padW)s;
int numgroups = %(numgroups)s;
PyGpuArrayObject * bottom = %(bottom)s; PyGpuArrayObject * bottom = %(bottom)s;
PyGpuArrayObject * weights = %(weights)s; PyGpuArrayObject * weights = %(weights)s;
...@@ -759,7 +765,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -759,7 +765,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
// output is weights: (num_filters, num_channels, height, width) // output is weights: (num_filters, num_channels, height, width)
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1 // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = PyGpuArray_DIMS(top)[1]; out_dim[0] = PyGpuArray_DIMS(top)[1];
out_dim[1] = PyGpuArray_DIMS(bottom)[1]; out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups;
out_dim[2] = kH; // already inferred further above out_dim[2] = kH; // already inferred further above
out_dim[3] = kW; // how convenient out_dim[3] = kW; // how convenient
out_typecode = top->ga.typecode; out_typecode = top->ga.typecode;
...@@ -783,7 +789,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -783,7 +789,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
// output is bottom: (batchsize, num_channels, height, width) // output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = PyGpuArray_DIMS(top)[0]; out_dim[0] = PyGpuArray_DIMS(top)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[1]; out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups;
out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH; out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW; out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_typecode = top->ga.typecode; out_typecode = top->ga.typecode;
...@@ -827,7 +833,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -827,7 +833,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
} }
// Call GPU code // Call GPU code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW); out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups);
if (out2==NULL){ if (out2==NULL){
%(fail)s %(fail)s
} }
...@@ -883,9 +889,9 @@ class GpuCorrMM(BaseGpuCorrMM): ...@@ -883,9 +889,9 @@ class GpuCorrMM(BaseGpuCorrMM):
""" """
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1), subsample=(1, 1),
filter_dilation=(1, 1)): filter_dilation=(1, 1), num_groups=1):
super(GpuCorrMM, self).__init__(border_mode, subsample, super(GpuCorrMM, self).__init__(border_mode, subsample,
filter_dilation) filter_dilation, num_groups)
def make_node(self, img, kern): def make_node(self, img, kern):
ctx_name = infer_context_name(img, kern) ctx_name = infer_context_name(img, kern)
...@@ -914,11 +920,13 @@ class GpuCorrMM(BaseGpuCorrMM): ...@@ -914,11 +920,13 @@ class GpuCorrMM(BaseGpuCorrMM):
top = gpu_contiguous(top) top = gpu_contiguous(top)
d_bottom = GpuCorrMM_gradInputs(self.border_mode, d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)( self.filter_dilation,
self.num_groups)(
weights, top, bottom.shape[-2:]) weights, top, bottom.shape[-2:])
d_weights = GpuCorrMM_gradWeights(self.border_mode, d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)( self.filter_dilation,
self.num_groups)(
bottom, top, weights.shape[-2:]) bottom, top, weights.shape[-2:])
return d_bottom, d_weights return d_bottom, d_weights
...@@ -936,10 +944,11 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -936,10 +944,11 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1), subsample=(1, 1),
filter_dilation=(1, 1)): filter_dilation=(1, 1),
num_groups=1):
super(GpuCorrMM_gradWeights, self).__init__(border_mode, super(GpuCorrMM_gradWeights, self).__init__(border_mode,
subsample, subsample,
filter_dilation) filter_dilation, num_groups)
def make_node(self, img, topgrad, shape=None): def make_node(self, img, topgrad, shape=None):
ctx_name = infer_context_name(img, topgrad) ctx_name = infer_context_name(img, topgrad)
...@@ -978,11 +987,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -978,11 +987,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
weights = gpu_contiguous(weights) weights = gpu_contiguous(weights)
d_bottom = GpuCorrMM_gradInputs(self.border_mode, d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)(weights, self.filter_dilation,
self.num_groups)(weights,
top, top,
bottom.shape[-2:]) bottom.shape[-2:])
d_top = GpuCorrMM( d_top = GpuCorrMM(
self.border_mode, self.subsample, self.filter_dilation)(bottom, weights) self.border_mode, self.subsample, self.filter_dilation, self.num_groups)(bottom, weights)
d_height_width = ( d_height_width = (
theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else () ) * 2 if len(inp) == 4 else ()
...@@ -1008,9 +1018,10 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -1008,9 +1018,10 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1), subsample=(1, 1),
filter_dilation=(1, 1)): filter_dilation=(1, 1),
num_groups=1):
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation) filter_dilation, num_groups)
def make_node(self, kern, topgrad, shape=None): def make_node(self, kern, topgrad, shape=None):
ctx_name = infer_context_name(kern, topgrad) ctx_name = infer_context_name(kern, topgrad)
...@@ -1029,6 +1040,10 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -1029,6 +1040,10 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
assert shape[0].ndim == 0 assert shape[0].ndim == 0
assert shape[1].ndim == 0 assert shape[1].ndim == 0
if self.num_groups > 1:
broadcastable = [topgrad.type.broadcastable[0], False,
False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False] False, False]
return Apply(self, [kern, topgrad] + height_width, [GpuArrayType(dtype=topgrad.dtype, return Apply(self, [kern, topgrad] + height_width, [GpuArrayType(dtype=topgrad.dtype,
...@@ -1048,12 +1063,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -1048,12 +1063,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
bottom = gpu_contiguous(bottom) bottom = gpu_contiguous(bottom)
d_weights = GpuCorrMM_gradWeights(self.border_mode, d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)(bottom, self.filter_dilation,
self.num_groups)(bottom,
top, top,
weights.shape[-2:]) weights.shape[-2:])
d_top = GpuCorrMM(self.border_mode, d_top = GpuCorrMM(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)(bottom, weights) self.filter_dilation,
self.num_groups)(bottom, weights)
d_height_width = ( d_height_width = (
theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else () ) * 2 if len(inp) == 4 else ()
......
...@@ -348,7 +348,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -348,7 +348,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dilH = 1, const size_t dilH = 1,
const size_t dilW = 1, const size_t dilW = 1,
const size_t padH = 0, const size_t padH = 0,
const size_t padW = 0) const size_t padW = 0,
const size_t numgroups = 1)
{ {
if (PyGpuArray_NDIM(bottom) != 4) if (PyGpuArray_NDIM(bottom) != 4)
{ {
...@@ -411,8 +412,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -411,8 +412,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t nFilters = PyGpuArray_DIMS(weight)[0]; const size_t nFilters = PyGpuArray_DIMS(weight)[0];
const size_t kH = PyGpuArray_DIMS(weight)[2]; const size_t kH = PyGpuArray_DIMS(weight)[2];
const size_t kW = PyGpuArray_DIMS(weight)[3]; const size_t kW = PyGpuArray_DIMS(weight)[3];
if (nChannels != PyGpuArray_DIMS(weight)[1]) { if (nChannels != (PyGpuArray_DIMS(weight)[1] * numgroups)) {
PyErr_SetString(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"GpuCorrMM images and kernel must have the same stack size\n"); "GpuCorrMM images and kernel must have the same stack size\n");
return NULL; return NULL;
} }
...@@ -469,11 +470,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -469,11 +470,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
} }
// Define some useful variables // Define some useful variables
const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode); const size_t batch_bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode); const size_t batch_top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
const size_t K_ = col_dim[0]; const size_t group_bottom_stride = (PyGpuArray_STRIDES(bottom)[1] * nChannels / numgroups) / gpuarray_get_elsize(bottom->ga.typecode);
const size_t group_top_stride = (PyGpuArray_STRIDES(top)[1] * nFilters / numgroups) / gpuarray_get_elsize(top->ga.typecode);
const size_t group_weight_stride = (PyGpuArray_STRIDES(weight)[0] * nFilters / numgroups) / gpuarray_get_elsize(weight->ga.typecode);
const size_t K_ = col_dim[0] / numgroups;
const size_t N_ = col_dim[1]; const size_t N_ = col_dim[1];
const size_t M_ = nFilters; const size_t group_col_stride = (K_ * N_);
const size_t M_ = nFilters / numgroups;
PyGpuArrayObject *output; PyGpuArrayObject *output;
if (direction == 0) { // forward pass if (direction == 0) { // forward pass
...@@ -493,7 +498,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -493,7 +498,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im2col // First, im2col
err = im2col(&bottom->ga, n * bottom_stride, err = im2col(&bottom->ga, n * batch_bottom_stride,
nChannels, bottomHeight, nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW, bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, &col->ga); padH, padW, dH, dW, &col->ga);
...@@ -502,12 +507,14 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -502,12 +507,14 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL; return NULL;
} }
// Second, gemm // Second, gemm
for (size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans, err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1, N_, M_, K_, 1,
&col->ga, 0, N_, &col->ga, g * group_col_stride, N_,
&weight->ga, 0, K_, &weight->ga, g * group_weight_stride, K_,
0, 0,
&top->ga, n * top_stride, N_); &top->ga, n * batch_top_stride + g * group_top_stride, N_);
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM forward encountered an error running gemm: %d", err); "GpuCorrMM forward encountered an error running gemm: %d", err);
...@@ -533,7 +540,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -533,7 +540,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im2col // First, im2col
err = im2col(&bottom->ga, n * bottom_stride, err = im2col(&bottom->ga, n * batch_bottom_stride,
nChannels, bottomHeight, nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW, bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, &col->ga); padH, padW, dH, dW, &col->ga);
...@@ -545,12 +552,14 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -545,12 +552,14 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0 // Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This // for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.) // is faster than setting weight to all zeros before the loop.)
for(size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_trans, cb_no_trans, err = rgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1, K_, M_, N_, 1,
&col->ga, 0, N_, &col->ga, g * group_col_stride, N_,
&top->ga, n * top_stride, N_, &top->ga, n * batch_top_stride + g * group_top_stride, N_,
(n == 0) ? 0 : 1, (n == 0) ? 0 : 1,
&weight->ga, 0, K_); &weight->ga, g * group_weight_stride, K_);
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad weights encountered an error running gemm: %d", err); "GpuCorrMM grad weights encountered an error running gemm: %d", err);
...@@ -576,12 +585,14 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -576,12 +585,14 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// gemm into columns // gemm into columns
for(size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_no_trans, cb_trans, err = rgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1, N_, K_, M_, 1,
&top->ga, n * top_stride, N_, &top->ga, n * batch_top_stride + g * group_top_stride, N_,
&weight->ga, 0, K_, &weight->ga, g * group_weight_stride, K_,
0, 0,
&col->ga, 0, N_); &col->ga, g * group_col_stride, N_);
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad inputs encountered an error running gemm: %d", err); "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
...@@ -591,7 +602,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -591,7 +602,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// col2im back to the data // col2im back to the data
err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth, err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
kH, kW, dilH, dilW, padH, padW, kH, kW, dilH, dilW, padH, padW,
dH, dW, &bottom->ga, n * bottom_stride); dH, dW, &bottom->ga, n * batch_bottom_stride);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
......
...@@ -1509,7 +1509,8 @@ def local_abstractconv_gemm(node): ...@@ -1509,7 +1509,8 @@ def local_abstractconv_gemm(node):
border_mode = node.op.border_mode border_mode = node.op.border_mode
subsample = node.op.subsample subsample = node.op.subsample
filter_dilation = node.op.filter_dilation filter_dilation = node.op.filter_dilation
if ((border_mode == 'full') and (subsample == (1, 1))):
if ((border_mode == 'full') and (subsample == (1, 1)) and node.op.num_groups == 1):
if not node.op.filter_flip: if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1] kern = kern[:, :, ::-1, ::-1]
# need to dimshuffle the kernel for full convolution # need to dimshuffle the kernel for full convolution
...@@ -1526,7 +1527,8 @@ def local_abstractconv_gemm(node): ...@@ -1526,7 +1527,8 @@ def local_abstractconv_gemm(node):
# By default use GpuCorrMM # By default use GpuCorrMM
rval = GpuCorrMM(border_mode, rval = GpuCorrMM(border_mode,
subsample, subsample,
filter_dilation)(gpu_contiguous(img), filter_dilation,
node.op.num_groups)(gpu_contiguous(img),
gpu_contiguous(kern)) gpu_contiguous(kern))
# call GpuCorrMM_gradWeights if good # call GpuCorrMM_gradWeights if good
...@@ -1645,7 +1647,8 @@ def local_abstractconv_gradweights_gemm(node): ...@@ -1645,7 +1647,8 @@ def local_abstractconv_gradweights_gemm(node):
rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode, rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)( filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape) gpu_contiguous(img), gpu_contiguous(topgrad), shape)
if node.op.filter_flip: if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1] rval = rval[:, :, ::-1, ::-1]
...@@ -1689,7 +1692,8 @@ def local_abstractconv_gradinputs_gemm(node): ...@@ -1689,7 +1692,8 @@ def local_abstractconv_gradinputs_gemm(node):
rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode, rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)( filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape) gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval] return [rval]
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论