提交 c2e14ce1 authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #5991 from affanv14/group

Implement Grouped Convolutions
差异被折叠。
...@@ -348,7 +348,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -348,7 +348,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dilH = 1, const size_t dilH = 1,
const size_t dilW = 1, const size_t dilW = 1,
const size_t padH = 0, const size_t padH = 0,
const size_t padW = 0) const size_t padW = 0,
const size_t numgroups = 1)
{ {
if (PyGpuArray_NDIM(bottom) != 4) if (PyGpuArray_NDIM(bottom) != 4)
{ {
...@@ -411,7 +412,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -411,7 +412,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t nFilters = PyGpuArray_DIMS(weight)[0]; const size_t nFilters = PyGpuArray_DIMS(weight)[0];
const size_t kH = PyGpuArray_DIMS(weight)[2]; const size_t kH = PyGpuArray_DIMS(weight)[2];
const size_t kW = PyGpuArray_DIMS(weight)[3]; const size_t kW = PyGpuArray_DIMS(weight)[3];
if (nChannels != PyGpuArray_DIMS(weight)[1]) { if (nChannels != (PyGpuArray_DIMS(weight)[1] * numgroups)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuCorrMM images and kernel must have the same stack size\n"); "GpuCorrMM images and kernel must have the same stack size\n");
return NULL; return NULL;
...@@ -469,11 +470,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -469,11 +470,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
} }
// Define some useful variables // Define some useful variables
const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode); const size_t batch_bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode); const size_t batch_top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
const size_t K_ = col_dim[0]; const size_t group_bottom_stride = (PyGpuArray_STRIDES(bottom)[1] * nChannels / numgroups) / gpuarray_get_elsize(bottom->ga.typecode);
const size_t group_top_stride = (PyGpuArray_STRIDES(top)[1] * nFilters / numgroups) / gpuarray_get_elsize(top->ga.typecode);
const size_t group_weight_stride = (PyGpuArray_STRIDES(weight)[0] * nFilters / numgroups) / gpuarray_get_elsize(weight->ga.typecode);
const size_t K_ = col_dim[0] / numgroups;
const size_t N_ = col_dim[1]; const size_t N_ = col_dim[1];
const size_t M_ = nFilters; const size_t group_col_stride = (K_ * N_);
const size_t M_ = nFilters / numgroups;
PyGpuArrayObject *output; PyGpuArrayObject *output;
if (direction == 0) { // forward pass if (direction == 0) { // forward pass
...@@ -493,21 +498,23 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -493,21 +498,23 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im2col // First, im2col
err = im2col(&bottom->ga, n * bottom_stride, err = im2col(&bottom->ga, n * batch_bottom_stride,
nChannels, bottomHeight, nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW, bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, &col->ga); padH, padW, dH, dW, &col->ga);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
} }
// Second, gemm // Second, gemm
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans, for (size_t g = 0; g < numgroups; g++){
N_, M_, K_, 1, err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
&col->ga, 0, N_, N_, M_, K_, 1,
&weight->ga, 0, K_, &col->ga, g * group_col_stride, N_,
0, &weight->ga, g * group_weight_stride, K_,
&top->ga, n * top_stride, N_); 0,
&top->ga, n * batch_top_stride + g * group_top_stride, N_);
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM forward encountered an error running gemm: %d", err); "GpuCorrMM forward encountered an error running gemm: %d", err);
...@@ -533,7 +540,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -533,7 +540,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im2col // First, im2col
err = im2col(&bottom->ga, n * bottom_stride, err = im2col(&bottom->ga, n * batch_bottom_stride,
nChannels, bottomHeight, nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW, bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, &col->ga); padH, padW, dH, dW, &col->ga);
...@@ -545,12 +552,14 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -545,12 +552,14 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0 // Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This // for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.) // is faster than setting weight to all zeros before the loop.)
err = rgemm(cb_fortran, cb_trans, cb_no_trans, for(size_t g = 0; g < numgroups; g++){
K_, M_, N_, 1, err = rgemm(cb_fortran, cb_trans, cb_no_trans,
&col->ga, 0, N_, K_, M_, N_, 1,
&top->ga, n * top_stride, N_, &col->ga, g * group_col_stride, N_,
(n == 0) ? 0 : 1, &top->ga, n * batch_top_stride + g * group_top_stride, N_,
&weight->ga, 0, K_); (n == 0) ? 0 : 1,
&weight->ga, g * group_weight_stride, K_);
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad weights encountered an error running gemm: %d", err); "GpuCorrMM grad weights encountered an error running gemm: %d", err);
...@@ -575,13 +584,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -575,13 +584,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// full convolution: gemm, then col2im // full convolution: gemm, then col2im
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// gemm into columns // gemm into columns
err = rgemm(cb_fortran, cb_no_trans, cb_trans, for(size_t g = 0; g < numgroups; g++){
N_, K_, M_, 1, err = rgemm(cb_fortran, cb_no_trans, cb_trans,
&top->ga, n * top_stride, N_, N_, K_, M_, 1,
&weight->ga, 0, K_, &top->ga, n * batch_top_stride + g * group_top_stride, N_,
0, &weight->ga, g * group_weight_stride, K_,
&col->ga, 0, N_); 0,
&col->ga, g * group_col_stride, N_);
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad inputs encountered an error running gemm: %d", err); "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
...@@ -591,7 +602,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -591,7 +602,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// col2im back to the data // col2im back to the data
err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth, err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
kH, kW, dilH, dilW, padH, padW, kH, kW, dilH, dilW, padH, padW,
dH, dW, &bottom->ga, n * bottom_stride); dH, dW, &bottom->ga, n * batch_bottom_stride);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
......
差异被折叠。
#section support_code #section support_code
static int static int
c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { c_set_tensor_for_conv(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc, size_t groups) {
cudnnDataType_t dt; cudnnDataType_t dt;
size_t ds; size_t ds;
switch (var->ga.typecode) { switch (var->ga.typecode) {
...@@ -42,7 +42,8 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { ...@@ -42,7 +42,8 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
strs[i] = 1; strs[i] = 1;
dims[i] = 1; dims[i] = 1;
} }
//only for grouped convolution i.e when groups > 1
dims[1] = dims[1] / groups;
cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd, cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd,
dims, strs); dims, strs);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -54,6 +55,11 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { ...@@ -54,6 +55,11 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
return 0; return 0;
} }
static int
c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
return c_set_tensor_for_conv(var, desc, 1);
}
static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) { static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) {
cudnnStatus_t err; cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(desc); err = cudnnCreateTensorDescriptor(desc);
...@@ -71,7 +77,7 @@ static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) ...@@ -71,7 +77,7 @@ static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc)
} }
static int static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc, size_t groups) {
cudnnDataType_t dt; cudnnDataType_t dt;
cudnnStatus_t err; cudnnStatus_t err;
...@@ -111,6 +117,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -111,6 +117,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
/* Filters can't be less than 3d so we pad */ /* Filters can't be less than 3d so we pad */
for (unsigned int i = nd; i < 3; i++) for (unsigned int i = nd; i < 3; i++)
dims[i] = 1; dims[i] = 1;
dims[0] = dims[0] / groups;
if (nd < 3) if (nd < 3)
nd = 3; nd = 3;
...@@ -135,7 +142,7 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) { ...@@ -135,7 +142,7 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return -1; return -1;
} }
if (c_set_filter(var, *desc) != 0) { if (c_set_filter(var, *desc, 1) != 0) {
cudnnDestroyFilterDescriptor(*desc); cudnnDestroyFilterDescriptor(*desc);
return -1; return -1;
} }
......
...@@ -29,7 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -29,7 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"images and kernel must have the same stack size"); "images and kernel must have the same stack size");
return 1; return 1;
...@@ -72,12 +72,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -72,12 +72,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 0; return 0;
} }
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
return 1; return 1;
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
...@@ -281,15 +284,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -281,15 +284,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++) {
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
params->handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input) + input_offset * g,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns) + kern_offset * g,
desc, algo, desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output)); APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output) + output_offset * g);
}
if (worksize != 0) if (worksize != 0)
gpudata_release(workspace); gpudata_release(workspace);
......
...@@ -28,7 +28,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -28,7 +28,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same " PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
"stack size"); "stack size");
return 1; return 1;
...@@ -71,12 +71,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -71,12 +71,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 0; return 0;
} }
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
return 1; return 1;
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor_for_conv(*input, APPLY_SPECIFIC(input), params->num_groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(*input, 0) / params->num_groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo; cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
...@@ -93,7 +96,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -93,7 +96,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
if (PyGpuArray_NDIM(im) == 4) { if (PyGpuArray_NDIM(im) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) || if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) || (PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) || (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) { (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld" PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
...@@ -286,14 +289,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -286,14 +289,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData( for ( int g = 0; g < params->num_groups; g++)
params->handle, {
alpha_p, err = cudnnConvolutionBackwardData(
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), params->handle,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), alpha_p,
desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns) + kern_offset * g,
beta_p, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output) + output_offset * g,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input)); desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input) + input_offset * g);
}
if (worksize != 0) if (worksize != 0)
gpudata_release(workspace); gpudata_release(workspace);
......
...@@ -28,7 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -28,7 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size"); "GpuDnnConv images and kernel must have the same stack size");
return 1; return 1;
...@@ -71,13 +71,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -71,13 +71,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 0; return 0;
} }
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
return 1; return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1)
return 1; return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / params->num_groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo; cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -93,7 +97,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -93,7 +97,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
if (PyGpuArray_NDIM(input) == 4) { if (PyGpuArray_NDIM(input) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) || if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) || (PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) || (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) { (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld" PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld"
...@@ -273,14 +277,18 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -273,14 +277,18 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter( for ( int g = 0; g < params->num_groups; g++)
params->handle, {
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), err = cudnnConvolutionBackwardFilter(
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), params->handle,
desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, alpha_p,
beta_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input) + input_offset * g ,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output) + output_offset * g,
desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns) + kern_offset * g);
}
if (worksize != 0) if (worksize != 0)
gpudata_release(workspace); gpudata_release(workspace);
......
...@@ -1533,7 +1533,8 @@ def local_abstractconv_gemm(node): ...@@ -1533,7 +1533,8 @@ def local_abstractconv_gemm(node):
border_mode = node.op.border_mode border_mode = node.op.border_mode
subsample = node.op.subsample subsample = node.op.subsample
filter_dilation = node.op.filter_dilation filter_dilation = node.op.filter_dilation
if ((border_mode == 'full') and (subsample == (1, 1))):
if ((border_mode == 'full') and (subsample == (1, 1)) and node.op.num_groups == 1):
if not node.op.filter_flip: if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1] kern = kern[:, :, ::-1, ::-1]
# need to dimshuffle the kernel for full convolution # need to dimshuffle the kernel for full convolution
...@@ -1550,8 +1551,9 @@ def local_abstractconv_gemm(node): ...@@ -1550,8 +1551,9 @@ def local_abstractconv_gemm(node):
# By default use GpuCorrMM # By default use GpuCorrMM
rval = GpuCorrMM(border_mode, rval = GpuCorrMM(border_mode,
subsample, subsample,
filter_dilation)(gpu_contiguous(img), filter_dilation,
gpu_contiguous(kern)) node.op.num_groups)(gpu_contiguous(img),
gpu_contiguous(kern))
# call GpuCorrMM_gradWeights if good # call GpuCorrMM_gradWeights if good
# (the latter is faster if batchsize * kernelHeight * kernelWidth # (the latter is faster if batchsize * kernelHeight * kernelWidth
...@@ -1669,7 +1671,8 @@ def local_abstractconv_gradweights_gemm(node): ...@@ -1669,7 +1671,8 @@ def local_abstractconv_gradweights_gemm(node):
rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode, rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)( filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape) gpu_contiguous(img), gpu_contiguous(topgrad), shape)
if node.op.filter_flip: if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1] rval = rval[:, :, ::-1, ::-1]
...@@ -1713,7 +1716,8 @@ def local_abstractconv_gradinputs_gemm(node): ...@@ -1713,7 +1716,8 @@ def local_abstractconv_gradinputs_gemm(node):
rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode, rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)( filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape) gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval] return [rval]
......
...@@ -25,6 +25,7 @@ from . import test_nnet ...@@ -25,6 +25,7 @@ from . import test_nnet
from .rnn_support import Model, GRU, LSTM, WrapperLayer from .rnn_support import Model, GRU, LSTM, WrapperLayer
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
try: try:
import pygpu import pygpu
...@@ -2263,3 +2264,37 @@ def test_dnn_rnn_lstm_grad_c(): ...@@ -2263,3 +2264,37 @@ def test_dnn_rnn_lstm_grad_c():
(i + 1) * len(cudnn_grads_layer)] (i + 1) * len(cudnn_grads_layer)]
for j, g in enumerate(cudnn_grads_layer): for j, g in enumerate(cudnn_grads_layer):
utt.assert_allclose(ref_grads_layer[j], g) utt.assert_allclose(ref_grads_layer[j], g)
def dconv2d(border_mode, subsample, filter_dilation, num_groups):
def dconv(img, kern):
return dnn.dnn_conv(img, kern, border_mode=border_mode, subsample=subsample, dilation=filter_dilation,
conv_mode='conv', direction_hint='forward', workmem=None,
algo=None, precision=None, num_groups=num_groups)
return dconv
def dconv2dw(border_mode, subsample, filter_dilation, num_groups):
def dconvw(img, topgrad, kshp):
return dnn.dnn_gradweight(img, topgrad, kshp, border_mode=border_mode, subsample=subsample, dilation=filter_dilation,
conv_mode='conv', precision=None, algo=None, num_groups=num_groups)
return dconvw
def dconv2di(border_mode, subsample, filter_dilation, num_groups):
def dconvi(kern, topgrad, imshp):
return dnn.dnn_gradinput(kern, topgrad, imshp, border_mode=border_mode, subsample=subsample, dilation=filter_dilation,
conv_mode='conv', precision=None, algo=None, num_groups=num_groups)
return dconvi
class Cudnn_grouped_conv(Grouped_conv_noOptim):
mode = mode_with_gpu
conv2d = staticmethod(dconv2d)
conv2d_gradw = staticmethod(dconv2dw)
conv2d_gradi = staticmethod(dconv2di)
conv2d_op = dnn.GpuDnnConv
conv2d_gradw_op = dnn.GpuDnnConvGradW
conv2d_gradi_op = dnn.GpuDnnConvGradI
flip_filter = False
is_dnn = True
...@@ -11,6 +11,7 @@ from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInput ...@@ -11,6 +11,7 @@ from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInput
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu, ref_cast from .config import mode_with_gpu, mode_without_gpu, ref_cast
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
class TestCorrMM(unittest.TestCase): class TestCorrMM(unittest.TestCase):
...@@ -219,3 +220,15 @@ class TestCorrMM(unittest.TestCase): ...@@ -219,3 +220,15 @@ class TestCorrMM(unittest.TestCase):
verify_grad=False) verify_grad=False)
self.run_gradinput(inputs_shape=(1, 1024, 3, 1), self.run_gradinput(inputs_shape=(1, 1024, 3, 1),
filters_shape=(1, 1, 1, 1024)) filters_shape=(1, 1, 1, 1024))
class TestGroupGpuCorr2d(Grouped_conv_noOptim):
mode = theano.compile.get_mode("FAST_RUN")
conv2d = GpuCorrMM
conv2d_gradw = GpuCorrMM_gradWeights
conv2d_gradi = GpuCorrMM_gradInputs
conv2d_op = GpuCorrMM
conv2d_gradw_op = GpuCorrMM_gradWeights
conv2d_gradi_op = GpuCorrMM_gradInputs
flip_filter = True
is_dnn = False
...@@ -39,7 +39,7 @@ from .abstract_conv import conv3d ...@@ -39,7 +39,7 @@ from .abstract_conv import conv3d
def conv2d(input, filters, input_shape=None, filter_shape=None, def conv2d(input, filters, input_shape=None, filter_shape=None,
border_mode='valid', subsample=(1, 1), filter_flip=True, border_mode='valid', subsample=(1, 1), filter_flip=True,
image_shape=None, filter_dilation=(1, 1), **kwargs): image_shape=None, filter_dilation=(1, 1), num_groups=1, **kwargs):
""" """
This function will build the symbolic graph for convolving a mini-batch of a This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled stack of 2D inputs with a set of 2D filters. The implementation is modelled
...@@ -103,6 +103,10 @@ def conv2d(input, filters, input_shape=None, filter_shape=None, ...@@ -103,6 +103,10 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
Factor by which to subsample (stride) the input. Factor by which to subsample (stride) the input.
Also called dilation elsewhere. Also called dilation elsewhere.
num_groups : int
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
kwargs: Any other keyword arguments are accepted for backwards kwargs: Any other keyword arguments are accepted for backwards
compatibility, but will be ignored. compatibility, but will be ignored.
...@@ -152,12 +156,12 @@ def conv2d(input, filters, input_shape=None, filter_shape=None, ...@@ -152,12 +156,12 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
return abstract_conv2d(input, filters, input_shape, filter_shape, return abstract_conv2d(input, filters, input_shape, filter_shape,
border_mode, subsample, filter_flip, border_mode, subsample, filter_flip,
filter_dilation) filter_dilation, num_groups)
def conv2d_transpose(input, filters, output_shape, filter_shape=None, def conv2d_transpose(input, filters, output_shape, filter_shape=None,
border_mode='valid', input_dilation=(1, 1), border_mode='valid', input_dilation=(1, 1),
filter_flip=True, filter_dilation=(1, 1)): filter_flip=True, filter_dilation=(1, 1), num_groups=1):
""" """
This function will build the symbolic graph for applying a transposed This function will build the symbolic graph for applying a transposed
convolution over a mini-batch of a stack of 2D inputs with a set of 2D convolution over a mini-batch of a stack of 2D inputs with a set of 2D
...@@ -209,6 +213,10 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None, ...@@ -209,6 +213,10 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
Factor by which to subsample (stride) the input. Factor by which to subsample (stride) the input.
Also called dilation elsewhere. Also called dilation elsewhere.
num_groups : int
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
Returns Returns
------- -------
Symbolic 4D tensor Symbolic 4D tensor
...@@ -235,4 +243,5 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None, ...@@ -235,4 +243,5 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
border_mode=border_mode, border_mode=border_mode,
subsample=input_dilation, subsample=input_dilation,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation) filter_dilation=filter_dilation,
num_groups=num_groups)
...@@ -40,9 +40,11 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -40,9 +40,11 @@ class BaseCorrMM(gof.OpenMPOp):
Perform subsampling of the output (default: (1, 1)). Perform subsampling of the output (default: (1, 1)).
filter_dilation filter_dilation
Perform dilated correlation (default: (1,1)) Perform dilated correlation (default: (1,1))
num_groups
Perform grouped convolutions (default: 1)
""" """
check_broadcast = False check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation') __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
_direction = None _direction = None
...@@ -51,10 +53,11 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -51,10 +53,11 @@ class BaseCorrMM(gof.OpenMPOp):
('DIRECTION_BACKPROP_INPUTS', 'backprop inputs')), # 2 ('DIRECTION_BACKPROP_INPUTS', 'backprop inputs')), # 2
dH=int64, dW=int64, dH=int64, dW=int64,
dilH=int64, dilW=int64, dilH=int64, dilW=int64,
padH=int64, padW=int64) padH=int64, padW=int64,
num_groups=int64)
def __init__(self, border_mode="valid", subsample=(1, 1), def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), openmp=None): filter_dilation=(1, 1), num_groups=1, openmp=None):
super(BaseCorrMM, self).__init__(openmp=openmp) super(BaseCorrMM, self).__init__(openmp=openmp)
if isinstance(border_mode, integer_types): if isinstance(border_mode, integer_types):
if border_mode < 0: if border_mode < 0:
...@@ -97,6 +100,9 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -97,6 +100,9 @@ class BaseCorrMM(gof.OpenMPOp):
if self._direction not in ["forward", "backprop weights", "backprop inputs"]: if self._direction not in ["forward", "backprop weights", "backprop inputs"]:
raise ValueError("_direction must be one of 'forward', " raise ValueError("_direction must be one of 'forward', "
"'backprop weights', 'backprop inputs'") "'backprop weights', 'backprop inputs'")
if num_groups < 1:
raise ValueError("Number of groups should be greater than 0")
self.num_groups = num_groups
@property @property
def pad(self): def pad(self):
...@@ -124,11 +130,12 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -124,11 +130,12 @@ class BaseCorrMM(gof.OpenMPOp):
padW = property(lambda self: self.pad[1]) padW = property(lambda self: self.pad[1])
def __str__(self): def __str__(self):
return '%s{%s, %s, %s}' % ( return '%s{%s, %s, %s, %s}' % (
self.__class__.__name__, self.__class__.__name__,
self.border_mode, self.border_mode,
str(self.subsample), str(self.subsample),
str(self.filter_dilation)) str(self.filter_dilation),
str(self.num_groups))
@staticmethod @staticmethod
def as_common_dtype(in1, in2): def as_common_dtype(in1, in2):
...@@ -138,6 +145,11 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -138,6 +145,11 @@ class BaseCorrMM(gof.OpenMPOp):
dtype = theano.scalar.upcast(in1.dtype, in2.dtype) dtype = theano.scalar.upcast(in1.dtype, in2.dtype)
return in1.astype(dtype), in2.astype(dtype) return in1.astype(dtype), in2.astype(dtype)
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'num_groups'):
self.num_groups = 1
def c_support_code(self): def c_support_code(self):
ccodes = blas_headers.blas_header_text() ccodes = blas_headers.blas_header_text()
if self.blas_type == 'openblas': if self.blas_type == 'openblas':
...@@ -167,7 +179,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -167,7 +179,7 @@ class BaseCorrMM(gof.OpenMPOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (6, self.openmp, blas_header_version()) return (7, self.openmp, blas_header_version())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -274,6 +286,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -274,6 +286,7 @@ class BaseCorrMM(gof.OpenMPOp):
int dilW = %(params)s->dilW; int dilW = %(params)s->dilW;
int padH = %(params)s->padH; int padH = %(params)s->padH;
int padW = %(params)s->padW; int padW = %(params)s->padW;
int numgroups = %(params)s->num_groups;
PyArrayObject * bottom = %(bottom)s; PyArrayObject * bottom = %(bottom)s;
PyArrayObject * weights = %(weights)s; PyArrayObject * weights = %(weights)s;
...@@ -386,7 +399,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -386,7 +399,7 @@ class BaseCorrMM(gof.OpenMPOp):
// output is weights: (num_filters, num_channels, height, width) // output is weights: (num_filters, num_channels, height, width)
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1 // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = (npy_intp)PyArray_DIMS(top)[1]; out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1]; out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
out_dim[2] = (npy_intp)kH; // already inferred further above out_dim[2] = (npy_intp)kH; // already inferred further above
out_dim[3] = (npy_intp)kW; // how convenient out_dim[3] = (npy_intp)kW; // how convenient
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
...@@ -409,7 +422,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -409,7 +422,7 @@ class BaseCorrMM(gof.OpenMPOp):
// output is bottom: (batchsize, num_channels, height, width) // output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0]; out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1] * numgroups;
out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH); out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW); out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
...@@ -465,7 +478,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -465,7 +478,7 @@ class BaseCorrMM(gof.OpenMPOp):
} }
// Call corrMM code // Call corrMM code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW); out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups );
if (out2==NULL){ if (out2==NULL){
%(fail)s %(fail)s
} }
...@@ -541,12 +554,14 @@ class CorrMM(BaseCorrMM): ...@@ -541,12 +554,14 @@ class CorrMM(BaseCorrMM):
top, = grads top, = grads
d_bottom = CorrMM_gradInputs(self.border_mode, d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)(weights, top, self.filter_dilation,
bottom.shape[-2:]) self.num_groups)(weights, top,
bottom.shape[-2:])
d_weights = CorrMM_gradWeights(self.border_mode, d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)(bottom, top, self.filter_dilation,
weights.shape[-2:]) self.num_groups)(bottom, top,
weights.shape[-2:])
return d_bottom, d_weights return d_bottom, d_weights
...@@ -600,6 +615,7 @@ class CorrMM_gradWeights(BaseCorrMM): ...@@ -600,6 +615,7 @@ class CorrMM_gradWeights(BaseCorrMM):
imshp = input_shape[0] imshp = input_shape[0]
topshp = input_shape[1] topshp = input_shape[1]
ssize, imshp = imshp[1], list(imshp[2:]) ssize, imshp = imshp[1], list(imshp[2:])
ssize = ssize // self.num_groups
nkern, topshp = topshp[1], list(topshp[2:]) nkern, topshp = topshp[1], list(topshp[2:])
height_width = node.inputs[-2:] height_width = node.inputs[-2:]
if ((dH != 1) or (padH == -1)): if ((dH != 1) or (padH == -1)):
...@@ -632,11 +648,13 @@ class CorrMM_gradWeights(BaseCorrMM): ...@@ -632,11 +648,13 @@ class CorrMM_gradWeights(BaseCorrMM):
weights, = grads weights, = grads
d_bottom = CorrMM_gradInputs(self.border_mode, d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)(weights, top, self.filter_dilation,
bottom.shape[-2:]) self.num_groups)(weights, top,
bottom.shape[-2:])
d_top = CorrMM(self.border_mode, d_top = CorrMM(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)(bottom, weights) self.filter_dilation,
self.num_groups)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) * 2 d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
if len(inp) == 4 else ()) if len(inp) == 4 else ())
return (d_bottom, d_top) + d_height_width return (d_bottom, d_top) + d_height_width
...@@ -678,8 +696,12 @@ class CorrMM_gradInputs(BaseCorrMM): ...@@ -678,8 +696,12 @@ class CorrMM_gradInputs(BaseCorrMM):
height_width = [as_tensor_variable(shape[0]).astype('int64'), height_width = [as_tensor_variable(shape[0]).astype('int64'),
as_tensor_variable(shape[1]).astype('int64')] as_tensor_variable(shape[1]).astype('int64')]
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1], if self.num_groups > 1:
False, False] broadcastable = [topgrad.type.broadcastable[0], False,
False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False]
dtype = kern.type.dtype dtype = kern.type.dtype
return Apply(self, [kern, topgrad] + height_width, return Apply(self, [kern, topgrad] + height_width,
[TensorType(dtype, broadcastable)()]) [TensorType(dtype, broadcastable)()])
...@@ -698,6 +720,7 @@ class CorrMM_gradInputs(BaseCorrMM): ...@@ -698,6 +720,7 @@ class CorrMM_gradInputs(BaseCorrMM):
kshp = input_shape[0] kshp = input_shape[0]
topshp = input_shape[1] topshp = input_shape[1]
ssize, kshp = kshp[1], list(kshp[2:]) ssize, kshp = kshp[1], list(kshp[2:])
ssize = ssize * self.num_groups
bsize, topshp = topshp[0], list(topshp[2:]) bsize, topshp = topshp[0], list(topshp[2:])
height_width = node.inputs[-2:] height_width = node.inputs[-2:]
if padH == -1: if padH == -1:
...@@ -738,12 +761,14 @@ class CorrMM_gradInputs(BaseCorrMM): ...@@ -738,12 +761,14 @@ class CorrMM_gradInputs(BaseCorrMM):
bottom, = grads bottom, = grads
d_weights = CorrMM_gradWeights(self.border_mode, d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)(bottom, self.filter_dilation,
top, self.num_groups)(bottom,
weights.shape[-2:]) top,
weights.shape[-2:])
d_top = CorrMM(self.border_mode, d_top = CorrMM(self.border_mode,
self.subsample, self.subsample,
self.filter_dilation)(bottom, weights) self.filter_dilation,
self.num_groups)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) * d_height_width = ((theano.gradient.DisconnectedType()(),) *
2 if len(inp) == 4 else ()) 2 if len(inp) == 4 else ())
return (d_weights, d_top) + d_height_width return (d_weights, d_top) + d_height_width
......
...@@ -106,7 +106,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -106,7 +106,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int dilH = 1, const int dilH = 1,
const int dilW = 1, const int dilW = 1,
const int padH = 0, const int padH = 0,
const int padW = 0) const int padW = 0,
const int numgroups = 1)
{ {
if (PyArray_NDIM(bottom) != 4) if (PyArray_NDIM(bottom) != 4)
{ {
...@@ -155,7 +156,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -155,7 +156,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int nFilters = PyArray_DIMS(weight)[0]; const int nFilters = PyArray_DIMS(weight)[0];
const int kH = PyArray_DIMS(weight)[2]; const int kH = PyArray_DIMS(weight)[2];
const int kW = PyArray_DIMS(weight)[3]; const int kW = PyArray_DIMS(weight)[3];
if (nChannels != PyArray_DIMS(weight)[1]) { if (nChannels != (PyArray_DIMS(weight)[1] * numgroups)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"CorrMM images and kernel must have the same stack size\n"); "CorrMM images and kernel must have the same stack size\n");
return NULL; return NULL;
...@@ -214,12 +215,16 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -214,12 +215,16 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
} }
// Define some useful variables // Define some useful variables
const int bottom_stride = PyArray_STRIDES(bottom)[0]/%(n_bytes)f; const int batch_bottom_stride = PyArray_STRIDES(bottom)[0]/%(n_bytes)f;
const int top_stride = PyArray_STRIDES(top)[0]/%(n_bytes)f; const int group_bottom_stride = (PyArray_STRIDES(bottom)[1] * nChannels / numgroups)/%(n_bytes)f;
const int K_ = col_dim[1]; const int batch_top_stride = PyArray_STRIDES(top)[0]/%(n_bytes)f;
const int group_top_stride = (PyArray_STRIDES(top)[1] * nFilters / numgroups)/%(n_bytes)f;
const int K_ = col_dim[1] / numgroups;
const int N_ = col_dim[2]; const int N_ = col_dim[2];
const int col_stride = (K_ * N_); const int col_stride = (K_ * N_ * numgroups);
const int M_ = nFilters; const int group_col_stride = (K_ * N_);
const int group_weight_stride = (PyArray_STRIDES(weight)[0] * nFilters / numgroups)/%(n_bytes)f;
const int M_ = nFilters / numgroups;
const %(c_float_type)s one = 1.0; const %(c_float_type)s one = 1.0;
const %(c_float_type)s zero = 0.0; const %(c_float_type)s zero = 0.0;
char NTrans = 'N'; char NTrans = 'N';
...@@ -253,17 +258,19 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -253,17 +258,19 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
for (int n = 0; n < batchSize; ++n) { for (int n = 0; n < batchSize; ++n) {
int tid = %(omp_get_thread_num)s; int tid = %(omp_get_thread_num)s;
// First, im2col // First, im2col
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight, im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride, nChannels,
bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW, bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride); (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
// Second, gemm for ( int g = 0; g < numgroups; ++g){
%(gemm)s(&NTrans, &NTrans, // Second, gemm
&N_, &M_, &K_, %(gemm)s(&NTrans, &NTrans,
&one, &N_, &M_, &K_,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride, &N_, &one,
(%(float_type)s*)PyArray_DATA(weight), &K_, (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
&zero, (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
(%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_); &zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);
}
} }
// Restore to previous blas threads // Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved); %(blas_set_num_threads)s(blas_threads_saved);
...@@ -304,7 +311,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -304,7 +311,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
output = weight; output = weight;
npy_intp weight_dim[2]; npy_intp weight_dim[2];
weight_dim[0] = (npy_intp)max_threads; weight_dim[0] = (npy_intp)max_threads;
weight_dim[1] = (npy_intp)(M_ * K_); weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2, PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2,
weight_dim, PyArray_TYPE(weight), 0); weight_dim, PyArray_TYPE(weight), 0);
...@@ -326,21 +333,23 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -326,21 +333,23 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
for (int n = 0; n < batchSize; ++n) { for (int n = 0; n < batchSize; ++n) {
int tid = %(omp_get_thread_num)s; int tid = %(omp_get_thread_num)s;
// First, im2col // First, im2col
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight, im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride,
bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW, nChannels, bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride); (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
// Second, gemm for(int g = 0; g < numgroups; ++g){
// Note that we accumulate into weight. We do so by setting beta = 0 // Second, gemm
// for the first iteration and beta = 1 for subsequent ones. (This // Note that we accumulate into weight. We do so by setting beta = 0
// is faster than setting weight to all zeros before the loop.) // for the first iteration and beta = 1 for subsequent ones. (This
%(gemm)s(&Trans, &NTrans, // is faster than setting weight to all zeros before the loop.)
&K_, &M_, &N_, %(gemm)s(&Trans, &NTrans,
&one, &K_, &M_, &N_,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride, &N_, &one,
(%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_, (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(n == 0) ? &zero : &one, (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(%(float_type)s*)PyArray_DATA(local_weight) + (n == 0) ? &zero : &one,
tid * weight_dim[1], &K_); (%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride +
tid * weight_dim[1], &K_);
}
} }
// Restore to previous blas threads // Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved); %(blas_set_num_threads)s(blas_threads_saved);
...@@ -401,19 +410,21 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -401,19 +410,21 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
%(blas_set_num_threads)s(1); %(blas_set_num_threads)s(1);
%(omp_flags)s %(omp_flags)s
for (int n = 0; n < batchSize; ++n) { for (int n = 0; n < batchSize; ++n) {
// gemm into columns
int tid = %(omp_get_thread_num)s; int tid = %(omp_get_thread_num)s;
%(gemm)s(&NTrans, &Trans, for ( int g = 0;g < numgroups; ++g){
&N_, &K_, &M_, // gemm into columns
&one, %(gemm)s(&NTrans, &Trans,
(%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_, &N_, &K_, &M_,
(%(float_type)s*)PyArray_DATA(weight), &K_, &one,
&zero, (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride, &N_); (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
}
// col2im back to the data // col2im back to the data
col2im((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels, bottomHeight, bottomWidth, col2im((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels, bottomHeight, bottomWidth,
kH, kW, dilH, dilW, padH, padW, kH, kW, dilH, dilW, padH, padW,
dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride); dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride);
} }
// Restore to previous blas threads // Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved); %(blas_set_num_threads)s(blas_threads_saved);
......
...@@ -88,7 +88,9 @@ def local_abstractconv_gemm(node): ...@@ -88,7 +88,9 @@ def local_abstractconv_gemm(node):
kern = kern[:, :, ::-1, ::-1] kern = kern[:, :, ::-1, ::-1]
rval = CorrMM(border_mode=node.op.border_mode, rval = CorrMM(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(img, kern) filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(img, kern)
copy_stack_trace(node.outputs[0], rval) copy_stack_trace(node.outputs[0], rval)
return [rval] return [rval]
...@@ -133,7 +135,8 @@ def local_abstractconv_gradweight_gemm(node): ...@@ -133,7 +135,8 @@ def local_abstractconv_gradweight_gemm(node):
rval = CorrMM_gradWeights(border_mode=node.op.border_mode, rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(img, topgrad, shape) filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(img, topgrad, shape)
copy_stack_trace(node.outputs[0], rval) copy_stack_trace(node.outputs[0], rval)
# need to flip the kernel if necessary # need to flip the kernel if necessary
...@@ -190,8 +193,9 @@ def local_abstractconv_gradinputs_gemm(node): ...@@ -190,8 +193,9 @@ def local_abstractconv_gradinputs_gemm(node):
kern = kern[:, :, ::-1, ::-1] kern = kern[:, :, ::-1, ::-1]
rval = CorrMM_gradInputs(border_mode=node.op.border_mode, rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample, subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(kern, topgrad, filter_dilation=node.op.filter_dilation,
shape) num_groups=node.op.num_groups)(kern, topgrad,
shape)
copy_stack_trace(node.outputs[0], rval) copy_stack_trace(node.outputs[0], rval)
return [rval] return [rval]
...@@ -238,6 +242,8 @@ def local_conv2d_cpu(node): ...@@ -238,6 +242,8 @@ def local_conv2d_cpu(node):
if not node.op.filter_flip: if not node.op.filter_flip:
# Not tested yet # Not tested yet
return None return None
if node.op.num_groups > 1:
return None
rval = conv2d(img, kern, rval = conv2d(img, kern,
node.op.imshp, node.op.kshp, node.op.imshp, node.op.kshp,
...@@ -295,6 +301,8 @@ def local_conv2d_gradweight_cpu(node): ...@@ -295,6 +301,8 @@ def local_conv2d_gradweight_cpu(node):
if not node.op.filter_flip: if not node.op.filter_flip:
# Not tested yet # Not tested yet
return return
if node.op.num_groups > 1:
return None
if node.op.border_mode == 'valid' and \ if node.op.border_mode == 'valid' and \
(node.op.subsample != (1, 1)): (node.op.subsample != (1, 1)):
...@@ -447,6 +455,8 @@ def local_conv2d_gradinputs_cpu(node): ...@@ -447,6 +455,8 @@ def local_conv2d_gradinputs_cpu(node):
if not node.op.filter_flip: if not node.op.filter_flip:
# Not tested yet # Not tested yet
return None return None
if node.op.num_groups > 1:
return None
# Conv 3d implementation, needed when subsample > 2 # Conv 3d implementation, needed when subsample > 2
if node.op.border_mode == 'valid' and node.op.subsample != (1, 1): if node.op.border_mode == 'valid' and node.op.subsample != (1, 1):
......
...@@ -1699,3 +1699,158 @@ class TestConv2dGrads(unittest.TestCase): ...@@ -1699,3 +1699,158 @@ class TestConv2dGrads(unittest.TestCase):
) )
f_new = theano.function([self.x, self.output_grad_wrt], conv_wrt_w_out) f_new = theano.function([self.x, self.output_grad_wrt], conv_wrt_w_out)
utt.assert_allclose(f_new(input_val, out_grad_val), f_old(input_val, filter_val, out_grad_val)) utt.assert_allclose(f_new(input_val, out_grad_val), f_old(input_val, filter_val, out_grad_val))
class Grouped_conv_noOptim(unittest.TestCase):
conv2d = theano.tensor.nnet.abstract_conv.AbstractConv2d
conv2d_gradw = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
conv2d_gradi = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
conv2d_op = theano.tensor.nnet.abstract_conv.AbstractConv2d
conv2d_gradw_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
conv2d_gradi_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
mode = theano.Mode(optimizer=None)
flip_filter = False
is_dnn = False
def setUp(self):
self.num_groups = [3, 2, 4, 4]
self.border_mode = 'valid'
self.subsample = (1, 1)
self.img_shape = [(5, 6, 5, 5), (4, 4, 7, 5), (3, 8, 5, 3), (2, 4, 7, 7)]
self.kern_shape = [(6, 2, 3, 3), (6, 2, 5, 3), (4, 2, 3, 3), (4, 1, 3, 5)]
self.top_shape = [(5, 6, 3, 3), (4, 6, 3, 3), (3, 4, 3, 1), (2, 4, 5, 3)]
self.filter_dilation = (1, 1)
self.ref_mode = 'FAST_RUN'
if theano.config.cxx == "":
raise SkipTest("CorrMM needs cxx")
def test_fwd(self):
img_sym = theano.tensor.tensor4('img')
kern_sym = theano.tensor.tensor4('kern')
for imshp, kshp, groups in zip(self.img_shape, self.kern_shape, self.num_groups):
img = np.random.random(imshp).astype(theano.config.floatX)
kern = np.random.random(kshp).astype(theano.config.floatX)
split_imgs = np.split(img, groups, axis=1)
split_kern = np.split(kern, groups, axis=0)
grouped_conv_op = self.conv2d(border_mode=self.border_mode,
subsample=self.subsample,
filter_dilation=self.filter_dilation,
num_groups=groups)
if self.flip_filter:
grouped_conv_output = grouped_conv_op(img_sym, kern_sym[:, :, ::-1, ::-1])
else:
grouped_conv_output = grouped_conv_op(img_sym, kern_sym)
grouped_func = theano.function([img_sym, kern_sym], grouped_conv_output, mode=self.mode)
assert any([isinstance(node.op, self.conv2d_op)
for node in grouped_func.maker.fgraph.toposort()])
grouped_output = grouped_func(img, kern)
ref_conv_op = conv2d_corr(img_sym,
kern_sym,
border_mode=self.border_mode,
subsample=self.subsample,
filter_dilation=self.filter_dilation)
ref_func = theano.function([img_sym, kern_sym], ref_conv_op,
mode=self.ref_mode)
ref_concat_output = [ref_func(img_arr, kern_arr)
for img_arr, kern_arr in zip(split_imgs, split_kern)]
ref_concat_output = np.concatenate(ref_concat_output, axis=1)
utt.assert_allclose(grouped_output, ref_concat_output)
utt.verify_grad(grouped_conv_op,
[img, kern],
mode=self.mode,
eps=1)
def test_gradweights(self):
img_sym = theano.tensor.tensor4('img')
top_sym = theano.tensor.tensor4('top')
for imshp, kshp, tshp, groups in zip(self.img_shape, self.kern_shape, self.top_shape, self.num_groups):
img = np.random.random(imshp).astype(theano.config.floatX)
top = np.random.random(tshp).astype(theano.config.floatX)
split_imgs = np.split(img, groups, axis=1)
split_top = np.split(top, groups, axis=1)
grouped_convgrad_op = self.conv2d_gradw(border_mode=self.border_mode,
subsample=self.subsample,
filter_dilation=self.filter_dilation,
num_groups=groups)
grouped_conv_output = grouped_convgrad_op(img_sym,
top_sym,
tensor.as_tensor_variable(kshp if self.is_dnn else kshp[-2:]))
if self.flip_filter:
grouped_conv_output = grouped_conv_output[:, :, ::-1, ::-1]
grouped_func = theano.function([img_sym, top_sym], grouped_conv_output, mode=self.mode)
assert any([isinstance(node.op, self.conv2d_gradw_op)
for node in grouped_func.maker.fgraph.toposort()])
grouped_output = grouped_func(img, top)
ref_conv_op = conv2d_corr_gw(img_sym,
top_sym,
kshp,
border_mode=self.border_mode,
subsample=self.subsample,
filter_dilation=self.filter_dilation)
ref_func = theano.function([img_sym, top_sym], ref_conv_op,
mode=self.ref_mode)
ref_concat_output = [ref_func(img_arr, top_arr)
for img_arr, top_arr in zip(split_imgs, split_top)]
ref_concat_output = np.concatenate(ref_concat_output, axis=0)
utt.assert_allclose(grouped_output, ref_concat_output)
def conv_gradweight(inputs_val, output_val):
return grouped_convgrad_op(inputs_val, output_val,
tensor.as_tensor_variable(kshp if self.is_dnn else kshp[-2:]))
utt.verify_grad(conv_gradweight,
[img, top],
mode=self.mode, eps=1)
def test_gradinputs(self):
kern_sym = theano.tensor.tensor4('kern')
top_sym = theano.tensor.tensor4('top')
for imshp, kshp, tshp, groups in zip(self.img_shape, self.kern_shape, self.top_shape, self.num_groups):
kern = np.random.random(kshp).astype(theano.config.floatX)
top = np.random.random(tshp).astype(theano.config.floatX)
split_kerns = np.split(kern, groups, axis=0)
split_top = np.split(top, groups, axis=1)
grouped_convgrad_op = self.conv2d_gradi(border_mode=self.border_mode,
subsample=self.subsample,
filter_dilation=self.filter_dilation,
num_groups=groups)
if self.flip_filter:
grouped_conv_output = grouped_convgrad_op(kern_sym[:, :, ::-1, ::-1], top_sym, tensor.as_tensor_variable(imshp[-2:]))
else:
grouped_conv_output = grouped_convgrad_op(kern_sym,
top_sym,
tensor.as_tensor_variable(imshp if self.is_dnn else imshp[-2:]))
grouped_func = theano.function([kern_sym, top_sym], grouped_conv_output, mode=self.mode)
assert any([isinstance(node.op, self.conv2d_gradi_op)
for node in grouped_func.maker.fgraph.toposort()])
grouped_output = grouped_func(kern, top)
ref_conv_op = conv2d_corr_gi(kern_sym,
top_sym,
imshp,
border_mode=self.border_mode,
subsample=self.subsample,
filter_dilation=self.filter_dilation)
ref_func = theano.function([kern_sym, top_sym], ref_conv_op,
mode=self.ref_mode)
ref_concat_output = [ref_func(kern_arr, top_arr)
for kern_arr, top_arr in zip(split_kerns, split_top)]
ref_concat_output = np.concatenate(ref_concat_output, axis=1)
utt.assert_allclose(grouped_output, ref_concat_output)
def conv_gradinputs(filters_val, output_val):
return grouped_convgrad_op(filters_val, output_val,
tensor.as_tensor_variable(imshp if self.is_dnn else imshp[-2:]))
utt.verify_grad(conv_gradinputs,
[kern, top],
mode=self.mode, eps=1)
...@@ -10,6 +10,7 @@ import theano ...@@ -10,6 +10,7 @@ import theano
import theano.tensor as T import theano.tensor as T
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, conv from theano.tensor.nnet import corr, conv
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
class TestCorr2D(utt.InferShapeTester): class TestCorr2D(utt.InferShapeTester):
...@@ -416,6 +417,49 @@ class TestCorr2D(utt.InferShapeTester): ...@@ -416,6 +417,49 @@ class TestCorr2D(utt.InferShapeTester):
self.validate((3, 2, 7, 5), (5, 2, 2, 3), 2, non_contiguous=True) self.validate((3, 2, 7, 5), (5, 2, 2, 3), 2, non_contiguous=True)
class TestGroupCorr2d(Grouped_conv_noOptim):
if theano.config.mode == "FAST_COMPILE":
mode = theano.compile.get_mode("FAST_RUN")
else:
mode = None
conv2d = corr.CorrMM
conv2d_gradw = corr.CorrMM_gradWeights
conv2d_gradi = corr.CorrMM_gradInputs
conv2d_op = corr.CorrMM
conv2d_gradw_op = corr.CorrMM_gradWeights
conv2d_gradi_op = corr.CorrMM_gradInputs
flip_filter = True
is_dnn = False
def test_graph(self):
# define common values first
groups = 3
bottom = np.random.rand(3, 6, 5, 5).astype(theano.config.floatX)
kern = np.random.rand(9, 2, 3, 3).astype(theano.config.floatX)
bottom_sym = T.tensor4('bottom')
kern_sym = T.tensor4('kern')
# grouped convolution graph
conv_group = self.conv2d(num_groups=groups)(bottom_sym, kern_sym)
gconv_func = theano.function([bottom_sym, kern_sym], conv_group, mode=self.mode)
# Graph for the normal hard way
kern_offset = kern_sym.shape[0] // groups
bottom_offset = bottom_sym.shape[1] // groups
split_conv_output = [self.conv2d()(bottom_sym[:, i * bottom_offset:(i + 1) * bottom_offset, :, :],
kern_sym[i * kern_offset:(i + 1) * kern_offset, :, :, :])
for i in range(groups)]
concatenated_output = T.concatenate(split_conv_output, axis=1)
conv_func = theano.function([bottom_sym, kern_sym], concatenated_output, mode=self.mode)
# calculate outputs for each graph
gconv_output = gconv_func(bottom, kern)
conv_output = conv_func(bottom, kern)
# compare values
utt.assert_allclose(gconv_output, conv_output)
if __name__ == '__main__': if __name__ == '__main__':
t = TestCorr2D('setUp') t = TestCorr2D('setUp')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论