Commit 70b1100c authored by Arnaud Bergeron

Move the gradients of convolution over to v3 and fix the test for gradI

Parent 59dcaf9c
......@@ -12,7 +12,7 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
#ifdef CUDNN_VERSION > 3000
#if CUDNN_VERSION > 3000
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
......@@ -64,7 +64,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
#ifdef CUDNN_VERSION > 3000
#if CUDNN_VERSION > 3000
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
......
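A note on the two hunks above: `#ifdef` only tests whether a macro name is defined and cannot evaluate a comparison, so the old `#ifdef CUDNN_VERSION > 3000` would enable the GA_HALF case on any cuDNN version (most compilers warn about the trailing tokens and then ignore them). The corrected `#if` actually compares the value. A minimal sketch of the difference, using an illustrative version value:

```c
#include <stdio.h>

#define CUDNN_VERSION 3000  /* illustrative: pretend we are on cuDNN v3 */

int main(void) {
#ifdef CUDNN_VERSION > 3000  /* the old form: most compilers warn about the
                                trailing "> 3000" and treat this as a plain
                                #ifdef, so this branch runs even on 3000 */
  puts("ifdef branch taken");
#endif
#if CUDNN_VERSION > 3000     /* the fixed form: evaluates the comparison,
                                so this branch is skipped on 3000 */
  puts("if branch taken");
#endif
  return 0;
}
```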
......@@ -11,12 +11,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
void *beta_p;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
"stack size");
return 1;
}
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
......@@ -27,6 +27,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
beta_p = (void *)&beta;
break;
case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
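The GA_HALF case added above deliberately falls through to the GA_FLOAT branch: cuDNN takes the alpha/beta scaling factors as host scalars typed to the computation precision, which is single precision for both float and half tensors. A self-contained sketch of that selection logic (the GA_* values and the 1/0 defaults are illustrative assumptions):

```c
/* cuDNN expects alpha/beta as host scalars matching the computation type:
   double for double tensors, float for float *and* half tensors, hence
   the shared branch below. */
enum { GA_DOUBLE, GA_FLOAT, GA_HALF };

static void pick_scaling(int typecode, void **alpha_p, void **beta_p) {
  static double alpha = 1.0, beta = 0.0;  /* used for GA_DOUBLE */
  static float af = 1.0f, bf = 0.0f;      /* used for GA_FLOAT and GA_HALF */
  switch (typecode) {
  case GA_DOUBLE: *alpha_p = &alpha; *beta_p = &beta; break;
  case GA_FLOAT:
  case GA_HALF:   *alpha_p = &af;    *beta_p = &bf;   break;
  }
}
```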
......@@ -48,19 +49,142 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
#endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
err = cudnnConvolutionBackwardData(
cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
#ifdef CHOOSE_ALGO
static int reuse_algo = 0;
static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_kern_dims[5] = {0};
static size_t prev_top_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 0) > 1024 || PyGpuArray_DIM(*input, 1) > 1024 ||
(PyGpuArray_DIM(kerns, 0) == 1 && PyGpuArray_DIM(kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
return 1;
}
}
err = cudnnConvolutionBackwardData_v3(
_handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc,
desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
if (worksize != 0)
c->ops->buffer_release(workspace);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
......
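The bulk of the new conv_gi code is the `CHOOSE_ALGO` block: under `CHOOSE_TIME` it benchmarks candidates with `cudnnFindConvolutionBackwardDataAlgorithm`, otherwise it asks `cudnnGetConvolutionBackwardDataAlgorithm` for the best algorithm that fits in currently free GPU memory (`CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT` with the `cudaMemGetInfo` result). With `CHOOSE_ONCE` the first choice is reused for the lifetime of the op; otherwise it is reused only while the kernel and output shapes match the previous call. A condensed, compilable sketch of that caching control flow, with the cuDNN calls replaced by hypothetical stubs:

```c
#include <stddef.h>
#include <string.h>

typedef int algo_t;  /* stands in for cudnnConvolutionBwdDataAlgo_t */

/* Hypothetical stubs for the two selection paths in the hunk above. */
static algo_t pick_algo_by_timing(void)    { return 1; }  /* cudnnFind... */
static algo_t pick_algo_by_heuristic(void) { return 0; }  /* cudnnGet...  */

/* Re-run algorithm selection only when the kernel/output shapes change,
   mirroring the CHOOSE_ALGO-without-CHOOSE_ONCE behaviour. */
static algo_t select_algo(const size_t kern_dims[5], const size_t top_dims[5],
                          unsigned nd, int use_timing) {
  static int have_prev = 0;
  static algo_t prev_algo = 0;
  static size_t prev_kern[5], prev_top[5];

  int reuse = have_prev;
  for (unsigned i = 0; i < nd; i++)
    reuse = reuse && kern_dims[i] == prev_kern[i]
                  && top_dims[i] == prev_top[i];

  if (!reuse) {
    prev_algo = use_timing ? pick_algo_by_timing()
                           : pick_algo_by_heuristic();
    memcpy(prev_kern, kern_dims, nd * sizeof(size_t));
    memcpy(prev_top, top_dims, nd * sizeof(size_t));
    have_prev = 1;
  }
  return prev_algo;
}
```

After selection, the `CUDNN_VERSION > 3000` guard demotes `CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT` to `..._ALGO_0` unless the checks above hold (unit strides, the tested dimensions at most 1024, and a kernel that is not 1x1), since the FFT path only supports that restricted case.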
#section support_code_struct
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
......@@ -16,9 +16,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
switch (input->ga.typecode) {
......@@ -27,6 +27,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
beta_p = (void *)&beta;
break;
case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
......@@ -51,16 +52,140 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
err = cudnnConvolutionBackwardFilter(
cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
#ifdef CHOOSE_ALGO
static int reuse_algo = 0;
static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_img_dims[5] = {0};
static size_t prev_top_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 0) > 1024 || PyGpuArray_DIM(input, 1) > 1024 ||
(PyGpuArray_DIM(*kerns, 0) == 1 && PyGpuArray_DIM(*kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
return 1;
}
}
err = cudnnConvolutionBackwardFilter_v3(
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc,
desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
if (worksize != 0)
c->ops->buffer_release(workspace);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
......
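Both `cudnnConvolutionBackwardData_v3` and `cudnnConvolutionBackwardFilter_v3` follow the same four-step workspace protocol seen in the two files above: query the size needed by the chosen algorithm, allocate scratch memory through the gpuarray buffer ops only if the size is non-zero, run the convolution with the algorithm and workspace, and release the buffer afterwards. A stripped-down, compilable sketch of that protocol, with hypothetical stubs in place of the cuDNN and buffer-ops calls:

```c
#include <stddef.h>
#include <stdlib.h>

typedef struct gpudata_ gpudata;  /* stand-in for libgpuarray's opaque handle */

/* Hypothetical stubs for c->ops->buffer_alloc / buffer_release and
   cudnnGet...WorkspaceSize / the _v3 convolution call. */
static gpudata *buffer_alloc(size_t n)    { return malloc(n ? n : 1); }
static void buffer_release(gpudata *d)    { free(d); }
static size_t query_worksize(int algo)    { (void)algo; return 4096; }
static int run_conv_v3(int algo, gpudata *ws, size_t n) {
  (void)algo; (void)ws; (void)n; return 0;
}

int run_with_workspace(int algo) {
  size_t worksize = query_worksize(algo);
  gpudata *workspace = NULL;

  /* Allocate scratch memory only when the algorithm needs any. */
  if (worksize != 0) {
    workspace = buffer_alloc(worksize);
    if (workspace == NULL)
      return 1;  /* "Could not allocate working memory" */
  }

  int err = run_conv_v3(algo, workspace, worksize);

  /* The workspace is live only for the duration of the call. */
  if (worksize != 0)
    buffer_release(workspace);
  return err;
}
```

In the actual code the allocation goes through `c->ops->buffer_alloc` on the default context, and the `gpudata` handle is dereferenced as `*(void **)workspace` to recover the raw device pointer that cuDNN expects.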
......@@ -467,42 +467,41 @@ class TestDnnInferShapes(utt.InferShapeTester):
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(3, 4, 5, 6),
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(13, 14, 15, 16),
dtype='float32'
)
out_vals = numpy.asarray(
numpy.random.rand(3, 13, 5, 6),
dtype='float32'
)
for params in product(
['valid'], # Should this work for 'full'?
[(1, 1)],
['conv', 'cross']
):
temp_kerns = kerns.dimshuffle(1, 0, 2, 3)
shape = (
img_val.shape[0], kern_vals.shape[1],
img_val.shape[2] + kern_vals.shape[2] - 1,
img_val.shape[3] + kern_vals.shape[3] - 1
out_vals.shape[0], kern_vals.shape[1],
out_vals.shape[2] + kern_vals.shape[2] - 1,
out_vals.shape[3] + kern_vals.shape[3] - 1
)
out_vals = numpy.zeros(shape, dtype='float32')
img_vals = numpy.zeros(shape, dtype='float32')
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(out.shape, temp_kerns.shape)
)(out.shape, kerns.shape)
conv_grad_i = dnn.GpuDnnConvGradI()(
temp_kerns,
img,
kerns,
out,
img,
desc,
)
self._compile_and_check(
[temp_kerns, img, out],
[kerns, img, out],
[conv_grad_i],
[kern_vals, img_val, out_vals],
[kern_vals, img_vals, out_vals],
dnn.GpuDnnConvGradI
)
......
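The test fix replaces the arbitrary random `img_val` with an `img_vals` buffer whose shape is derived from `out_vals` and `kern_vals`: the gradient with respect to the inputs of a 'valid' convolution has the shape of the corresponding 'full' convolution, so each spatial dimension is `out + kern - 1`. A tiny illustrative check of that relation against the shapes used in the test:

```c
#include <assert.h>

/* Spatial size of the gradient w.r.t. the inputs ("full" convolution):
   in = out + kern - 1 for a "valid" convolution with unit strides. */
static size_t grad_input_dim(size_t out_dim, size_t kern_dim) {
  return out_dim + kern_dim - 1;
}

int main(void) {
  /* Shapes from the test: out_vals is (3, 13, 5, 6) and kern_vals is
     (13, 14, 15, 16), so img_vals ends up (3, 14, 19, 21). */
  assert(grad_input_dim(5, 15) == 19);
  assert(grad_input_dim(6, 16) == 21);
  return 0;
}
```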