Commit 7806823d authored by notoraptor

Apply DEBUG_TIMING instrumentation changes to dnn_gi and dnn_gw.

Parent 403865ea
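
The new code paths below call a TheanoTimer helper with start(), end(), and a milliseconds member; its definition is not part of this diff. A minimal sketch of what such a helper could look like, assuming plain host-side wall-clock timing with C++11 <chrono> (the real Theano helper may differ):

#include <chrono>

// Hypothetical stand-in for the TheanoTimer used in the diff below:
// measures wall-clock time between start() and end() and exposes the
// elapsed duration through the `milliseconds` field.
struct TheanoTimer {
    std::chrono::steady_clock::time_point t0;
    double milliseconds = 0.0;
    void start() { t0 = std::chrono::steady_clock::now(); }
    void end() {
        std::chrono::duration<double, std::milli> d =
            std::chrono::steady_clock::now() - t0;
        milliseconds = d.count();
    }
};
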
@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0;
hash_prefix = std::string("GI|GPU#");
#ifdef DEBUG_TIMING
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
  if (PARAMS->choose_time) {
    selection_name = "fastest";
  } else {
    selection_name = "best suited";
  }
}
#endif
#section support_code_struct
#line 9 "dnn_gi.c"
#line 22 "dnn_gi.c"
int reuse_algo;
AlgoRec prev_algo;
std::string hash_prefix;
@@ -15,6 +28,13 @@ std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
#ifdef DEBUG_TIMING
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Check the given algorithm against the inputs and convolution descriptor,
changing the algorithm in place to a fallback algorithm if the checks fail.
@@ -86,6 +106,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
#ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
TheanoTimer timer;
#endif
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
@@ -159,11 +185,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
std::string hashkey;
-size_t maxfree = c_get_largest_free_block_size(c);
-if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
+size_t maxfree = c_get_largest_free_block_size(c);
+if (PyErr_Occurred()) return 1;
if (params->choose_algo) {
if (!reuse_algo) {
@@ -211,11 +238,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
ip = pygpu_empty(PyGpuArray_NDIM(*input), PyGpuArray_DIMS(*input), (*input)->ga.typecode, GA_C_ORDER, c, Py_None);
}
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(ip),
1, &count, &choice, *(void **)tmpmem, maxfree);
#ifdef DEBUG_TIMING
timer.end();
#endif
gpudata_release(tmpmem);
if (beta != 0) {
Py_XDECREF(ip);
@@ -248,10 +281,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
mathtype = choice.mathType;
#endif
} else {
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
timer.end();
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
@@ -259,6 +298,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
}
}
#ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds;
++n_selections;
#endif
}
}
@@ -313,7 +356,18 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
use_cached ? "(cache)" : "",
worksize,
hashkey.c_str()
);
#endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) {
  // We have just selected an algorithm at runtime;
  // `timer` still holds the duration of that selection step.
  fprintf(stderr, "\t(selected %s gradinput algo in %g milliseconds)\n", selection_name, timer.milliseconds);
  if (n_selections > 1) {
    fprintf(stderr, "\t(selected %zu gradinput algos in %g milliseconds (average: %g milliseconds per selection))\n",
            n_selections, total_selection_time, total_selection_time / n_selections);
  }
}
#endif
if (!reuse_algo) {
@@ -348,6 +402,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*input)->ga);
timer.start();
#endif
for (int g = 0; g < groups; g++) {
err = cudnnConvolutionBackwardData(
params->handle,
@@ -368,6 +427,13 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*input)->ga);
timer.end();
total_computation_time += timer.milliseconds;
++n_computations;
#endif
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
@@ -375,5 +441,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnGetErrorString(err));
return 1;
}
#ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran gradinput algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) {
  fprintf(stderr, "\t(ran %zu gradinput computations in %g milliseconds (average: %g milliseconds per call))\n",
          n_computations, total_computation_time, total_computation_time / n_computations);
}
#endif
return 0;
}
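
With DEBUG_TIMING defined, the gradinput path above reports its measurements on stderr through the fprintf calls shown. A hypothetical transcript (the format strings come from the code above; the numbers are made up for illustration):

	(selected fastest gradinput algo in 2.14 milliseconds)
	(ran gradinput algo in 0.87 milliseconds)
	(ran 3 gradinput computations in 2.52 milliseconds (average: 0.84 milliseconds per call))

The same instrumentation is mirrored for the gradweight kernel in dnn_gw.c below.
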
@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0;
hash_prefix = std::string("GW|GPU#");
#ifdef DEBUG_TIMING
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
  if (PARAMS->choose_time) {
    selection_name = "fastest";
  } else {
    selection_name = "best suited";
  }
}
#endif
#section support_code_struct
#line 9 "dnn_gw.c"
#line 22 "dnn_gw.c"
int reuse_algo;
AlgoRec prev_algo;
std::string hash_prefix;
@@ -15,6 +28,13 @@ std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
#ifdef DEBUG_TIMING
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Check the given algorithm against the inputs and convolution descriptor,
changing the algorithm in place to a fallback algorithm if the checks fail.
@@ -73,6 +93,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
#ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
TheanoTimer timer;
#endif
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError,
@@ -146,11 +172,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
std::string hashkey;
-size_t maxfree = c_get_largest_free_block_size(c);
-if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
+size_t maxfree = c_get_largest_free_block_size(c);
+if (PyErr_Occurred()) return 1;
if (params->choose_algo) {
if (!reuse_algo) {
@@ -198,11 +225,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
k = pygpu_empty(PyGpuArray_NDIM(*kerns), PyGpuArray_DIMS(*kerns), (*kerns)->ga.typecode, GA_C_ORDER, c, Py_None);
}
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(k),
1, &count, &choice, *(void **)tmpmem, maxfree);
#ifdef DEBUG_TIMING
timer.end();
#endif
gpudata_release(tmpmem);
if (beta != 0) {
Py_XDECREF(k);
@@ -237,10 +270,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
mathtype = choice.mathType;
#endif
} else {
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
timer.end();
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
@@ -249,6 +288,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
}
}
#ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds;
++n_selections;
#endif
}
} /* choose_algo */
@@ -305,6 +348,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
hashkey.c_str()
);
#endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) {
  // We have just selected an algorithm at runtime;
  // `timer` still holds the duration of that selection step.
  fprintf(stderr, "\t(selected %s gradweight algo in %g milliseconds)\n", selection_name, timer.milliseconds);
  if (n_selections > 1) {
    fprintf(stderr, "\t(selected %zu gradweight algos in %g milliseconds (average: %g milliseconds per selection))\n",
            n_selections, total_selection_time, total_selection_time / n_selections);
  }
}
#endif
if (!reuse_algo) {
// save for next time/cache
@@ -339,6 +393,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*kerns)->ga);
timer.start();
#endif
for (int g = 0; g < groups; g++) {
err = cudnnConvolutionBackwardFilter(
params->handle,
@@ -359,6 +418,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*kerns)->ga);
timer.end();
total_computation_time += timer.milliseconds;
++n_computations;
#endif
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
@@ -366,5 +432,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudnnGetErrorString(err));
return 1;
}
#ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran gradweight algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) {
  fprintf(stderr, "\t(ran %zu gradweight computations in %g milliseconds (average: %g milliseconds per call))\n",
          n_computations, total_computation_time, total_computation_time / n_computations);
}
#endif
return 0;
}
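
A note on the measurement pattern shared by both files: cuDNN launches its kernels asynchronously with respect to the host, so each computation timing is bracketed by GpuArray_sync calls, one before timer.start() so previously queued work does not leak into the measurement, and one before timer.end() so the timer stops only after the launched work has actually finished. A generic sketch of this sync-bracket pattern (sync_output and launch_async_work are placeholder names, not Theano or cuDNN APIs):

// Sketch of the sync-bracket pattern used by the DEBUG_TIMING blocks:
sync_output();        // drain GPU work queued before the region of interest
timer.start();        // host clock starts on an idle queue
launch_async_work();  // asynchronous launches return to the host immediately
sync_output();        // block until the launched GPU work completes
timer.end();          // timer.milliseconds now covers the GPU work itself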