提交 f241dfac authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6348 from notoraptor/debug-info-cudnn-conv-timing

Add DEBUG infos to profile cuDNN convolutions.
...@@ -85,6 +85,34 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache; ...@@ -85,6 +85,34 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#line 87 "dnn_conv_base.c" #line 87 "dnn_conv_base.c"
#ifdef DEBUG
#if __cplusplus < 201103L
const char* const _cppver = "No timing available: C++11 or later is required.";
#else
#define DEBUG_TIMING
#include <chrono>
const char* const _cppver = NULL;
struct TheanoTimer {
double milliseconds;
std::chrono::steady_clock::time_point base;
void start() {base = std::chrono::steady_clock::now();}
void end() {
milliseconds =
std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now() - base
).count() / 1000000.0;
}
};
#endif
#endif
pthread_mutex_t algoMutex; pthread_mutex_t algoMutex;
AlgoCache algoCache; AlgoCache algoCache;
......
...@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo; ...@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0; reuse_algo = 0;
hash_prefix = std::string("FWD|GPU#"); hash_prefix = std::string("FWD|GPU#");
#ifdef DEBUG_TIMING
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
if (PARAMS->choose_time) {
selection_name = "fastest";
} else {
selection_name = "best suited";
}
};
#endif
#section support_code_struct #section support_code_struct
#line 9 "dnn_fwd.c" #line 22 "dnn_fwd.c"
int reuse_algo; int reuse_algo;
AlgoRec prev_algo; AlgoRec prev_algo;
std::string hash_prefix; std::string hash_prefix;
...@@ -15,6 +28,13 @@ std::string hash_prefix; ...@@ -15,6 +28,13 @@ std::string hash_prefix;
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
#ifdef DEBUG_TIMING
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Check given algorithm against inputs and convolution descriptor, /** Check given algorithm against inputs and convolution descriptor,
change algorithm inplace to a fallback algorithm if checkings fail. change algorithm inplace to a fallback algorithm if checkings fail.
...@@ -121,6 +141,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -121,6 +141,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0; bool use_cached = 0;
#ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
TheanoTimer timer;
#endif
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -193,7 +219,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -193,7 +219,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_enter(c->ctx); cuda_enter(c->ctx);
size_t maxfree = c_get_largest_free_block_size(c); size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1; if (PyErr_Occurred()) {
cuda_exit(c->ctx);
return 1;
}
if (params->choose_algo) { if (params->choose_algo) {
...@@ -241,6 +270,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -241,6 +270,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
o = pygpu_empty(PyGpuArray_NDIM(*output), PyGpuArray_DIMS(*output), (*output)->ga.typecode, GA_C_ORDER, c, Py_None); o = pygpu_empty(PyGpuArray_NDIM(*output), PyGpuArray_DIMS(*output), (*output)->ga.typecode, GA_C_ORDER, c, Py_None);
} }
#ifdef DEBUG_TIMING
timer.start();
#endif
// We don't sync the buffer as we don't care about the values. // We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
...@@ -248,6 +280,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -248,6 +280,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o),
1, &count, &choice, *(void **)tmpmem, 1, &count, &choice, *(void **)tmpmem,
maxfree); maxfree);
#ifdef DEBUG_TIMING
timer.end();
#endif
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (beta != 0) { if (beta != 0) {
Py_XDECREF(o); Py_XDECREF(o);
...@@ -282,10 +317,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -282,10 +317,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
mathtype = choice.mathType; mathtype = choice.mathType;
#endif #endif
} else { } else {
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
timer.end();
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -294,6 +335,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -294,6 +335,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
} }
#ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds;
++n_selections;
#endif
} }
} }
...@@ -356,7 +401,18 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -356,7 +401,18 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
worksize, worksize,
hashkey.c_str() hashkey.c_str()
); );
#endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) {
// We have selected an algorithm at runtime.
// `timer` still contains timing about selection step.
fprintf(stderr, "\t(selected %s fwd algo in %g milliseconds)\n", selection_name, timer.milliseconds);
if (n_selections > 1) {
fprintf(stderr, "\t(selected %lu fwd algos in %g milliseconds (average: %g milliseconds per selection))\n",
n_selections, total_selection_time, total_selection_time / n_selections);
}
}
#endif #endif
if (!reuse_algo) { if (!reuse_algo) {
...@@ -377,11 +433,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -377,11 +433,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{ {
gpudata *workspace = 0; gpudata *workspace = 0;
/*
* This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module
* to place a nice get_work_mem() function in.
*/
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
...@@ -391,10 +442,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -391,10 +442,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
} }
if (worksize != 0)
cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*output)->ga);
timer.start();
#endif
for ( int g = 0; g < groups; g++) { for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
params->handle, params->handle,
...@@ -407,14 +465,23 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -407,14 +465,23 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g); APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
} }
if (worksize != 0) if (worksize != 0) {
cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
gpudata_release(workspace); gpudata_release(workspace);
}
cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
} }
#ifdef DEBUG_TIMING
GpuArray_sync(&(*output)->ga);
timer.end();
total_computation_time += timer.milliseconds;
++n_computations;
#endif
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -422,6 +489,13 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -422,6 +489,13 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
#ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) {
fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n",
n_computations, total_computation_time, total_computation_time / n_computations);
}
#endif
return 0; return 0;
} }
......
...@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo; ...@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0; reuse_algo = 0;
hash_prefix = std::string("GI|GPU#"); hash_prefix = std::string("GI|GPU#");
#ifdef DEBUG_TIMING
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
if (PARAMS->choose_time) {
selection_name = "fastest";
} else {
selection_name = "best suited";
}
};
#endif
#section support_code_struct #section support_code_struct
#line 9 "dnn_gi.c" #line 22 "dnn_gi.c"
int reuse_algo; int reuse_algo;
AlgoRec prev_algo; AlgoRec prev_algo;
std::string hash_prefix; std::string hash_prefix;
...@@ -15,6 +28,13 @@ std::string hash_prefix; ...@@ -15,6 +28,13 @@ std::string hash_prefix;
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
#ifdef DEBUG_TIMING
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Check given algorithm against inputs and convolution descriptor, /** Check given algorithm against inputs and convolution descriptor,
change algorithm inplace to a fallback algorithm if checkings fail. change algorithm inplace to a fallback algorithm if checkings fail.
...@@ -86,6 +106,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -86,6 +106,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0; bool use_cached = 0;
#ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
TheanoTimer timer;
#endif
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same " PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
...@@ -159,11 +185,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -159,11 +185,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
std::string hashkey; std::string hashkey;
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) {
cuda_exit(c->ctx);
return 1;
}
if (params->choose_algo) { if (params->choose_algo) {
if (!reuse_algo) { if (!reuse_algo) {
...@@ -211,11 +241,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -211,11 +241,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
ip = pygpu_empty(PyGpuArray_NDIM(*input), PyGpuArray_DIMS(*input), (*input)->ga.typecode, GA_C_ORDER, c, Py_None); ip = pygpu_empty(PyGpuArray_NDIM(*input), PyGpuArray_DIMS(*input), (*input)->ga.typecode, GA_C_ORDER, c, Py_None);
} }
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnFindConvolutionBackwardDataAlgorithmEx( err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(ip), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(ip),
1, &count, &choice, *(void **)tmpmem, maxfree); 1, &count, &choice, *(void **)tmpmem, maxfree);
#ifdef DEBUG_TIMING
timer.end();
#endif
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (beta != 0) { if (beta != 0) {
Py_XDECREF(ip); Py_XDECREF(ip);
...@@ -248,10 +284,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -248,10 +284,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
mathtype = choice.mathType; mathtype = choice.mathType;
#endif #endif
} else { } else {
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnGetConvolutionBackwardDataAlgorithm( err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input), desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo); CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
timer.end();
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
...@@ -259,6 +301,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -259,6 +301,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
} }
} }
#ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds;
++n_selections;
#endif
} }
} }
...@@ -313,7 +359,18 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -313,7 +359,18 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
worksize, worksize,
hashkey.c_str() hashkey.c_str()
); );
#endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) {
// We have selected an algorithm at runtime.
// `timer` still contains timing about selection step.
fprintf(stderr, "\t(selected %s gradinput algo in %g milliseconds)\n", selection_name, timer.milliseconds);
if (n_selections > 1) {
fprintf(stderr, "\t(selected %lu gradinput algos in %g milliseconds (average: %g milliseconds per selection))\n",
n_selections, total_selection_time, total_selection_time / n_selections);
}
}
#endif #endif
if (!reuse_algo) { if (!reuse_algo) {
...@@ -342,10 +399,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -342,10 +399,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
if (worksize != 0)
cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*input)->ga);
timer.start();
#endif
for ( int g = 0; g < groups; g++) { for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionBackwardData( err = cudnnConvolutionBackwardData(
params->handle, params->handle,
...@@ -357,13 +421,22 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -357,13 +421,22 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(*input)) + input_offset * g); APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(*input)) + input_offset * g);
} }
if (worksize != 0) if (worksize != 0) {
cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
gpudata_release(workspace); gpudata_release(workspace);
}
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*input)->ga);
timer.end();
total_computation_time += timer.milliseconds;
++n_computations;
#endif
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -371,5 +444,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -371,5 +444,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
#ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran gradinput algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) {
fprintf(stderr, "\t(ran %lu gradinput computations in %g milliseconds (average: %g milliseconds per call))\n",
n_computations, total_computation_time, total_computation_time / n_computations);
}
#endif
return 0; return 0;
} }
...@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo; ...@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0; reuse_algo = 0;
hash_prefix = std::string("GW|GPU#"); hash_prefix = std::string("GW|GPU#");
#ifdef DEBUG_TIMING
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
if (PARAMS->choose_time) {
selection_name = "fastest";
} else {
selection_name = "best suited";
}
};
#endif
#section support_code_struct #section support_code_struct
#line 9 "dnn_gw.c" #line 22 "dnn_gw.c"
int reuse_algo; int reuse_algo;
AlgoRec prev_algo; AlgoRec prev_algo;
std::string hash_prefix; std::string hash_prefix;
...@@ -15,6 +28,13 @@ std::string hash_prefix; ...@@ -15,6 +28,13 @@ std::string hash_prefix;
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
#ifdef DEBUG_TIMING
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Check given algorithm against inputs and convolution descriptor, /** Check given algorithm against inputs and convolution descriptor,
change algorithm inplace to a fallback algorithm if checkings fail. change algorithm inplace to a fallback algorithm if checkings fail.
...@@ -73,6 +93,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -73,6 +93,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0; bool use_cached = 0;
#ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
TheanoTimer timer;
#endif
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -146,11 +172,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -146,11 +172,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
std::string hashkey ; std::string hashkey ;
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) {
cuda_exit(c->ctx);
return 1;
}
if (params->choose_algo) { if (params->choose_algo) {
if (!reuse_algo) { if (!reuse_algo) {
...@@ -198,11 +228,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -198,11 +228,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
k = pygpu_empty(PyGpuArray_NDIM(*kerns), PyGpuArray_DIMS(*kerns), (*kerns)->ga.typecode, GA_C_ORDER, c, Py_None); k = pygpu_empty(PyGpuArray_NDIM(*kerns), PyGpuArray_DIMS(*kerns), (*kerns)->ga.typecode, GA_C_ORDER, c, Py_None);
} }
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnFindConvolutionBackwardFilterAlgorithmEx( err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(k), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(k),
1, &count, &choice, *(void **)tmpmem, maxfree); 1, &count, &choice, *(void **)tmpmem, maxfree);
#ifdef DEBUG_TIMING
timer.end();
#endif
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (beta != 0) { if (beta != 0) {
Py_XDECREF(k); Py_XDECREF(k);
...@@ -237,10 +273,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -237,10 +273,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
mathtype = choice.mathType; mathtype = choice.mathType;
#endif #endif
} else { } else {
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo); CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
timer.end();
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -249,6 +291,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -249,6 +291,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
} }
#ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds;
++n_selections;
#endif
} }
} /* choose_algo */ } /* choose_algo */
...@@ -305,6 +351,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -305,6 +351,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
hashkey.c_str() hashkey.c_str()
); );
#endif #endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) {
// We have selected an algorithm at runtime.
// `timer` still contains timing about selection step.
fprintf(stderr, "\t(selected %s gradweight algo in %g milliseconds)\n", selection_name, timer.milliseconds);
if (n_selections > 1) {
fprintf(stderr, "\t(selected %lu gradweight algos in %g milliseconds (average: %g milliseconds per selection))\n",
n_selections, total_selection_time, total_selection_time / n_selections);
}
}
#endif
if (!reuse_algo) { if (!reuse_algo) {
// save for next time/cache // save for next time/cache
...@@ -333,10 +390,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -333,10 +390,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
} }
if (worksize != 0)
cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*kerns)->ga);
timer.start();
#endif
for ( int g = 0; g < groups; g++) { for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionBackwardFilter( err = cudnnConvolutionBackwardFilter(
params->handle, params->handle,
...@@ -348,13 +412,22 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -348,13 +412,22 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(*kerns)) + kern_offset * g); APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(*kerns)) + kern_offset * g);
} }
if (worksize != 0) if (worksize != 0) {
cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
gpudata_release(workspace); gpudata_release(workspace);
}
cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*kerns)->ga);
timer.end();
total_computation_time += timer.milliseconds;
++n_computations;
#endif
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -362,5 +435,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -362,5 +435,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
#ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran gradweight algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) {
fprintf(stderr, "\t(ran %lu gradweight computations in %g milliseconds (average: %g milliseconds per call))\n",
n_computations, total_computation_time, total_computation_time / n_computations);
}
#endif
return 0; return 0;
} }
...@@ -2639,7 +2639,7 @@ class TestDnnConv2DRuntimeAlgorithms(object): ...@@ -2639,7 +2639,7 @@ class TestDnnConv2DRuntimeAlgorithms(object):
filters = theano.tensor.TensorType(dtype, _broadcastable)() filters = theano.tensor.TensorType(dtype, _broadcastable)()
conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype, conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype,
subsample=unit_shape, dilation=unit_shape) subsample=unit_shape, dilation=unit_shape)
grad_i = theano.tensor.grad(conv.sum(), [inputs]) grad_i, = theano.tensor.grad(conv.sum(), [inputs])
f = theano.function([inputs, filters], grad_i, mode=mode_with_gpu) f = theano.function([inputs, filters], grad_i, mode=mode_with_gpu)
assert 1 == len([node for node in f.maker.fgraph.apply_nodes if isinstance(node.op, dnn.GpuDnnConvGradI)]) assert 1 == len([node for node in f.maker.fgraph.apply_nodes if isinstance(node.op, dnn.GpuDnnConvGradI)])
assert not any(isinstance(node.op, dnn.GpuDnnConv) for node in f.maker.fgraph.apply_nodes) assert not any(isinstance(node.op, dnn.GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
...@@ -2649,7 +2649,7 @@ class TestDnnConv2DRuntimeAlgorithms(object): ...@@ -2649,7 +2649,7 @@ class TestDnnConv2DRuntimeAlgorithms(object):
else: else:
flipped_filters = filters[:, :, ::-1, ::-1] flipped_filters = filters[:, :, ::-1, ::-1]
conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), flipped_filters) conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), flipped_filters)
grad_i_ref = theano.tensor.grad(conv_ref.sum(), [inputs]) grad_i_ref, = theano.tensor.grad(conv_ref.sum(), [inputs])
f_ref = theano.function([inputs, filters], grad_i_ref, mode='FAST_RUN') f_ref = theano.function([inputs, filters], grad_i_ref, mode='FAST_RUN')
runtime_shapes = self.runtime_shapes runtime_shapes = self.runtime_shapes
if algo in ('time_once', 'guess_once'): if algo in ('time_once', 'guess_once'):
...@@ -2677,7 +2677,7 @@ class TestDnnConv2DRuntimeAlgorithms(object): ...@@ -2677,7 +2677,7 @@ class TestDnnConv2DRuntimeAlgorithms(object):
filters = theano.tensor.TensorType(dtype, _broadcastable)() filters = theano.tensor.TensorType(dtype, _broadcastable)()
conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype, conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype,
subsample=unit_shape, dilation=unit_shape) subsample=unit_shape, dilation=unit_shape)
grad_w = theano.tensor.grad(conv.sum(), [filters]) grad_w, = theano.tensor.grad(conv.sum(), [filters])
f = theano.function([inputs, filters], grad_w, mode=mode_with_gpu) f = theano.function([inputs, filters], grad_w, mode=mode_with_gpu)
assert 1 == len([node for node in f.maker.fgraph.apply_nodes if isinstance(node.op, dnn.GpuDnnConvGradW)]) assert 1 == len([node for node in f.maker.fgraph.apply_nodes if isinstance(node.op, dnn.GpuDnnConvGradW)])
assert not any(isinstance(node.op, dnn.GpuDnnConv) for node in f.maker.fgraph.apply_nodes) assert not any(isinstance(node.op, dnn.GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
...@@ -2687,7 +2687,7 @@ class TestDnnConv2DRuntimeAlgorithms(object): ...@@ -2687,7 +2687,7 @@ class TestDnnConv2DRuntimeAlgorithms(object):
else: else:
flipped_filters = filters[:, :, ::-1, ::-1] flipped_filters = filters[:, :, ::-1, ::-1]
conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), flipped_filters) conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), flipped_filters)
grad_w_ref = theano.tensor.grad(conv_ref.sum(), [filters]) grad_w_ref, = theano.tensor.grad(conv_ref.sum(), [filters])
f_ref = theano.function([inputs, filters], grad_w_ref, mode='FAST_RUN') f_ref = theano.function([inputs, filters], grad_w_ref, mode='FAST_RUN')
runtime_shapes = self.runtime_shapes runtime_shapes = self.runtime_shapes
if algo in ('time_once', 'guess_once'): if algo in ('time_once', 'guess_once'):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论