Commit 5c10bb1d, authored by notoraptor

Add debug profiling for dnn_fwd

Parent: 1bb1bb8e
...@@ -85,6 +85,23 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache; ...@@ -85,6 +85,23 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#line 87 "dnn_conv_base.c" #line 87 "dnn_conv_base.c"
#if __cplusplus < 201103L
/* Using C standard interface (<ctime>).
   clock_t is an arithmetic type, so callers can write
   `theano_clock_t t = theano_clock(); ... t = theano_clock() - t;`
   and also initialize accumulators with `= 0`. */
#define theano_clock_t clock_t
#define theano_clock() clock()
#define theano_clock_to_milliseconds(t) ( 1000.0 * (t) / CLOCKS_PER_SEC )
#define theano_clock_average_to_milliseconds(t, n) ( (1000.0 * (t) / (n)) / CLOCKS_PER_SEC )
#else
/* Using C++11 standard interface (<chrono>).
   I don't know if it's really more accurate, but at least
   it provides interfaces up to nanoseconds.

   NOTE: theano_clock_t must NOT be std::chrono::time_point — that is a
   class template (not a type without arguments), and a time_point can
   neither store the difference of two clock readings nor be assigned 0.
   Representing clock values as durations since the clock's epoch keeps
   the same arithmetic semantics as the clock_t branch above. */
#include <chrono>
#define theano_clock_t std::chrono::steady_clock::duration
#define theano_clock() ( std::chrono::steady_clock::now().time_since_epoch() )
#define theano_clock_to_milliseconds(t) ( std::chrono::duration_cast<std::chrono::nanoseconds>(t).count() / 1000000.0 )
#define theano_clock_average_to_milliseconds(t, n) ( theano_clock_to_milliseconds(t) / (n) )
#endif
pthread_mutex_t algoMutex; pthread_mutex_t algoMutex;
AlgoCache algoCache; AlgoCache algoCache;
......
...@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo; ...@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0; reuse_algo = 0;
hash_prefix = std::string("FWD|GPU#"); hash_prefix = std::string("FWD|GPU#");
#ifdef DEBUG
// Reset the profiling accumulators declared in the support_code_struct
// section (total_*_time, n_*), so every struct instance starts from zero.
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
if (PARAMS->choose_time) {
// choose_time selects by measured runtime (cudnnFind...Ex path).
selection_name = "fastest";
} else {
// Otherwise cuDNN's heuristic pick (cudnnGet...Algorithm path).
selection_name = "best suited";
}
};
#endif
#section support_code_struct #section support_code_struct
#line 9 "dnn_fwd.c" #line 22 "dnn_fwd.c"
int reuse_algo; int reuse_algo;
AlgoRec prev_algo; AlgoRec prev_algo;
std::string hash_prefix; std::string hash_prefix;
...@@ -14,6 +27,11 @@ std::string hash_prefix; ...@@ -14,6 +27,11 @@ std::string hash_prefix;
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
theano_clock_t total_computation_time;  // accumulated time spent around cudnnConvolutionForward calls
theano_clock_t total_selection_time;    // accumulated time spent selecting a forward algorithm at runtime
size_t n_computations;                  // number of timed forward computations
size_t n_selections;                    // number of timed algorithm selections
const char* selection_name;             // "fastest" (choose_time) or "best suited"; set at struct init
#endif #endif
/** Check given algorithm against inputs and convolution descriptor, /** Check given algorithm against inputs and convolution descriptor,
...@@ -121,6 +139,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -121,6 +139,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0; bool use_cached = 0;
#ifdef DEBUG
// Scratch timer: holds a start reading, then the elapsed time after
// the `t = theano_clock() - t;` subtraction below.
theano_clock_t t;
#endif
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -242,12 +263,18 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -242,12 +263,18 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
// We don't sync the buffer as we don't care about the values. // We don't sync the buffer as we don't care about the values.
#ifdef DEBUG
t = theano_clock();  // start timing the exhaustive algorithm search
#endif
err = cudnnFindConvolutionForwardAlgorithmEx( err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o),
1, &count, &choice, *(void **)tmpmem, 1, &count, &choice, *(void **)tmpmem,
maxfree); maxfree);
#ifdef DEBUG
t = theano_clock() - t;  // t now holds the elapsed search time
#endif
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (beta != 0) { if (beta != 0) {
Py_XDECREF(o); Py_XDECREF(o);
...@@ -282,10 +309,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -282,10 +309,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
mathtype = choice.mathType; mathtype = choice.mathType;
#endif #endif
} else { } else {
#ifdef DEBUG
t = theano_clock();  // start timing the heuristic algorithm selection
#endif
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG
t = theano_clock() - t;  // t now holds the elapsed selection time
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -294,6 +327,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -294,6 +327,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
} }
#ifdef DEBUG
// Accumulate runtime algorithm-selection timing; `t` was set around the
// cudnnFind.../cudnnGet... call just above.
total_selection_time += t;
++n_selections;
#endif
} }
} }
...@@ -356,7 +393,19 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -356,7 +393,19 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
worksize, worksize,
hashkey.c_str() hashkey.c_str()
); );
if (!(reuse_algo || use_cached)) {
// We have selected an algorithm at runtime.
// `t` still contains timing about selection step.
fprintf(stderr, "\t(selected %s fwd algo in %g milliseconds)\n", selection_name, theano_clock_to_milliseconds(t));
if (n_selections > 1) {
// size_t is not guaranteed to match %lu (e.g. LLP64 Windows, where
// unsigned long is 32-bit), so cast explicitly to avoid undefined
// behaviour in fprintf.
fprintf(stderr, "\t(selected %lu fwd algos in %g milliseconds (average: %g milliseconds per selection))\n",
(unsigned long)n_selections,
theano_clock_to_milliseconds(total_selection_time),
theano_clock_average_to_milliseconds(total_selection_time, n_selections));
}
}
}
#endif #endif
if (!reuse_algo) { if (!reuse_algo) {
...@@ -375,45 +424,53 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -375,45 +424,53 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} // params->choose_algo } // params->choose_algo
{ gpudata *workspace = 0;
gpudata *workspace = 0; /*
/* * This is less than ideal since we need to free it after (which
* This is less than ideal since we need to free it after (which * introduces a synchronization point. But we don't have a module
* introduces a synchronization point. But we don't have a module * to place a nice get_work_mem() function in.
* to place a nice get_work_mem() function in. */
*/ if (worksize != 0) {
if (worksize != 0) { workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) {
if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx);
cuda_exit(c->ctx); return 1;
return 1;
}
} }
}
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < groups; g++) { #ifdef DEBUG
err = cudnnConvolutionForward( t = theano_clock();
params->handle, #endif
alpha_p,
APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g, for ( int g = 0; g < groups; g++) {
APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g, err = cudnnConvolutionForward(
desc, algo, params->handle,
worksize == 0 ? NULL : *(void **)workspace, worksize, alpha_p,
beta_p, APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g); APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
} desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
}
if (worksize != 0) if (worksize != 0)
gpudata_release(workspace); gpudata_release(workspace);
cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
}
#ifdef DEBUG
// Accumulate forward-computation timing.
// NOTE(review): cudnnConvolutionForward launches asynchronously; without a
// stream synchronization before this read, `t` likely measures launch
// overhead rather than GPU execution time — confirm intent.
t = theano_clock() - t;
total_computation_time += t;
++n_computations;
#endif
cuda_exit(c->ctx); cuda_exit(c->ctx);
...@@ -422,6 +479,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -422,6 +479,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
#ifdef DEBUG
// Report the last forward-computation time, plus running totals once
// more than one computation has been timed.
fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", theano_clock_to_milliseconds(t));
if (n_computations > 1) {
// size_t is not guaranteed to match %lu (e.g. LLP64 Windows), so cast
// explicitly to avoid undefined behaviour in fprintf.
fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n",
(unsigned long)n_computations,
theano_clock_to_milliseconds(total_computation_time),
theano_clock_average_to_milliseconds(total_computation_time, n_computations));
}
#endif
return 0; return 0;
} }
......
...@@ -2720,6 +2720,11 @@ class TestDnnConv3DRuntimeAlgorithms(TestDnnConv2DRuntimeAlgorithms): ...@@ -2720,6 +2720,11 @@ class TestDnnConv3DRuntimeAlgorithms(TestDnnConv2DRuntimeAlgorithms):
] ]
class TestDnnConv2DRuntimeAlgorithmsWithBigInputs(TestDnnConv2DRuntimeAlgorithms):
    # Re-run the runtime algorithm-selection tests with larger shapes.
    # Each entry is presumably (count, [input_shape, filter_shape]) as
    # consumed by the base class — TODO confirm against
    # TestDnnConv2DRuntimeAlgorithms.runtime_shapes.
    runtime_shapes = [(5, [(12, 4, 128, 128), (5, 4, 64, 64)]),
                      (6, [(12, 4, 256, 256), (5, 4, 32, 64)])]
def test_conv_guess_once_with_dtypes(): def test_conv_guess_once_with_dtypes():
# This test checks that runtime conv algorithm selection does not raise any exception # This test checks that runtime conv algorithm selection does not raise any exception
# when consecutive functions with different dtypes and precisions are executed. # when consecutive functions with different dtypes and precisions are executed.
......
Markdown is supported.
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment.