提交 b7dda3d3 authored 作者: notoraptor's avatar notoraptor

Use third-party high precision timing library for C++ prior to 2011:

上级 20586f0a
......@@ -85,21 +85,37 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#line 87 "dnn_conv_base.c"
#ifdef DEBUG
#if __cplusplus < 201103L
/* Using C standard interface (<ctime>). */
#define theano_clock_t clock_t
#define theano_clock() clock()
#define theano_clock_to_milliseconds(t) ( 1000.0 * (t) / CLOCKS_PER_SEC )
#define theano_clock_average_to_milliseconds(t, n) ( (1000.0 * (t) / (n)) / CLOCKS_PER_SEC )
#include <plf_nanotimer/plf_nanotimer.h>
const char* const _cppver = "Using plf_nanotimer: http://www.plflib.org/nanotimer.htm";
/* Stopwatch used for DEBUG timing when the compiler predates C++11.
   It wraps plf::nanotimer and exposes the last measured interval
   through the public `milliseconds` field. */
struct TheanoTimer {
    /* Length of the last start()/end() interval, in milliseconds.
       Initialised to 0 so that reading it before end() is well defined. */
    double milliseconds;
    plf::nanotimer timer;
    TheanoTimer() : milliseconds(0.0) {}
    /* Begin (or restart) a measurement. */
    void start() {timer.start();}
    /* Store the time elapsed since the last start() into `milliseconds`. */
    void end() {milliseconds = timer.get_elapsed_ms();}
};
#else
/* Using C++11 standard interface (<chrono>).
I don't know if it's really more accurate, but at least
it provides interfaces up to nanoseconds. */
#include <chrono>
#define theano_clock_t std::chrono::time_point
#define theano_clock() std::chrono::steady_clock::now()
#define theano_clock_to_milliseconds(t) ( std::chrono::duration_cast<std::chrono::nanoseconds>(t).count() / 1000000.0 )
#define theano_clock_average_to_milliseconds(t, n) ( theano_clock_to_milliseconds(t) / (n) )
const char* const _cppver = "Using C++11 chrono";
/* Stopwatch used for DEBUG timing when C++11 <chrono> is available.
   It measures with std::chrono::steady_clock and exposes the last
   measured interval through the public `milliseconds` field. */
struct TheanoTimer {
    /* Length of the last start()/end() interval, in milliseconds.
       Initialised to 0 so that reading it before end() is well defined. */
    double milliseconds = 0.0;
    /* Time point captured by the most recent start(). */
    std::chrono::steady_clock::time_point base;
    /* Begin (or restart) a measurement. */
    void start() {base = std::chrono::steady_clock::now();}
    /* Store the time elapsed since the last start() into `milliseconds`.
       The interval is taken at nanosecond resolution and then scaled. */
    void end() {
        milliseconds =
            std::chrono::duration_cast<std::chrono::nanoseconds>(
                std::chrono::steady_clock::now() - base
            ).count() / 1000000.0;
    }
};
#endif
#endif
pthread_mutex_t algoMutex;
......
......@@ -27,8 +27,8 @@ std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
theano_clock_t total_computation_time;
theano_clock_t total_selection_time;
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
......@@ -140,7 +140,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
#ifdef DEBUG
theano_clock_t t;
TheanoTimer timer;
fprintf(stderr, "%s\n", _cppver);
#endif
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
......@@ -264,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// We don't sync the buffer as we don't care about the values.
#ifdef DEBUG
t = theano_clock();
timer.start();
#endif
err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
......@@ -273,7 +274,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
1, &count, &choice, *(void **)tmpmem,
maxfree);
#ifdef DEBUG
t = theano_clock() - t;
timer.end();
#endif
gpudata_release(tmpmem);
if (beta != 0) {
......@@ -310,14 +311,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
#endif
} else {
#ifdef DEBUG
t = theano_clock();
timer.start();
#endif
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG
t = theano_clock() - t;
timer.end();
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -328,7 +329,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
}
#ifdef DEBUG
total_selection_time += t;
total_selection_time += timer.milliseconds;
++n_selections;
#endif
}
......@@ -396,16 +397,13 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
);
if (!(reuse_algo || use_cached)) {
// We have selected an algorithm at runtime.
// `t` still contains timing about selection step.
fprintf(stderr, "\t(selected %s fwd algo in %g milliseconds)\n", selection_name, theano_clock_to_milliseconds(t));
// `timer` still contains timing about selection step.
fprintf(stderr, "\t(selected %s fwd algo in %g milliseconds)\n", selection_name, timer.milliseconds);
if (n_selections > 1) {
fprintf(stderr, "\t(selected %lu fwd algos in %g milliseconds (average: %g milliseconds per selection))\n",
n_selections,
theano_clock_to_milliseconds(total_selection_time),
theano_clock_average_to_milliseconds(total_selection_time, n_selections));
n_selections, total_selection_time, total_selection_time / n_selections);
}
}
}
#endif
if (!reuse_algo) {
......@@ -441,7 +439,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG
t = theano_clock();
GpuArray_sync(&(*output)->ga);
timer.start();
#endif
for ( int g = 0; g < groups; g++) {
......@@ -466,8 +465,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG
t = theano_clock() - t;
total_computation_time += t;
GpuArray_sync(&(*output)->ga);
timer.end();
total_computation_time += timer.milliseconds;
++n_computations;
#endif
......@@ -479,12 +479,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
#ifdef DEBUG
fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", theano_clock_to_milliseconds(t));
fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) {
fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n",
n_computations,
theano_clock_to_milliseconds(total_computation_time),
theano_clock_average_to_milliseconds(total_computation_time, n_computations));
n_computations, total_computation_time, total_computation_time / n_computations);
}
#endif
return 0;
......
All plf:: modules are provided under a zlib license:
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgement in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Copyright (c) 2015 Matthew Bentley
// Copyright (c) 2016, Matthew Bentley (mattreecebentley@gmail.com) www.plflib.org
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#ifndef PLF_NANOTIMER
#define PLF_NANOTIMER
// ~Nanosecond-precision cross-platform (linux/bsd/mac/windows, C++03/C++11) simple timer class:
// Mac OSX implementation:
#if defined(__MACH__)
#include <mach/clock.h>
#include <mach/mach.h>
namespace plf
{
class nanotimer
{
private:
clock_serv_t system_clock;
mach_timespec_t time1, time2;
public:
nanotimer()
{
host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &system_clock);
}
~nanotimer()
{
mach_port_deallocate(mach_task_self(), system_clock);
}
inline void start()
{
clock_get_time(system_clock, &time1);
}
inline double get_elapsed_ms()
{
return static_cast<double>(get_elapsed_ns()) / 1000000.0;
}
inline double get_elapsed_us()
{
return static_cast<double>(get_elapsed_ns()) / 1000.0;
}
double get_elapsed_ns()
{
clock_get_time(system_clock, &time2);
return ((1000000000.0 * static_cast<double>(time2.tv_sec - time1.tv_sec)) + static_cast<double>(time2.tv_nsec - time1.tv_nsec));
}
};
// Linux/BSD implementation:
#elif (defined(linux) || defined(__linux__) || defined(__linux)) || (defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__))
#include <time.h>
#include <sys/time.h>
namespace plf
{
// Simple stopwatch built on clock_gettime(CLOCK_MONOTONIC).
// Call start(), then any get_elapsed_*() to read the time since start().
class nanotimer
{
private:
    struct timespec time1, time2;

public:
    // Value-initialise the timestamps so get_elapsed_*() before start()
    // does not read indeterminate values (the result is then measured from
    // the clock's own zero point rather than from a start() call).
    nanotimer() : time1(), time2() {}

    // Start or restart the timer.
    inline void start()
    {
        clock_gettime(CLOCK_MONOTONIC, &time1);
    }

    // Elapsed time since start(), in milliseconds.
    inline double get_elapsed_ms()
    {
        return get_elapsed_ns() / 1000000.0;
    }

    // Elapsed time since start(), in microseconds.
    inline double get_elapsed_us()
    {
        return get_elapsed_ns() / 1000.0;
    }

    // Elapsed time since start(), in nanoseconds. Samples the clock now.
    double get_elapsed_ns()
    {
        clock_gettime(CLOCK_MONOTONIC, &time2);
        return ((1000000000.0 * static_cast<double>(time2.tv_sec - time1.tv_sec)) + static_cast<double>(time2.tv_nsec - time1.tv_nsec));
    }
};
// Windows implementation:
#elif defined(_WIN32)
#if defined(_MSC_VER) && !defined(NOMINMAX)
#define NOMINMAX // Otherwise MS compilers act like idiots when using std::numeric_limits<>::max() and including windows.h
#endif
#include <windows.h>
namespace plf
{
// Simple stopwatch built on the Windows performance counter.
// Call start(), then any get_elapsed_*() to read the time since start().
class nanotimer
{
private:
    LARGE_INTEGER ticks1, ticks2;
    double frequency; // value reported by QueryPerformanceFrequency

public:
    nanotimer()
    {
        LARGE_INTEGER freq;
        QueryPerformanceFrequency(&freq);
        frequency = static_cast<double>(freq.QuadPart);
        // Zero the tick stamps so get_elapsed_*() before start() does not
        // read indeterminate values.
        ticks1.QuadPart = 0;
        ticks2.QuadPart = 0;
    }

    // Start or restart the timer.
    inline void start()
    {
        QueryPerformanceCounter(&ticks1);
    }

    // Elapsed time since start(), in milliseconds. Samples the counter now.
    double get_elapsed_ms()
    {
        QueryPerformanceCounter(&ticks2);
        return (static_cast<double>(ticks2.QuadPart - ticks1.QuadPart) * 1000.0) / frequency;
    }

    // Elapsed time since start(), in microseconds.
    inline double get_elapsed_us()
    {
        return get_elapsed_ms() * 1000.0;
    }

    // Elapsed time since start(), in nanoseconds.
    inline double get_elapsed_ns()
    {
        return get_elapsed_ms() * 1000000.0;
    }
};
#endif
// Else: failure warning - your OS is not supported
#if defined(__MACH__) || (defined(linux) || defined(__linux__) || defined(__linux)) || (defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) || defined(_WIN32)
// Busy-wait until at least delay_ns nanoseconds have elapsed.
// Declared inline because this header defines the function: without it,
// including the header from more than one translation unit produces
// multiple-definition link errors.
inline void nanosecond_delay(double delay_ns)
{
    nanotimer timer;
    timer.start();
    while (timer.get_elapsed_ns() < delay_ns)
    {
        // spin
    }
}
// Busy-wait for delay_us microseconds by delegating to nanosecond_delay().
inline void microsecond_delay(double delay_us)
{
    const double as_nanoseconds = delay_us * 1000.0;
    nanosecond_delay(as_nanoseconds);
}
// Busy-wait for delay_ms milliseconds by delegating to nanosecond_delay().
inline void millisecond_delay(double delay_ms)
{
    const double as_nanoseconds = delay_ms * 1000000.0;
    nanosecond_delay(as_nanoseconds);
}
} // namespace
#endif
#endif // PLF_NANOTIMER
plf::nanotimer is a ~microsecond-precision cross-platform simple timer class (linux/bsd/mac/windows, C++03/C++11).
Usage is as follows:
plf::nanotimer timer;
timer.start();
// Do something here
double results = timer.get_elapsed_ns();
std::cout << "Timing: " << results << " nanoseconds." << std::endl;
timer.start(); // "start" has the same semantics as "restart".
// Do something else
results = timer.get_elapsed_ms();
std::cout << "Timing: " << results << " milliseconds." << std::endl;
timer.start();
plf::microsecond_delay(15); // Delay program for 15 microseconds
results = timer.get_elapsed_us();
std::cout << "Timing: " << results << " microseconds." << std::endl;
Timer member functions:
void timer.start(): start or restart timer
double timer.get_elapsed_ns(): get elapsed time in nanoseconds
double timer.get_elapsed_us(): get elapsed time in microseconds
double timer.get_elapsed_ms(): get elapsed time in milliseconds
Free-standing functions:
void plf::millisecond_delay(double x): delay the program until x milliseconds have passed
void plf::microsecond_delay(double x): delay the program until x microseconds have passed
void plf::nanosecond_delay(double x): delay the program until x nanoseconds have passed
I determined that a 'pause'-style function would add too much complexity to the class for simple benchmarking, which in turn might interfere with performance analysis, so if you need a 'pause' function do something like this:
{
plf::nanotimer timer;
timer.start();
// Do something here
double results = timer.get_elapsed_ns();
// Do something else - timer 'paused'
timer.start();
// Do stuff
results += timer.get_elapsed_ns();
std::cout << "Timing: " << results << " nanoseconds." << std::endl;
}
All plf:: library components are distributed under a Zlib License.
plf::nanotimer (c) Copyright 2016 Matt Bentley
Contact: mattreecebentley@gmail.com
www.plflib.org
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论