Commit 403865ea authored by notoraptor

Remove third-party library and update dnn_fwd.

Parent commit: b7dda3d3
...@@ -89,19 +89,14 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache; ...@@ -89,19 +89,14 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#if __cplusplus < 201103L #if __cplusplus < 201103L
#include <plf_nanotimer/plf_nanotimer.h> const char* const _cppver = "No timing available: C++11 or later is required.";
const char* const _cppver = "Using plf_nanotimer: http://www.plflib.org/nanotimer.htm";
struct TheanoTimer {
double milliseconds;
plf::nanotimer timer;
void start() {timer.start();}
void end() {milliseconds = timer.get_elapsed_ms();}
};
#else #else
#define DEBUG_TIMING
#include <chrono> #include <chrono>
const char* const _cppver = "Using C++11 chrono"; const char* const _cppver = NULL;
struct TheanoTimer { struct TheanoTimer {
double milliseconds; double milliseconds;
std::chrono::steady_clock::time_point base; std::chrono::steady_clock::time_point base;
......
...@@ -3,7 +3,7 @@ prev_algo.algo = PARAMS->conv_algo; ...@@ -3,7 +3,7 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0; reuse_algo = 0;
hash_prefix = std::string("FWD|GPU#"); hash_prefix = std::string("FWD|GPU#");
#ifdef DEBUG #ifdef DEBUG_TIMING
total_computation_time = 0; total_computation_time = 0;
total_selection_time = 0; total_selection_time = 0;
n_computations = 0; n_computations = 0;
...@@ -27,6 +27,8 @@ std::string hash_prefix; ...@@ -27,6 +27,8 @@ std::string hash_prefix;
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif
#ifdef DEBUG_TIMING
double total_computation_time; double total_computation_time;
double total_selection_time; double total_selection_time;
size_t n_computations; size_t n_computations;
...@@ -140,8 +142,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -140,8 +142,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0; bool use_cached = 0;
#ifdef DEBUG #ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
TheanoTimer timer; TheanoTimer timer;
fprintf(stderr, "%s\n", _cppver);
#endif #endif
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
...@@ -263,17 +267,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -263,17 +267,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
o = pygpu_empty(PyGpuArray_NDIM(*output), PyGpuArray_DIMS(*output), (*output)->ga.typecode, GA_C_ORDER, c, Py_None); o = pygpu_empty(PyGpuArray_NDIM(*output), PyGpuArray_DIMS(*output), (*output)->ga.typecode, GA_C_ORDER, c, Py_None);
} }
// We don't sync the buffer as we don't care about the values. #ifdef DEBUG_TIMING
#ifdef DEBUG
timer.start(); timer.start();
#endif #endif
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o),
1, &count, &choice, *(void **)tmpmem, 1, &count, &choice, *(void **)tmpmem,
maxfree); maxfree);
#ifdef DEBUG #ifdef DEBUG_TIMING
timer.end(); timer.end();
#endif #endif
gpudata_release(tmpmem); gpudata_release(tmpmem);
...@@ -310,14 +314,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -310,14 +314,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
mathtype = choice.mathType; mathtype = choice.mathType;
#endif #endif
} else { } else {
#ifdef DEBUG #ifdef DEBUG_TIMING
timer.start(); timer.start();
#endif #endif
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG #ifdef DEBUG_TIMING
timer.end(); timer.end();
#endif #endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -328,7 +332,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -328,7 +332,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
} }
#ifdef DEBUG #ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds; total_selection_time += timer.milliseconds;
++n_selections; ++n_selections;
#endif #endif
...@@ -395,6 +399,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -395,6 +399,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
worksize, worksize,
hashkey.c_str() hashkey.c_str()
); );
#endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) { if (!(reuse_algo || use_cached)) {
// We have selected an algorithm at runtime. // We have selected an algorithm at runtime.
// `timer` still contains timing about selection step. // `timer` still contains timing about selection step.
...@@ -422,6 +428,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -422,6 +428,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} // params->choose_algo } // params->choose_algo
{
gpudata *workspace = 0; gpudata *workspace = 0;
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
...@@ -438,7 +445,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -438,7 +445,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG #ifdef DEBUG_TIMING
GpuArray_sync(&(*output)->ga); GpuArray_sync(&(*output)->ga);
timer.start(); timer.start();
#endif #endif
...@@ -463,8 +470,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -463,8 +470,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
}
#ifdef DEBUG #ifdef DEBUG_TIMING
GpuArray_sync(&(*output)->ga); GpuArray_sync(&(*output)->ga);
timer.end(); timer.end();
total_computation_time += timer.milliseconds; total_computation_time += timer.milliseconds;
...@@ -478,7 +486,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -478,7 +486,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
#ifdef DEBUG #ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", timer.milliseconds); fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) { if (n_computations > 1) {
fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n", fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n",
......
All plf:: modules are provided under a zlib license:
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgement in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Copyright (c) 2015 Matthew Bentley
// Copyright (c) 2016, Matthew Bentley (mattreecebentley@gmail.com) www.plflib.org
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#ifndef PLF_NANOTIMER
#define PLF_NANOTIMER
// ~Nanosecond-precision cross-platform (linux/bsd/mac/windows, C++03/C++11) simple timer class:
// Mac OSX implementation:
#if defined(__MACH__)
#include <mach/clock.h>
#include <mach/mach.h>
namespace plf
{
class nanotimer
{
private:
clock_serv_t system_clock;
mach_timespec_t time1, time2;
public:
nanotimer()
{
host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &system_clock);
}
~nanotimer()
{
mach_port_deallocate(mach_task_self(), system_clock);
}
inline void start()
{
clock_get_time(system_clock, &time1);
}
inline double get_elapsed_ms()
{
return static_cast<double>(get_elapsed_ns()) / 1000000.0;
}
inline double get_elapsed_us()
{
return static_cast<double>(get_elapsed_ns()) / 1000.0;
}
double get_elapsed_ns()
{
clock_get_time(system_clock, &time2);
return ((1000000000.0 * static_cast<double>(time2.tv_sec - time1.tv_sec)) + static_cast<double>(time2.tv_nsec - time1.tv_nsec));
}
};
// Linux/BSD implementation:
#elif (defined(linux) || defined(__linux__) || defined(__linux)) || (defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__))
#include <time.h>
#include <sys/time.h>
namespace plf
{
// ~Nanosecond-resolution monotonic timer for Linux and the BSDs,
// backed by clock_gettime(CLOCK_MONOTONIC).
class nanotimer
{
private:
	struct timespec start_point, end_point;
public:
	nanotimer() {}

	// Start (or restart) the timer.
	inline void start()
	{
		clock_gettime(CLOCK_MONOTONIC, &start_point);
	}

	// Elapsed time since start(), in milliseconds.
	inline double get_elapsed_ms()
	{
		return get_elapsed_ns() / 1000000.0;
	}

	// Elapsed time since start(), in microseconds.
	inline double get_elapsed_us()
	{
		return get_elapsed_ns() / 1000.0;
	}

	// Elapsed time since start(), in nanoseconds.
	double get_elapsed_ns()
	{
		clock_gettime(CLOCK_MONOTONIC, &end_point);
		const double seconds = static_cast<double>(end_point.tv_sec - start_point.tv_sec);
		const double nanoseconds = static_cast<double>(end_point.tv_nsec - start_point.tv_nsec);
		return 1000000000.0 * seconds + nanoseconds;
	}
};
// Windows implementation:
#elif defined(_WIN32)
#if defined(_MSC_VER) && !defined(NOMINMAX)
#define NOMINMAX // Otherwise MS compilers act like idiots when using std::numeric_limits<>::max() and including windows.h
#endif
#include <windows.h>
namespace plf
{
class nanotimer
{
private:
LARGE_INTEGER ticks1, ticks2;
double frequency;
public:
nanotimer()
{
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
frequency = static_cast<double>(freq.QuadPart);
}
inline void start()
{
QueryPerformanceCounter(&ticks1);
}
double get_elapsed_ms()
{
QueryPerformanceCounter(&ticks2);
return (static_cast<double>(ticks2.QuadPart - ticks1.QuadPart) * 1000.0) / frequency;
}
inline double get_elapsed_us()
{
return get_elapsed_ms() * 1000.0;
}
inline double get_elapsed_ns()
{
return get_elapsed_ms() * 1000000.0;
}
};
#endif
// Else: failure warning - your OS is not supported
#if defined(__MACH__) || (defined(linux) || defined(__linux__) || defined(__linux)) || (defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) || defined(_WIN32)
void nanosecond_delay(double delay_ns)
{
nanotimer timer;
timer.start();
while(timer.get_elapsed_ns() < delay_ns)
{};
}
inline void microsecond_delay(double delay_us)
{
nanosecond_delay(delay_us * 1000.0);
}
inline void millisecond_delay(double delay_ms)
{
nanosecond_delay(delay_ms * 1000000.0);
}
} // namespace
#endif
#endif // PLF_NANOTIMER
plf::nanotimer is a ~microsecond-precision cross-platform simple timer class (linux/bsd/mac/windows, C++03/C++11).
Usage is as follows:
plf::nanotimer timer;
timer.start();
// Do something here
double results = timer.get_elapsed_ns();
std::cout << "Timing: " << results << " nanoseconds." << std::endl;
timer.start(); // "start" has the same semantics as "restart".
// Do something else
results = timer.get_elapsed_ms();
std::cout << "Timing: " << results << " milliseconds." << std::endl;
timer.start();
plf::microsecond_delay(15); // Delay program for 15 microseconds
results = timer.get_elapsed_us();
std::cout << "Timing: " << results << " microseconds." << std::endl;
Timer member functions:
void timer.start(): start or restart timer
double timer.get_elapsed_ns(): get elapsed time in nanoseconds
double timer.get_elapsed_us(): get elapsed time in microseconds
double timer.get_elapsed_ms(): get elapsed time in milliseconds
Free-standing functions:
void plf::millisecond_delay(double x): delay the program until x milliseconds have passed
void plf::microsecond_delay(double x): delay the program until x microseconds have passed
void plf::nanosecond_delay(double x): delay the program until x nanoseconds have passed
I determined that a 'pause'-style function would add too much complexity to the class for simple benchmarking, which in turn might interfere with performance analysis, so if you need a 'pause' function do something like this:
{
plf::nanotimer timer;
timer.start()
// Do something here
double results = timer.get_elapsed_ns();
// Do something else - timer 'paused'
timer.start()
// Do stuff
results += timer.get_elapsed_ns();
std::cout << "Timing: " << results << " nanoseconds." << std::endl;
}
All plf:: library components are distributed under a Zlib License.
plf::nanotimer (c) Copyright 2016 Matt Bentley
Contact: mattreecebentley@gmail.com
www.plflib.org
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论