Use third-party high precision timing library for C++ prior to 2011:

http://www.plflib.org/nanotimer.htm (zlib license)

Use third-party high precision timing library for C++ prior to 2011:
b7dda3d3 · notoraptor · 20586f0a · b7dda3d3 · b7dda3d3 · b7dda3d3
--- a/theano/gpuarray/c_code/dnn_conv_base.c
+++ b/theano/gpuarray/c_code/dnn_conv_base.c
@@ -85,21 +85,37 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;

 #line 87 "dnn_conv_base.c"

+#ifdef DEBUG
+
 #if __cplusplus < 201103L
-/* Using C standard interface (<ctime>). */
-#define theano_clock_t clock_t
-#define theano_clock() clock()
-#define theano_clock_to_milliseconds(t) ( 1000.0 * (t) / CLOCKS_PER_SEC )
-#define theano_clock_average_to_milliseconds(t, n) ( (1000.0 * (t) / (n)) / CLOCKS_PER_SEC )
+
+#include <plf_nanotimer/plf_nanotimer.h>
+const char* const _cppver = "Using plf_nanotimer: http://www.plflib.org/nanotimer.htm";
+struct TheanoTimer {  // DEBUG-only stopwatch for pre-C++11 builds, backed by plf::nanotimer.
+    double milliseconds;  // Elapsed time stored by the last end() call; 0 until end() has run.
+    plf::nanotimer timer;
+    TheanoTimer() : milliseconds(0.0) {}  // C++03 has no in-class initializers; never expose an indeterminate value.
+    void start() {timer.start();}  // Begin (or restart) a measurement.
+    void end() {milliseconds = timer.get_elapsed_ms();}  // Store the time since start(), in milliseconds.
+};
 #else
-/* Using C++11 standard interface (<chrono>).
-I don't know if it's really more accurate, but at least
-it provides interfaces up to nanoseconds. */
+
 #include <chrono>
-#define theano_clock_t std::chrono::time_point
-#define theano_clock() std::chrono::steady_clock::now()
-#define theano_clock_to_milliseconds(t) ( std::chrono::duration_cast<std::chrono::nanoseconds>(t).count() / 1000000.0 )
-#define theano_clock_average_to_milliseconds(t, n) ( theano_clock_to_milliseconds(t) / (n) )
+const char* const _cppver = "Using C++11 chrono";
+struct TheanoTimer {  // DEBUG-only stopwatch built on std::chrono::steady_clock (C++11 and later).
+    double milliseconds = 0.0;  // Elapsed time stored by the last end() call; 0 until end() has run.
+    std::chrono::steady_clock::time_point base;  // Instant recorded by the last start().
+    void start() {base = std::chrono::steady_clock::now();}
+    void end() {
+        milliseconds =  // Divide the nanosecond count in floating point to keep fractional milliseconds.
+            std::chrono::duration_cast<std::chrono::nanoseconds>(
+                std::chrono::steady_clock::now() - base
+            ).count() / 1000000.0;
+    }
+};
+
+#endif
+
 #endif

 pthread_mutex_t  algoMutex;

--- a/theano/gpuarray/c_code/dnn_fwd.c
+++ b/theano/gpuarray/c_code/dnn_fwd.c
@@ -27,8 +27,8 @@ std::string hash_prefix;

 #ifdef DEBUG
 char algorithm_name[128];
-theano_clock_t total_computation_time;
-theano_clock_t total_selection_time;
+double total_computation_time;
+double total_selection_time;
 size_t n_computations;
 size_t n_selections;
 const char* selection_name;
@@ -140,7 +140,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  bool use_cached = 0;
  #ifdef DEBUG
-  theano_clock_t t;
+  TheanoTimer timer;
+  fprintf(stderr, "%s\n", _cppver);
  #endif

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
@@ -264,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,

        // We don't sync the buffer as we don't care about the values.
        #ifdef DEBUG
-        t = theano_clock();
+        timer.start();
        #endif
        err = cudnnFindConvolutionForwardAlgorithmEx(
          params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
@@ -273,7 +274,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
          1, &count, &choice, *(void **)tmpmem,
          maxfree);
        #ifdef DEBUG
-        t = theano_clock() - t;
+        timer.end();
        #endif
        gpudata_release(tmpmem);
        if (beta != 0) {
@@ -310,14 +311,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
 #endif
      } else {
        #ifdef DEBUG
-        t = theano_clock();
+        timer.start();
        #endif
        err = cudnnGetConvolutionForwardAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
          desc, APPLY_SPECIFIC(output),
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
        #ifdef DEBUG
-        t = theano_clock() - t;
+        timer.end();
        #endif
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
@@ -328,7 +329,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
        }
      }
      #ifdef DEBUG
-      total_selection_time += t;
+      total_selection_time += timer.milliseconds;
      ++n_selections;
      #endif
    }
@@ -396,16 +397,13 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    );
    if (!(reuse_algo || use_cached)) {
        // We have selected an algorithm at runtime.
-        // `t` still contains timing about selection step.
-        fprintf(stderr, "\t(selected %s fwd algo in %g milliseconds)\n", selection_name, theano_clock_to_milliseconds(t));
+        // `timer` still contains timing about selection step.
+        fprintf(stderr, "\t(selected %s fwd algo in %g milliseconds)\n", selection_name, timer.milliseconds);
        if (n_selections > 1) {
            fprintf(stderr, "\t(selected %lu fwd algos in %g milliseconds (average: %g milliseconds per selection))\n",
-                    n_selections,
-                    theano_clock_to_milliseconds(total_selection_time),
-                    theano_clock_average_to_milliseconds(total_selection_time, n_selections));
+                    n_selections, total_selection_time, total_selection_time / n_selections);
        }
    }
-  }
 #endif

    if (!reuse_algo) {
@@ -441,7 +439,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  #ifdef DEBUG
-  t = theano_clock();
+  GpuArray_sync(&(*output)->ga);
+  timer.start();
  #endif

  for ( int g = 0; g < groups; g++) {
@@ -466,8 +465,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  #ifdef DEBUG
-  t = theano_clock() - t;
-  total_computation_time += t;
+  GpuArray_sync(&(*output)->ga);
+  timer.end();
+  total_computation_time += timer.milliseconds;
  ++n_computations;
  #endif

@@ -479,12 +479,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    return 1;
  }
  #ifdef DEBUG
-  fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", theano_clock_to_milliseconds(t));
+  fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", timer.milliseconds);
  if (n_computations > 1) {
    fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n",
-            n_computations,
-            theano_clock_to_milliseconds(total_computation_time),
-            theano_clock_average_to_milliseconds(total_computation_time, n_computations));
+            n_computations, total_computation_time, total_computation_time / n_computations);
  }
  #endif
  return 0;

--- a/theano/gpuarray/c_code/plf_nanotimer/plf_licensing.txt
+++ b/theano/gpuarray/c_code/plf_nanotimer/plf_licensing.txt
+All plf:: modules are provided under a zlib license:
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgement in the product documentation would be
+   appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+Copyright (c) 2015 Matthew Bentley
--- a/theano/gpuarray/c_code/plf_nanotimer/plf_nanotimer.h
+++ b/theano/gpuarray/c_code/plf_nanotimer/plf_nanotimer.h
+// Copyright (c) 2016, Matthew Bentley (mattreecebentley@gmail.com) www.plflib.org
+
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+// 
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+// 
+// 1. The origin of this software must not be misrepresented; you must not
+//    claim that you wrote the original software. If you use this software
+//    in a product, an acknowledgement in the product documentation would be
+//    appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//    misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+
+
+#ifndef PLF_NANOTIMER
+#define PLF_NANOTIMER
+
+
+// ~Nanosecond-precision cross-platform (linux/bsd/mac/windows, C++03/C++11) simple timer class:
+
+// Mac OSX implementation:
+#if defined(__MACH__)
+	#include <mach/clock.h>
+	#include <mach/mach.h>
+
+	namespace plf
+	{
+
+	class nanotimer // Stopwatch used when __MACH__ is defined; samples time through a Mach clock service.
+	{
+	private:
+		clock_serv_t system_clock; // Clock service handle obtained in the constructor.
+		mach_timespec_t time1, time2; // time1: sample taken by start(); time2: sample taken by get_elapsed_ns().
+	public:
+		nanotimer()
+		{
+			host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &system_clock); // Return status is not checked.
+		}
+
+		~nanotimer()
+		{
+			mach_port_deallocate(mach_task_self(), system_clock); // NOTE(review): no copy constructor/assignment is declared, so a copied timer would presumably release the same handle twice - confirm before copying timers.
+		}
+
+		inline void start() // Starts or restarts the measurement by overwriting time1.
+		{
+			clock_get_time(system_clock, &time1);
+		}
+
+		inline double get_elapsed_ms() // Elapsed time since start(), in milliseconds.
+		{
+			return static_cast<double>(get_elapsed_ns()) / 1000000.0;
+		}
+
+		inline double get_elapsed_us() // Elapsed time since start(), in microseconds.
+		{
+			return static_cast<double>(get_elapsed_ns()) / 1000.0;
+		}
+
+		double get_elapsed_ns() // Elapsed time since start(), in nanoseconds; time1 is not modified, so repeated calls measure from the same start().
+		{
+			clock_get_time(system_clock, &time2);
+			return ((1000000000.0 * static_cast<double>(time2.tv_sec - time1.tv_sec)) + static_cast<double>(time2.tv_nsec - time1.tv_nsec));
+		}
+	};
+
+
+
+
+// Linux/BSD implementation:
+#elif (defined(linux) || defined(__linux__) || defined(__linux)) || (defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__))
+	#include <time.h>
+	#include <sys/time.h>
+
+	namespace plf
+	{
+
+	class nanotimer // Stopwatch for Linux/BSD targets, sampling CLOCK_MONOTONIC via clock_gettime().
+	{
+	private:
+		struct timespec time1, time2; // time1: sample taken by start(); time2: sample taken by get_elapsed_ns().
+	public:
+		nanotimer() {} // Leaves time1/time2 unset: call start() before reading any elapsed time.
+		
+		inline void start() // Starts or restarts the measurement by overwriting time1.
+		{
+			clock_gettime(CLOCK_MONOTONIC, &time1);
+		}
+		
+		inline double get_elapsed_ms() // Elapsed time since start(), in milliseconds.
+		{
+			return get_elapsed_ns() / 1000000.0;
+		}
+
+		inline double get_elapsed_us() // Elapsed time since start(), in microseconds.
+		{
+			return get_elapsed_ns() / 1000.0;
+		}
+
+		double get_elapsed_ns() // Elapsed time since start(), in nanoseconds; time1 is not modified, so repeated calls measure from the same start().
+		{
+			clock_gettime(CLOCK_MONOTONIC, &time2);
+			return ((1000000000.0 * static_cast<double>(time2.tv_sec - time1.tv_sec)) + static_cast<double>(time2.tv_nsec - time1.tv_nsec));
+		}
+	};
+
+
+
+
+// Windows implementation:
+#elif defined(_WIN32)
+	#if defined(_MSC_VER) && !defined(NOMINMAX)
+		#define NOMINMAX // Otherwise MS compilers act like idiots when using std::numeric_limits<>::max() and including windows.h
+	#endif
+	
+	#include <windows.h>
+	
+	namespace plf
+	{
+
+	class nanotimer // Stopwatch for _WIN32 targets, based on the performance counter.
+	{
+	private:
+		LARGE_INTEGER ticks1, ticks2; // ticks1: counter sampled by start(); ticks2: counter sampled by get_elapsed_ms().
+		double frequency; // Value reported by QueryPerformanceFrequency(), used as ticks per second in get_elapsed_ms().
+	public:
+		nanotimer() // Caches the counter frequency once; ticks1/ticks2 stay unset until start() is called.
+		{
+			LARGE_INTEGER freq;
+			QueryPerformanceFrequency(&freq);
+			frequency = static_cast<double>(freq.QuadPart);
+		}
+
+		inline void start() // Starts or restarts the measurement by overwriting ticks1.
+		{
+			QueryPerformanceCounter(&ticks1);
+		}
+
+		double get_elapsed_ms() // Elapsed time since start(), in milliseconds; ticks1 is not modified.
+		{
+			QueryPerformanceCounter(&ticks2);
+			return (static_cast<double>(ticks2.QuadPart - ticks1.QuadPart) * 1000.0) / frequency;
+		}
+
+		inline double get_elapsed_us() // Elapsed time since start(), in microseconds (derived from get_elapsed_ms()).
+		{
+			return get_elapsed_ms() * 1000.0;
+		}
+
+		inline double get_elapsed_ns() // Elapsed time since start(), in nanoseconds (derived from get_elapsed_ms()).
+		{
+			return get_elapsed_ms() * 1000000.0;
+		}
+	};
+#endif
+// Else: unsupported OS - no timer class or delay helper is defined, so any use of plf::nanotimer fails to compile.
+
+
+
+#if defined(__MACH__) || (defined(linux) || defined(__linux__) || defined(__linux)) || (defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) || defined(_WIN32)
+// Busy-waits (polling a plf::nanotimer) until at least delay_ns nanoseconds have elapsed.
+// Altered from upstream plf_nanotimer: declared inline. The body lives in this header, so
+// without inline every translation unit that includes it would emit its own external
+// definition and linking two such units would fail with a duplicate symbol.
+inline void nanosecond_delay(double delay_ns)
+{
+	nanotimer timer;
+	timer.start();
+
+	while(timer.get_elapsed_ns() < delay_ns)
+	{} // Intentionally empty: the loop condition does the polling.
+}
+
+
+
+inline void microsecond_delay(double delay_us) // Blocks for at least delay_us microseconds by delegating to nanosecond_delay().
+{
+	nanosecond_delay(delay_us * 1000.0); // 1 microsecond = 1000 nanoseconds.
+}
+
+
+inline void millisecond_delay(double delay_ms) // Blocks for at least delay_ms milliseconds by delegating to nanosecond_delay().
+{
+	nanosecond_delay(delay_ms * 1000000.0); // 1 millisecond = 1,000,000 nanoseconds.
+}
+
+
+} // namespace
+#endif
+
+#endif // PLF_NANOTIMER
--- a/theano/gpuarray/c_code/plf_nanotimer/plf_nanotimer_readme.txt
+++ b/theano/gpuarray/c_code/plf_nanotimer/plf_nanotimer_readme.txt
+plf::nanotimer is a ~microsecond-precision cross-platform simple timer class (linux/bsd/mac/windows, C++03/C++11).
+
+
+Usage is as follows:
+
+
+	plf::nanotimer timer;
+
+
+	timer.start();
+	// Do something here
+	double results = timer.get_elapsed_ns();
+	std::cout << "Timing: " << results << " nanoseconds." << std::endl;
+	
+
+	timer.start(); // "start" has the same semantics as "restart".
+	// Do something else
+	results = timer.get_elapsed_ms();
+	std::cout << "Timing: " << results << " milliseconds." << std::endl;
+
+
+	timer.start();
+	plf::microsecond_delay(15); // Delay program for 15 microseconds
+	results = timer.get_elapsed_us();
+	std::cout << "Timing: " << results << " microseconds." << std::endl;
+
+
+
+
+Timer member functions:
+
+void timer.start(): start or restart timer
+
+double timer.get_elapsed_ns(): get elapsed time in nanoseconds
+
+double timer.get_elapsed_us(): get elapsed time in microseconds
+
+double timer.get_elapsed_ms(): get elapsed time in milliseconds
+
+
+
+Free-standing functions:
+
+void plf::millisecond_delay(double x): delay the program until x milliseconds have passed
+
+void plf::microsecond_delay(double x): delay the program until x microseconds have passed
+
+void plf::nanosecond_delay(double x): delay the program until x nanoseconds have passed
+
+
+
+I determined that a 'pause'-style function would add too much complexity to the class for simple benchmarking, which in turn might interfere with performance analysis, so if you need a 'pause' function do something like this:
+
+{
+	plf::nanotimer timer;
+
+
+	timer.start();
+	// Do something here
+	double results = timer.get_elapsed_ns();
+	
+	// Do something else - timer 'paused'
+	
+	timer.start();
+	
+	// Do stuff
+	
+	results += timer.get_elapsed_ns();
+	
+	std::cout << "Timing: " << results << " nanoseconds." << std::endl;
+}
+
+
+All plf:: library components are distributed under a Zlib License.
+plf::nanotimer (c) Copyright 2016 Matt Bentley
+Contact: mattreecebentley@gmail.com
+www.plflib.org
\ No newline at end of file