Commit 403865ea authored by notoraptor

Remove third-party library and update dnn_fwd.

Parent commit: b7dda3d3
...@@ -89,19 +89,14 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache; ...@@ -89,19 +89,14 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#if __cplusplus < 201103L #if __cplusplus < 201103L
#include <plf_nanotimer/plf_nanotimer.h> const char* const _cppver = "No timing available: C++11 or later is required.";
const char* const _cppver = "Using plf_nanotimer: http://www.plflib.org/nanotimer.htm";
struct TheanoTimer {
double milliseconds;
plf::nanotimer timer;
void start() {timer.start();}
void end() {milliseconds = timer.get_elapsed_ms();}
};
#else #else
#define DEBUG_TIMING
#include <chrono> #include <chrono>
const char* const _cppver = "Using C++11 chrono"; const char* const _cppver = NULL;
struct TheanoTimer { struct TheanoTimer {
double milliseconds; double milliseconds;
std::chrono::steady_clock::time_point base; std::chrono::steady_clock::time_point base;
......
...@@ -3,7 +3,7 @@ prev_algo.algo = PARAMS->conv_algo; ...@@ -3,7 +3,7 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0; reuse_algo = 0;
hash_prefix = std::string("FWD|GPU#"); hash_prefix = std::string("FWD|GPU#");
#ifdef DEBUG #ifdef DEBUG_TIMING
total_computation_time = 0; total_computation_time = 0;
total_selection_time = 0; total_selection_time = 0;
n_computations = 0; n_computations = 0;
...@@ -27,6 +27,8 @@ std::string hash_prefix; ...@@ -27,6 +27,8 @@ std::string hash_prefix;
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif
#ifdef DEBUG_TIMING
double total_computation_time; double total_computation_time;
double total_selection_time; double total_selection_time;
size_t n_computations; size_t n_computations;
...@@ -140,8 +142,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -140,8 +142,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0; bool use_cached = 0;
#ifdef DEBUG #ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
TheanoTimer timer; TheanoTimer timer;
fprintf(stderr, "%s\n", _cppver);
#endif #endif
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
...@@ -263,17 +267,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -263,17 +267,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
o = pygpu_empty(PyGpuArray_NDIM(*output), PyGpuArray_DIMS(*output), (*output)->ga.typecode, GA_C_ORDER, c, Py_None); o = pygpu_empty(PyGpuArray_NDIM(*output), PyGpuArray_DIMS(*output), (*output)->ga.typecode, GA_C_ORDER, c, Py_None);
} }
// We don't sync the buffer as we don't care about the values. #ifdef DEBUG_TIMING
#ifdef DEBUG
timer.start(); timer.start();
#endif #endif
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o),
1, &count, &choice, *(void **)tmpmem, 1, &count, &choice, *(void **)tmpmem,
maxfree); maxfree);
#ifdef DEBUG #ifdef DEBUG_TIMING
timer.end(); timer.end();
#endif #endif
gpudata_release(tmpmem); gpudata_release(tmpmem);
...@@ -310,14 +314,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -310,14 +314,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
mathtype = choice.mathType; mathtype = choice.mathType;
#endif #endif
} else { } else {
#ifdef DEBUG #ifdef DEBUG_TIMING
timer.start(); timer.start();
#endif #endif
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG #ifdef DEBUG_TIMING
timer.end(); timer.end();
#endif #endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -328,7 +332,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -328,7 +332,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
} }
#ifdef DEBUG #ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds; total_selection_time += timer.milliseconds;
++n_selections; ++n_selections;
#endif #endif
...@@ -395,6 +399,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -395,6 +399,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
worksize, worksize,
hashkey.c_str() hashkey.c_str()
); );
#endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) { if (!(reuse_algo || use_cached)) {
// We have selected an algorithm at runtime. // We have selected an algorithm at runtime.
// `timer` still contains timing about selection step. // `timer` still contains timing about selection step.
...@@ -422,6 +428,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -422,6 +428,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} // params->choose_algo } // params->choose_algo
{
gpudata *workspace = 0; gpudata *workspace = 0;
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
...@@ -438,7 +445,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -438,7 +445,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG #ifdef DEBUG_TIMING
GpuArray_sync(&(*output)->ga); GpuArray_sync(&(*output)->ga);
timer.start(); timer.start();
#endif #endif
...@@ -463,8 +470,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -463,8 +470,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
}
#ifdef DEBUG #ifdef DEBUG_TIMING
GpuArray_sync(&(*output)->ga); GpuArray_sync(&(*output)->ga);
timer.end(); timer.end();
total_computation_time += timer.milliseconds; total_computation_time += timer.milliseconds;
...@@ -478,7 +486,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -478,7 +486,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
#ifdef DEBUG #ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", timer.milliseconds); fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) { if (n_computations > 1) {
fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n", fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n",
......
All plf:: modules are provided under a zlib license:
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgement in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Copyright (c) 2015 Matthew Bentley
// Copyright (c) 2016, Matthew Bentley (mattreecebentley@gmail.com) www.plflib.org
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#ifndef PLF_NANOTIMER
#define PLF_NANOTIMER
// ~Nanosecond-precision cross-platform (linux/bsd/mac/windows, C++03/C++11) simple timer class:
// Mac OSX implementation:
#if defined(__MACH__)
#include <mach/clock.h>
#include <mach/mach.h>
namespace plf
{
class nanotimer
{
private:
clock_serv_t system_clock;
mach_timespec_t time1, time2;
public:
nanotimer()
{
host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &system_clock);
}
~nanotimer()
{
mach_port_deallocate(mach_task_self(), system_clock);
}
inline void start()
{
clock_get_time(system_clock, &time1);
}
inline double get_elapsed_ms()
{
return static_cast<double>(get_elapsed_ns()) / 1000000.0;
}
inline double get_elapsed_us()
{
return static_cast<double>(get_elapsed_ns()) / 1000.0;
}
double get_elapsed_ns()
{
clock_get_time(system_clock, &time2);
return ((1000000000.0 * static_cast<double>(time2.tv_sec - time1.tv_sec)) + static_cast<double>(time2.tv_nsec - time1.tv_nsec));
}
};
// Linux/BSD implementation:
#elif (defined(linux) || defined(__linux__) || defined(__linux)) || (defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__))
#include <time.h>
#include <sys/time.h>
namespace plf
{
// ~Nanosecond-resolution monotonic timer for Linux and the BSDs,
// backed by clock_gettime(CLOCK_MONOTONIC).
class nanotimer
{
private:
	struct timespec start_point, end_point;
public:
	nanotimer() {}

	// Start (or restart) the timer.
	inline void start()
	{
		clock_gettime(CLOCK_MONOTONIC, &start_point);
	}

	// Elapsed time since start(), in milliseconds.
	inline double get_elapsed_ms()
	{
		return get_elapsed_ns() / 1000000.0;
	}

	// Elapsed time since start(), in microseconds.
	inline double get_elapsed_us()
	{
		return get_elapsed_ns() / 1000.0;
	}

	// Elapsed time since start(), in nanoseconds.
	double get_elapsed_ns()
	{
		clock_gettime(CLOCK_MONOTONIC, &end_point);
		const double seconds = static_cast<double>(end_point.tv_sec - start_point.tv_sec);
		const double nanoseconds = static_cast<double>(end_point.tv_nsec - start_point.tv_nsec);
		return 1000000000.0 * seconds + nanoseconds;
	}
};
// Windows implementation:
#elif defined(_WIN32)
#if defined(_MSC_VER) && !defined(NOMINMAX)
#define NOMINMAX // Otherwise MS compilers act like idiots when using std::numeric_limits<>::max() and including windows.h
#endif
#include <windows.h>
namespace plf
{
class nanotimer
{
private:
LARGE_INTEGER ticks1, ticks2;
double frequency;
public:
nanotimer()
{
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
frequency = static_cast<double>(freq.QuadPart);
}
inline void start()
{
QueryPerformanceCounter(&ticks1);
}
double get_elapsed_ms()
{
QueryPerformanceCounter(&ticks2);
return (static_cast<double>(ticks2.QuadPart - ticks1.QuadPart) * 1000.0) / frequency;
}
inline double get_elapsed_us()
{
return get_elapsed_ms() * 1000.0;
}
inline double get_elapsed_ns()
{
return get_elapsed_ms() * 1000000.0;
}
};
#endif
// Else: failure warning - your OS is not supported
#if defined(__MACH__) || (defined(linux) || defined(__linux__) || defined(__linux)) || (defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) || defined(_WIN32)
void nanosecond_delay(double delay_ns)
{
nanotimer timer;
timer.start();
while(timer.get_elapsed_ns() < delay_ns)
{};
}
inline void microsecond_delay(double delay_us)
{
nanosecond_delay(delay_us * 1000.0);
}
inline void millisecond_delay(double delay_ms)
{
nanosecond_delay(delay_ms * 1000000.0);
}
} // namespace
#endif
#endif // PLF_NANOTIMER
plf::nanotimer is a ~microsecond-precision cross-platform simple timer class (linux/bsd/mac/windows, C++03/C++11).
Usage is as follows:
plf::nanotimer timer;
timer.start();
// Do something here
double results = timer.get_elapsed_ns();
std::cout << "Timing: " << results << " nanoseconds." << std::endl;
timer.start(); // "start" has the same semantics as "restart".
// Do something else
results = timer.get_elapsed_ms();
std::cout << "Timing: " << results << " milliseconds." << std::endl;
timer.start();
plf::microsecond_delay(15); // Delay program for 15 microseconds
results = timer.get_elapsed_us();
std::cout << "Timing: " << results << " microseconds." << std::endl;
Timer member functions:
void timer.start(): start or restart timer
double timer.get_elapsed_ns(): get elapsed time in nanoseconds
double timer.get_elapsed_us(): get elapsed time in microseconds
double timer.get_elapsed_ms(): get elapsed time in milliseconds
Free-standing functions:
void plf::millisecond_delay(double x): delay the program until x milliseconds have passed
void plf::microsecond_delay(double x): delay the program until x microseconds have passed
void plf::nanosecond_delay(double x): delay the program until x nanoseconds have passed
I determined that a 'pause'-style function would add too much complexity to the class for simple benchmarking, which in turn might interfere with performance analysis, so if you need a 'pause' function do something like this:
{
plf::nanotimer timer;
timer.start()
// Do something here
double results = timer.get_elapsed_ns();
// Do something else - timer 'paused'
timer.start()
// Do stuff
results += timer.get_elapsed_ns();
std::cout << "Timing: " << results << " nanoseconds." << std::endl;
}
All plf:: library components are distributed under a Zlib License.
plf::nanotimer (c) Copyright 2016 Matt Bentley
Contact: mattreecebentley@gmail.com
www.plflib.org
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论