Implement cudnn implementation selection for FWD pass

f9b85e1e · --global · 2b83b6ac · f9b85e1e · f9b85e1e · f9b85e1e
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -416,12 +416,19 @@ class GpuDnnConv(DnnBase, COp):
        else:
            if self.workmem == 'none':
                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
+                choose_alg = '0'
            elif self.workmem == 'small':
                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
+                choose_alg = '0'
            elif self.workmem == 'large':
                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
+                choose_alg = '0'
+            elif self.workmem == 'time':
+                alg = "0"
+                choose_alg = '1'
            alg_def = ('CONV_ALGO', alg)
-        return [alg_def] + inpl_def
+            alg_choose_def = ('CHOOSE_ALGO', choose_alg)
+        return [alg_def, alg_choose_def] + inpl_def
    def make_node(self, img, kern, output, desc, alpha=None, beta=None):
        img = as_cuda_ndarray_variable(img)

--- a/theano/sandbox/cuda/dnn_conv_base.c
+++ b/theano/sandbox/cuda/dnn_conv_base.c
@@ -3,6 +3,15 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
 cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
+/* Keep track, from one execution to another, of the dimension of the inputs
+and the algorithm, if any, that was selected according to these dimensions
+and the amount of memory available at that time.
+*/
+int APPLY_SPECIFIC(previous_input_shape)[4];
+int APPLY_SPECIFIC(previous_kerns_shape)[4];
+cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
 #section init_code_struct
 cudnnStatus_t APPLY_SPECIFIC(err);
@@ -20,11 +29,20 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
  FAIL;
 }
 if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s", 
+  PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
 	       cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
 }
+APPLY_SPECIFIC(previous_input_shape)[0] = 0;
+APPLY_SPECIFIC(previous_input_shape)[1] = 0;
+APPLY_SPECIFIC(previous_input_shape)[2] = 0;
+APPLY_SPECIFIC(previous_input_shape)[3] = 0;
+APPLY_SPECIFIC(previous_kerns_shape)[0] = 0;
+APPLY_SPECIFIC(previous_kerns_shape)[1] = 0;
+APPLY_SPECIFIC(previous_kerns_shape)[2] = 0;
+APPLY_SPECIFIC(previous_kerns_shape)[3] = 0;
 #section cleanup_code_struct
 if (APPLY_SPECIFIC(input) != NULL)

--- a/theano/sandbox/cuda/dnn_fwd.c
+++ b/theano/sandbox/cuda/dnn_fwd.c
@@ -33,13 +33,86 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
  {
    size_t worksize;
    void *workspace;
+    cudnnConvolutionFwdAlgo_t chosen_algo;
+    if (CHOOSE_ALGO){
+      // Check if the input and the kernels have the same shape as they have
+      // last time the apply node was executed
+      bool same_shapes = true;
+      for (int i = 0; (i < 4) && same_shapes; i++)
+      {
+          same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] !=
+                          APPLY_SPECIFIC(previous_input_shape)[i]);
+          same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] !=
+                          APPLY_SPECIFIC(previous_kerns_shape)[i]);
+      }
+      if (same_shapes)
+      {
+        // The shape of the inputs and/or the kernels is different from the
+        // last execution. Use the current shapes to infer the implementation
+        // to use from now on.
+        // Get the amount of available memory
+        size_t free = 0, total = 0;
+        cudaError_t err2 = cudaMemGetInfo(&free, &total);
+        if (err2 != cudaSuccess){
+          cudaGetLastError();
+          fprintf(stderr,
+                  "Error when trying to find the memory information"
+                  " on the GPU: %s\n", cudaGetErrorString(err2));
+        }
+        // Obtain a convolution algorithm appropriate for the input and kernel
+        // shapes
+        err = cudnnGetConvolutionForwardAlgorithm(_handle,
+                                                  APPLY_SPECIFIC(input),
+                                                  APPLY_SPECIFIC(kerns),
+                                                  desc,
+                                                  APPLY_SPECIFIC(output),
+                                                  CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+                                                  free,
+                                                  &chosen_algo);
+        if (err != CUDNN_STATUS_SUCCESS) {
+          PyErr_Format(PyExc_RuntimeError,
+                       "GpuDnnConv: error selecting convolution algo: %s",
+                       cudnnGetErrorString(err));
+          return 1;
+        }
+        // Store the shapes of the inputs and kernels as well as the chosen
+        // algorithm for future use.
+        APPLY_SPECIFIC(previous_algo) = chosen_algo;
+        for (int i = 0; i < 4; i++)
+        {
+            APPLY_SPECIFIC(previous_input_shape)[i] =
+                                            CudaNdarray_HOST_DIMS(input)[i];
+            APPLY_SPECIFIC(previous_kerns_shape)[i] =
+                                            CudaNdarray_HOST_DIMS(kerns)[i];
+        }
+      }
+      else
+      {
+          // The shapes of the inputs and the kernels are the same as for the
+          // last execution. The convolution algorithm used last time can also
+          // be used here
+          chosen_algo = APPLY_SPECIFIC(previous_algo);
+      }
+    }
+    else
+    {
+      chosen_algo = CONV_ALGO;
+    }
    err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
-                                                  CONV_ALGO,
+                                                  chosen_algo,
                                                  &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
@@ -58,7 +131,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
      desc,
-      CONV_ALGO,
+      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));