Merge pull request #540 from nouiz/gpuconv

Gpuconv

Merge pull request #540 from nouiz/gpuconv
553b256e · Olivier Delalleau · 045826a6 · dc6633bb · 553b256e · 553b256e
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -10,7 +10,7 @@ Documentation
 Interface changes
 * In 0.5, we removed the deprecated sharedvar.value property.
-   Now we raise an error if you access it.
+   Now we raise an error if you access it. (Frederic B.)
 * theano.function does not accept duplicate inputs, so function([x, x], ...)
   does not work anymore. (Pascal L.)
 * theano.function now raises an error if some of the provided inputs are
@@ -23,15 +23,16 @@ New Features
 * debugprint new param ids=["CHAR", "id", "int", ""]
   This makes the identifier printed to be the python id, a unique char, a
   unique int, or not have it printed. We changed the default to be "CHAR"
-   as this is more readable.
+   as this is more readable. (Frederic B.)
 * debugprint new param stop_on_name=[False, True]. If True, we don't print
   anything below an intermediate variable that has a name. Defaults to False.
- * debugprint does not print anymore the "|" symbol in a column after the last input.
+   (Frederic B.)
+ * debugprint does not print anymore the "|" symbol in a column after the last input. (Frederic B.)
 * If you use Enthought Python Distribution (EPD) now we use its blas
-   implementation by default.
+   implementation by default. (Frederic B.)
 Sparse Sandbox graduate
- * Remove0 op: it remove store element with value 0.
+ * Remove0 op: it removes stored elements with value 0. (Frederic B.)
 Sparse Sandbox Addition (Not reviewed/documented/tested, but used by some people)
 * They are all in the theano.sparse.sandbox.sp2 module
@@ -50,7 +51,9 @@ Crash Fix
   empty string (Frederic B.)
 * When importing theano on a computer without GPU with the Theano
   flags 'device' or 'init_gpu_device' set to gpu* (Frederic B., reported by  Luo Heng)
+ * An optimization printed a useless error when scipy is not available. (Frederic B.)
+ * Gpu conv crash/slowdown on newer hardware? (James B.)
+ * Better error handling in gpu conv (Frederic B.)
 =============
 Release Notes

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -704,7 +704,7 @@ class GpuConv(GpuOp):
    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 17)
+        return (0, 18)
    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of

--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
@@ -32,14 +32,29 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    if (verbose>1)
    {
-        fprintf(stderr, "INFO: Running conv_valid version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID);
+        fprintf(stderr,
-        fprintf(stderr, "INFO:   img  dim: %i %i %i %i  img  stride: %i %i %i %i\n", 
+                "INFO: Running conv_valid version=%d,"
-                CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
+                " MACRO kern_width=%d with inputs:\n",
-                CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]);
+                version, THEANO_KERN_WID);
-        fprintf(stderr, "INFO:   kern dim: %i %i %i %i  kern stride: %i %i %i %i\n",
+        fprintf(stderr,
-                CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3],
+                "INFO:   img  dim: %i %i %i %i  img  stride: %i %i %i %i\n",
-                CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]);
+                CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
-        fprintf(stderr, "INFO:   subsample_rows=%d, subsample_cols=%d\n", subsample_rows, subsample_cols);
+                CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
+                CudaNdarray_HOST_STRIDES(img)[0],
+                CudaNdarray_HOST_STRIDES(img)[1],
+                CudaNdarray_HOST_STRIDES(img)[2],
+                CudaNdarray_HOST_STRIDES(img)[3]);
+        fprintf(stderr,
+                "INFO:   kern dim: %i %i %i %i  kern stride: %i %i %i %i\n",
+                CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
+                CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
+                CudaNdarray_HOST_STRIDES(kern)[0],
+                CudaNdarray_HOST_STRIDES(kern)[1],
+                CudaNdarray_HOST_STRIDES(kern)[2],
+                CudaNdarray_HOST_STRIDES(kern)[3]);
+        fprintf(stderr,
+                "INFO:   subsample_rows=%d, subsample_cols=%d\n",
+                subsample_rows, subsample_cols);
    }
    //Check the output size is valid
@@ -98,9 +113,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
    bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
-    //if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy
+    //if the lower 2 dims are c_contiguous but flipped, unflipping the
+    // stride and not flipping the kernel in shared memory
    //allow to use a version that use less registers(so is faster)
-    //the unflipped version of variable haev the original value when we don't need to unflip it, but have the new value when we unflip it.
+    //the unflipped version of variable have the original value when
+    //we don't need to unflip it, but have the new value when we unflip it.
    bool kern_flipped=true;
    bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
    float * kern_data_unflipped = kern->devdata;
@@ -115,8 +132,12 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
      kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
    }
-    //if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter in condition where we will lower the occupency due to shared memory and/or registers.
+    //if we remove the restriction
-    if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){
+    //img_size_byte+kern_size_byte>8*1024, we can enter in condition where
+    //we will lower the occupancy due to shared memory and/or registers.
+    if ((version == -1) &&
+        (out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
+        out_size<=256){
      //condition for exec 
      if(!subsample &&
        out_contiguous &&
@@ -158,13 +179,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
-            if (verbose) fprintf(stderr, "INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split);
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: used 'conv_patch' version %s nb_split=%d\n",
+                      threads.y==out_len ? "no split": "split", nb_split);
            work_complete = true;
        }
        else
        {
-            if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split);
+            if (verbose)
-            if (verbose) fprintf(stderr, "INFO: impl 'conv_patch' failed (%s), trying next implementation\n",
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i, nb_split=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y, nb_split);
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: impl 'conv_patch' failed (%s),"
+                      " trying next implementation\n",
                      cudaGetErrorString(sts));
        }
    }
@@ -246,30 +278,47 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        {
            if (verbose>1)
              fprintf(stderr,
-                     "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		     " kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i,"
+                      " shared_size=%i, nb_threads=%i,"
-		     " kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i,",
+                      " kern_flipped=true, accumulate=false, kern_width=%i,"
+                      " img_c_contiguous_2d=%i,"
+                      " kern_c_contiguous_2d=%i, nb_split=%i,"
+                      " preload_full_kernel=%i,"
                      " subsample_rows=%i, subsample_cols=%i\n",
-		     threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y,
                      THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
-		     nb_split, preload_full_kernel, subsample_rows, subsample_cols);
+                      nb_split, preload_full_kernel,
-            if (verbose) fprintf(stderr,
+                      subsample_rows, subsample_cols);
-                    "INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i,"
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: used 'conv_patch_stack' version with nb_split=%i"
+                      " and preload_full_kernel=%i,"
                      " subsample_rows=%i, subsample_cols=%i\n",
-				 nb_split,preload_full_kernel, subsample_rows, subsample_cols);
+                      nb_split, preload_full_kernel,
+                      subsample_rows, subsample_cols);
            work_complete = true;
        }
        else
        {
            if (verbose)
-	      fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
+              fprintf(stderr,
-		     " kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i,"
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		     " kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i,",
+                      " shared_size=%i, nb_threads=%i,"
+                      " kern_flipped=true, accumulate=false,"
+                      " kern_width=%i, img_c_contiguous_2d=%i,"
+                      " kern_c_contiguous_2d=%i, nb_split=%i,"
+                      " preload_full_kernel=%i,"
                      " subsample_rows=%i, subsample_cols=%i\n",
-		     threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y,
                      THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
-		     nb_split, preload_full_kernel, subsample_rows, subsample_cols);
+                      nb_split, preload_full_kernel,
-            if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
+                      subsample_rows, subsample_cols);
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: impl 'conv_patch_stack' failed (%s),"
+                      " trying next implementation\n",
                      cudaGetErrorString(sts));
        }
    }
@@ -309,12 +358,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        if (cudaSuccess == sts)
        {
            work_complete = true;
-            if (verbose) fprintf(stderr, "INFO: used 'conv_rows' version\n");
+            if (verbose)
+              fprintf(stderr, "INFO: used 'conv_rows' version\n");
        }
        else
        {
-            if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
+            if (verbose)
-            if (verbose) fprintf(stderr, "INFO: impl 'conv_rows' failed (%s), trying next implementation\n",
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y);
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: impl 'conv_rows' failed (%s),"
+                      " trying next implementation\n",
                      cudaGetErrorString(sts));
        }
    }
@@ -327,7 +385,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    {
        int nb_row=1;
        int max_threads=512;
-	//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
+        //TODO:if not c_contiguous, lower max_thread as we use 22
+        //registers by thread and we won't execute 2 block in one MP.
        for(int i=2;i<=out_len;i++){
          if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
            nb_row=i;
@@ -345,7 +404,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
                  int, int);
        if (0)
-          fprintf(stderr, "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
+          fprintf(stderr,
+                  "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
                  img_contiguous_2d, kern_contiguous_2d,
                  threads.x, threads.y, threads.z,
                  grid.x, grid.y, grid.z);
@@ -373,13 +433,27 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        if (cudaSuccess == sts)
        {
            work_complete = true;
-	    if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
+            if (verbose>1)
-            if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y);
+            if (verbose)
+              fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
        }
        else
        {
-            if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
+            if (verbose)
-            if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n",
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y);
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: impl 'conv_rows_stack' failed (%s),"
+                      " trying next implementation\n",
                      cudaGetErrorString(sts));
        }
    }
@@ -448,15 +522,31 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        if (cudaSuccess == sts) 
        {
            work_complete = true;
-	    if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n",
+            if (verbose>1)
-				  threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
+              fprintf(stderr,
-            if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y);
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: used 'conv_rows_stack2' version %s with"
+                      " %d row(s).\n",
+                      (version==9?"'load full kernel'":
+                       "'load 1 kern row at a time'"),nb_row);
        }
        else
        {
-            if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
+            if (verbose)
-				threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3));
+              fprintf(stderr,
-            if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i version=%d\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y,(version==9?2:3));
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: impl 'conv_rows_stack2' failed (%s),"
+                      " trying next implementation\n",
                      cudaGetErrorString(sts));
        }
    }
@@ -556,9 +646,16 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
                else if(!kern_flipped && !ccontig  && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
            CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
-            if (verbose) fprintf(stderr, "INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n",
+            if (verbose)
-                                kern_flipped,ccontig,nb_split,full_kern);
+              fprintf(stderr,
-            if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n",
+                      "INFO: using 'conv_patch_stack_reduce' version"
+                      " kern_flipped=%i ccontig=%i nb_split=%d,"
+                      " preload_full_kern=%d\n",
+                      kern_flipped, ccontig, nb_split, full_kern);
+            if (verbose>1)
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i,"
+                      " grid.y=%i, shared_size=%i, nb_threads=%i\n",
                      threads.x, threads.y, threads.z, grid.x, grid.y,
                      shared_size, threads.x * threads.y * threads.z);
            f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
@@ -575,8 +672,18 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
            }
            else
            {
-                if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z);
+                if (verbose)
-                if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n",
+                  fprintf(stderr,
+                          "threads.x=%i, threads.y=%i, threads.z=%i,"
+                          " grid.x=%i, grid.y=%i,shared_size=%i,"
+                          " nb_threads=%i\n",
+                          threads.x, threads.y, threads.z,
+                          grid.x, grid.y, shared_size,
+                          threads.x * threads.y * threads.z);
+                if (verbose)
+                  fprintf(stderr,
+                          "INFO: impl 'conv_patch_stack_reduce' failed (%s),"
+                          " trying next implementation\n",
                          cudaGetErrorString(sts));
            }
        } // else no good nb_splits was found
@@ -651,12 +758,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        if (cudaSuccess == sts) 
        {
            work_complete = true;
-            if (verbose) fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
+            if (verbose)
+              fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
        }
        else
        {
-            if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, shared_size=%i, nb_threads=%i\n", n_threads.x, n_threads.y, n_blocks, n_reduce_buf, n_threads.x * n_threads.y);
+            if (verbose)
-            if (verbose) fprintf(stderr, "INFO: impl 'conv_valid_row_reduce' failed (%s), trying next implementation\n",
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      n_threads.x, n_threads.y, n_blocks,
+                      n_reduce_buf, n_threads.x * n_threads.y);
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: impl 'conv_valid_row_reduce' failed (%s),"
+                      " trying next implementation\n",
                      cudaGetErrorString(sts));
        }
    }
@@ -665,32 +781,61 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    {
        int outsize = CudaNdarray_SIZE(out);
        int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
-        int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK);
+        int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
+                                 NUM_VECTOR_OP_THREADS_PER_BLOCK);
        if (1)
        {
-            if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n");
+            if (verbose)
-            if (verbose>1) fprintf(stderr, "      img : %i %i %i %i %p  %i %i %i %i\n",
+              fprintf(stderr, "INFO: launching conv_reference_valid\n");
+            if (verbose>1)
+              fprintf(stderr, "      img : %i %i %i %i %p  %i %i %i %i\n",
                      nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
                      img->devdata,
-                    CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]);
+                      CudaNdarray_HOST_STRIDES(img)[0],
-            if (verbose>1) fprintf(stderr, "      kern: %i %i %i %i %p  %i %i %i %i\n", 
+                      CudaNdarray_HOST_STRIDES(img)[1],
+                      CudaNdarray_HOST_STRIDES(img)[2],
+                      CudaNdarray_HOST_STRIDES(img)[3]);
+            if (verbose>1)
+              fprintf(stderr, "      kern: %i %i %i %i %p  %i %i %i %i\n",
                      nkern, nstack, kern_len, kern_wid,
                      kern->devdata,
-                    CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3]
+                      CudaNdarray_HOST_STRIDES(kern)[0],
-                        );
+                      CudaNdarray_HOST_STRIDES(kern)[1],
-            if (verbose>1) fprintf(stderr, "      out : %i %i %i %i %p  %i %i %i %i\n",
+                      CudaNdarray_HOST_STRIDES(kern)[2],
-                    CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
+                      CudaNdarray_HOST_STRIDES(kern)[3]);
+            if (verbose>1)
+              fprintf(stderr, "      out : %i %i %i %i %p  %i %i %i %i\n",
+                      CudaNdarray_HOST_DIMS(out)[0],
+                      CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
                      out->devdata,
-                    CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]);
+                      CudaNdarray_HOST_STRIDES(out)[0],
-            if (verbose>1) fprintf(stderr, "   launch params: %i %i %i\n", outsize, n_blocks, n_threads);
+                      CudaNdarray_HOST_STRIDES(out)[1],
+                      CudaNdarray_HOST_STRIDES(out)[2],
+                      CudaNdarray_HOST_STRIDES(out)[3]);
+            if (verbose>1)
+              fprintf(stderr, "   launch params: %i %i %i\n",
+                      outsize, n_blocks, n_threads);
        }
-        conv_reference_valid<<<n_blocks, n_threads>>>( nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
+        conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
+                CudaNdarray_HOST_DIMS(img)[1],
                img_len, img_wid,
                kern_len, kern_wid,
                out_len, out_wid,
-                img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3],
+                img->devdata,
-                kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3],
+                CudaNdarray_HOST_STRIDES(img)[0],
-                out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3],
+                CudaNdarray_HOST_STRIDES(img)[1],
+                CudaNdarray_HOST_STRIDES(img)[2],
+                CudaNdarray_HOST_STRIDES(img)[3],
+                kern->devdata,
+                CudaNdarray_HOST_STRIDES(kern)[0],
+                CudaNdarray_HOST_STRIDES(kern)[1],
+                CudaNdarray_HOST_STRIDES(kern)[2],
+                CudaNdarray_HOST_STRIDES(kern)[3],
+                out->devdata,
+                CudaNdarray_HOST_STRIDES(out)[0],
+                CudaNdarray_HOST_STRIDES(out)[1],
+                CudaNdarray_HOST_STRIDES(out)[2],
+                CudaNdarray_HOST_STRIDES(out)[3],
                subsample_rows, subsample_cols);
        CNDA_THREAD_SYNC;
@@ -698,26 +843,37 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        if (cudaSuccess == sts)
        {
            work_complete = true;
-            if (verbose) fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
+            if (verbose)
+              fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
        }
        else
        {
-            PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_valid! (%s)",
+            PyErr_Format(PyExc_RuntimeError,
+                         "ERROR: all implementations failed for"
+                         " CudaNdarray_conv_valid! (%s)",
                         cudaGetErrorString(sts));
            return -1;
        }
    }
-    assert (work_complete);
+    if (!work_complete)
+    {
+      PyErr_Format(PyExc_RuntimeError,
+                   "ERROR: no implementation(s) worked for"
+                   " CudaNdarray_conv_valid!"
+                   " Version asked(%d) (-1 mean use an heuristic)",
+                   version);
+        return -1;
+    }
    return 0;
-            //PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
-            //return -1;
 }
 int
-CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
+CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
+                      CudaNdarray * out, int subsample_rows,
+                      int subsample_cols, int version = -1, int verbose=0)
 {
-    const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
+  //144 is the biggest static shared size used with compiling this file.
+    const int shared_avail = SHARED_SIZE - 150;
    int work_complete = 0;
    if (img->nd != 4)
@@ -775,9 +931,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
    //const int out_size_byte = out_size*sizeof(float); // unused 
-    if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
+    if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) ||
-      PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
+          (THEANO_KERN_WID == 0))){
-		   " %d kernel columns, but the kernel we received had %d columns!",
+      PyErr_Format(PyExc_ValueError,
+                   "ERROR: This GpuConv code was compiled for"
+                   " %d kernel columns, but the kernel we received"
+                   " had %d columns!",
                   THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
      return -1;
    }
@@ -793,9 +952,11 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
    bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
-    //if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy
+    //if the lower 2 dims are c_contiguous but flipped, unflipping the
+    //stride and not flipping the kernel in shared memory
    //allow to use a version that use less registers(so is faster)
-    //the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it.
+    //the unflipped version of variable have the original value when
+    //we don't need to unflip it, but have the new value when we unflip it.
    bool kern_flipped=true;
    bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
    float * kern_data_unflipped = kern->devdata;
@@ -812,13 +973,22 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
    if (verbose>1)
    {
-        printf("INFO: Running conv_full version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID);
+        printf("INFO: Running conv_full version=%d,"
+               " MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
        printf("INFO:   img  dim: %i %i %i %i  img  stride: %i %i %i %i\n", 
-                CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
+               CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
-                CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]);
+               CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
+               CudaNdarray_HOST_STRIDES(img)[0],
+               CudaNdarray_HOST_STRIDES(img)[1],
+               CudaNdarray_HOST_STRIDES(img)[2],
+               CudaNdarray_HOST_STRIDES(img)[3]);
        printf("INFO:   kern dim: %i %i %i %i  kern stride: %i %i %i %i\n",
-                CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3],
+               CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
-                CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]);
+               CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
+               CudaNdarray_HOST_STRIDES(kern)[0],
+               CudaNdarray_HOST_STRIDES(kern)[1],
+               CudaNdarray_HOST_STRIDES(kern)[2],
+               CudaNdarray_HOST_STRIDES(kern)[3]);
    }
    if (!subsample &&
@@ -840,13 +1010,16 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
        }
        if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
-	//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
+        //we use ceil_intdiv in case out_len is not a multiple
+        //of nb_split: we want nb_split to be the number of iterations.
        //Max of 16k of shared memory
        if(version==5)
          while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
        //327 as we use 25 register
-	//version 5 will have only 1 block running at a time, so we can use 32 registers per threads, but their is some other stuff that for the limit to bu lower then 512.
+        //version 5 will have only 1 block running at a time, so we
+        //can use 32 registers per thread, but there is some other stuff that
+        //forces the limit to be lower than 512.
        int max_thread = (version!=5?327:450);
        while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
        if(version==-1 && out_size>512)version=4;
@@ -855,7 +1028,8 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
        if(version==-1 && nb_split>1) version=4;
        else if(version==-1) version=3;
-	else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more than 1 split as to be always execute.
+        //force version 4 when more than 1 split is needed so it can always execute.
+        else if(version==3 && nb_split!=1) version=4;
        assert(version!=3 || nb_split==1);
        assert(version!=5 || kern_len>1);
@@ -901,15 +1075,39 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
-	  if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
+          if (verbose>1)
-            if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false"));
+            fprintf(stderr,
+                    "threads.x=%i, threads.y=%i, threads.z=%i,"
+                    " grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
+                    " out_len=%i, nb_split=%i, version=%i\n",
+                    threads.x, threads.y, threads.z,
+                    grid.x, grid.y, shared_size,
+                    threads.x * threads.y * threads.z,
+                    out_len, nb_split, version);
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: used 'conv_full_patch_stack_padded'"
+                      " nb_split=%d low_mem=%s\n",
+                      nb_split, (version==5?"true":"false"));
            work_complete = true;
        }
        else
        {
-	  if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
+          if (verbose)
-            if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n",
+            fprintf(stderr,
-				version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"),
+                    "threads.x=%i, threads.y=%i, threads.z=%i,"
+                    " grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
+                    " out_len=%i, nb_split=%i, version=%i\n",
+                    threads.x, threads.y, threads.z,
+                    grid.x, grid.y, shared_size,
+                    threads.x * threads.y * threads.z,
+                    out_len, nb_split, version);
+          if (verbose)
+            fprintf(stderr,
+                    "INFO: impl 'conv_full_patch_stack_padded' %s %s"
+                    " failed (%s), trying next implementation\n",
+                    version==3?"no split": "split",
+                    (version==5?"low_mem":"not_low_mem"),
                    cudaGetErrorString(sts));
        }                         
    }
@@ -943,8 +1141,16 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
        }
        else
        {
-            if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
+            if (verbose)
-            if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n",
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y, shared_size,
+                      threads.x * threads.y);
+            if (verbose)
+              fprintf(stderr,
+                      "INFO: impl 'conv_full_patch' failed (%s),"
+                      " trying next implementation\n",
                      cudaGetErrorString(sts));
        }                         
    }
@@ -993,8 +1199,15 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
        }
        else
        {
-            if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
+            if (verbose)
-            if (verbose) fprintf(stderr, "INFO: impl 'conv_full_load_everything' failed (%s), trying next implementation\n",
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y, shared_size,
+                      threads.x * threads.y);
+            if (verbose)
+              fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
+                      " failed (%s), trying next implementation\n",
                      cudaGetErrorString(sts));
        }
    }
@@ -1034,13 +1247,20 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts) 
        {
-            if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
+            if (verbose)
+              fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
            work_complete = true;
        }
        else
        {
-            if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
+            if (verbose)
-            if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y);
+            if (verbose)
+              fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
                      cudaGetErrorString(sts));
        }                         
    }
@@ -1050,50 +1270,98 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
        int outsize = CudaNdarray_SIZE(out);
        int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
-        int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK);
+        int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
+                                 NUM_VECTOR_OP_THREADS_PER_BLOCK);
        if (0)
        {
-            if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n");
+            if (verbose)
-            if (verbose) fprintf(stderr, "      img : %i %i %i %i %p  %i %i %i %i\n",
+              fprintf(stderr, "INFO: launching conv_reference_valid\n");
-                    CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
+            if (verbose)
+              fprintf(stderr, "      img : %i %i %i %i %p  %i %i %i %i\n",
+                      CudaNdarray_HOST_DIMS(img)[0],
+                      CudaNdarray_HOST_DIMS(img)[1],
+                      CudaNdarray_HOST_DIMS(img)[2],
+                      CudaNdarray_HOST_DIMS(img)[3],
                      img->devdata,
-                    CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]);
+                      CudaNdarray_HOST_STRIDES(img)[0],
-            if (verbose) fprintf(stderr, "      kern: %i %i %i %i %p  %i %i %i %i\n", 
+                      CudaNdarray_HOST_STRIDES(img)[1],
-                    CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
+                      CudaNdarray_HOST_STRIDES(img)[2],
+                      CudaNdarray_HOST_STRIDES(img)[3]);
+            if (verbose)
+              fprintf(stderr, "      kern: %i %i %i %i %p  %i %i %i %i\n",
+                      CudaNdarray_HOST_DIMS(kern)[0],
+                      CudaNdarray_HOST_DIMS(kern)[1],
+                      CudaNdarray_HOST_DIMS(kern)[2],
+                      CudaNdarray_HOST_DIMS(kern)[3],
                      kern->devdata,
-                    CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3]
+                      CudaNdarray_HOST_STRIDES(kern)[0],
+                      CudaNdarray_HOST_STRIDES(kern)[1],
+                      CudaNdarray_HOST_STRIDES(kern)[2],
+                      CudaNdarray_HOST_STRIDES(kern)[3]
                        );
-            if (verbose) fprintf(stderr, "      out : %i %i %i %i %p  %i %i %i %i\n",
+            if (verbose)
-                    CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
+              fprintf(stderr, "      out : %i %i %i %i %p  %i %i %i %i\n",
+                      CudaNdarray_HOST_DIMS(out)[0],
+                      CudaNdarray_HOST_DIMS(out)[1],
+                      CudaNdarray_HOST_DIMS(out)[2],
+                      CudaNdarray_HOST_DIMS(out)[3],
                      out->devdata,
-                    CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]);
+                      CudaNdarray_HOST_STRIDES(out)[0],
-            if (verbose) fprintf(stderr, "   launch params: %i %i %i\n", outsize, n_blocks, n_threads);
+                      CudaNdarray_HOST_STRIDES(out)[1],
-            if (verbose) fprintf(stderr, "   subsample params: %i %i\n", subsample_rows, subsample_cols);
+                      CudaNdarray_HOST_STRIDES(out)[2],
+                      CudaNdarray_HOST_STRIDES(out)[3]);
+            if (verbose)
+              fprintf(stderr, "   launch params: %i %i %i\n",
+                      outsize, n_blocks, n_threads);
+            if (verbose)
+              fprintf(stderr, "   subsample params: %i %i\n",
+                      subsample_rows, subsample_cols);
        }
-        conv_reference_full<<<n_blocks, n_threads>>>(CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(img)[1],
+        conv_reference_full<<<n_blocks, n_threads>>>(
+                CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
+                CudaNdarray_HOST_DIMS(img)[1],
                CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
                CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
                CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
-                img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3],
+                img->devdata, CudaNdarray_HOST_STRIDES(img)[0],
-                kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3],
+                CudaNdarray_HOST_STRIDES(img)[1],
-                out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3],
+                CudaNdarray_HOST_STRIDES(img)[2],
+                CudaNdarray_HOST_STRIDES(img)[3],
+                kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0],
+                CudaNdarray_HOST_STRIDES(kern)[1],
+                CudaNdarray_HOST_STRIDES(kern)[2],
+                CudaNdarray_HOST_STRIDES(kern)[3],
+                out->devdata, CudaNdarray_HOST_STRIDES(out)[0],
+                CudaNdarray_HOST_STRIDES(out)[1],
+                CudaNdarray_HOST_STRIDES(out)[2],
+                CudaNdarray_HOST_STRIDES(out)[3],
                subsample_rows, subsample_cols);
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts) 
        {
-            if (verbose) fprintf(stderr, "INFO: used 'conv_reference_full' version ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d nkern=%d nstack=%d subsample=%d\n",
+            if (verbose)
+              fprintf(stderr, "INFO: used 'conv_reference_full' version"
+                      " ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
+                      " nkern=%d nstack=%d subsample=%d\n",
                      img_len,img_wid, kern_len, kern_wid,
                      out_len, out_wid, nbatch, nkern, nstack, subsample);
            work_complete = true;
        }
        else
        {
-	  if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads);
+          if (verbose)
-	  if (verbose) fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n",
+            fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                    " shared_size=%i, nb_threads=%i\n",
+                    n_threads, 1, n_blocks, 1, 0, n_threads);
+          if (verbose)
+            fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
+                    " trying next implementation\n",
                    cudaGetErrorString(sts));
-	  PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_full! (%s)",
+          PyErr_Format(PyExc_RuntimeError,
+                       "ERROR: all implementations failed for"
+                       " CudaNdarray_conv_full! (%s)",
                       cudaGetErrorString(sts));
          return -1;
        }
@@ -1110,8 +1378,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
    // Re-use the out object if possible.  If the out object it not used, then its refcount is not modified.
    //  If the out object is re-used then it is returned, and its refcount is incremented by 1.
    //
-    if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
+    if (img->nd != 4)
-    if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
+    {
+      PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required");
+      return NULL;
+    }
+    if (kern->nd != 4)
+    {
+      PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required");
+      return NULL;
+    }
    int out_dim[4];
    out_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
@@ -1145,7 +1421,10 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
    }
    else
    {
-      if (out && verbose) fprintf(stderr, "INFO: Conv is ignoring 'out' argument with wrong structure.\n");
+      if (out && verbose)
+        fprintf(stderr,
+                "INFO: Conv is ignoring 'out' argument with wrong"
+                " structure.\n");
      rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
      //rval might be null
    }
@@ -1162,3 +1441,13 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
    return (PyObject*)rval;
 }
+/*
+  Local Variables:
+  mode:c++
+  c-basic-offset:4
+  c-file-style:"stroustrup"
+  indent-tabs-mode:nil
+  fill-column:79
+  End:
+*/
+// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
--- a/theano/sandbox/cuda/conv_full_kernel.cu
+++ b/theano/sandbox/cuda/conv_full_kernel.cu
@@ -442,3 +442,13 @@ conv_full_load_everything( float* img, float* kern, float* out,
        __syncthreads(); //don't start loading another kernel until we're done here
    }
 }
+/*
+  Local Variables:
+  mode:c++
+  c-basic-offset:4
+  c-file-style:"stroustrup"
+  indent-tabs-mode:nil
+  fill-column:79
+  End:
+*/
+// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
--- a/theano/sandbox/cuda/conv_kernel.cu
+++ b/theano/sandbox/cuda/conv_kernel.cu
@@ -1030,3 +1030,13 @@ conv_reference_full(int nB, int nK, int stacklen,
 }
 #endif // #ifndef CONV_KERNEL_CU
+/*
+  Local Variables:
+  mode:c++
+  c-basic-offset:4
+  c-file-style:"stroustrup"
+  indent-tabs-mode:nil
+  fill-column:79
+  End:
+*/
+// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -4132,7 +4132,6 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
  mode:c++
  c-basic-offset:4
  c-file-style:"stroustrup"
-  c-file-offsets:((innamespace . 0)(inline-open . 0))
  indent-tabs-mode:nil
  fill-column:79
  End:

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -347,7 +347,6 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
  mode:c++
  c-basic-offset:4
  c-file-style:"stroustrup"
-  c-file-offsets:((innamespace . 0)(inline-open . 0))
  indent-tabs-mode:nil
  fill-column:79
  End:

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -24,12 +24,13 @@ if cuda_ndarray.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')
 #needed as the gpu conv don't have a perform implementation.
-if theano.config.mode=='FAST_COMPILE':
+if theano.config.mode == 'FAST_COMPILE':
    theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
 else:
    theano_mode = theano.compile.mode.get_default_mode().including('gpu')
-cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False]*4)
+cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
 def py_conv_valid_numpy(img, kern):
    assert img.shape[1] == kern.shape[1]
@@ -42,19 +43,27 @@ def py_conv_valid_numpy(img, kern):
            for rr in xrange(out.shape[2]):
                for cc in xrange(out.shape[3]):
                    #rr, cc is the upper-left corner of img patches
-                    imgpatch = img[b,:,rr:rr+kern.shape[2], cc:cc+kern.shape[3]]
+                    imgpatch = img[b, :, rr:rr + kern.shape[2],
+                                   cc:cc + kern.shape[3]]
                    #print img.shape, kern.shape, imgpatch.shape, rr+kern.shape[2]-1, rr-1, -1
-                    innerprod = (imgpatch[:,::-1,::-1] * kern[k,:,:,:]).sum()
+                    innerprod = (imgpatch[:, ::-1, ::-1] *
+                                 kern[k, :, :, :]).sum()
                    out[b, k, rr, cc] = innerprod
    return out
 def py_conv_full_numpy(img, kern):
-    # manually pad the img with zeros all around, and then run it through py_conv_valid
+    # manually pad the img with zeros all around, and then run it
-    pad_rows = 2*(kern.shape[2]-1) + img.shape[2]
+    # through py_conv_valid
-    pad_cols = 2*(kern.shape[3]-1) + img.shape[3]
+    pad_rows = 2 * (kern.shape[2] - 1) + img.shape[2]
-    padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols), dtype=img.dtype)
+    pad_cols = 2 * (kern.shape[3] - 1) + img.shape[3]
-    padded_img[:,:,kern.shape[2]-1:kern.shape[2]-1+img.shape[2],kern.shape[3]-1:kern.shape[3]-1+img.shape[3]] = img
+    padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols),
+                             dtype=img.dtype)
+    padded_img[:, :, kern.shape[2] - 1: kern.shape[2] - 1 + img.shape[2],
+                     kern.shape[3] - 1: kern.shape[3] - 1 + img.shape[3]] = img
    return py_conv_valid_numpy(padded_img, kern)
 def py_conv(img, kern, mode, subsample):
    """
    use a scipy or numpy implementation depending is scipy is available.
@@ -62,13 +71,16 @@ def py_conv(img, kern, mode, subsample):
    """
    if imported_scipy_convolve2d:
        return py_conv_scipy(img, kern, mode, subsample)
-    elif mode=='valid':
+    elif mode == 'valid':
-        return py_conv_valid_numpy(img,kern)[:,:,::subsample[0],::subsample[1]]
+        return py_conv_valid_numpy(img, kern)[:, :, ::subsample[0],
-    elif mode=='full':
+                                                      ::subsample[1]]
-        return py_conv_full_numpy(img,kern)[:,:,::subsample[0],::subsample[1]]
+    elif mode == 'full':
+        return py_conv_full_numpy(img, kern)[:, :, ::subsample[0],
+                                                     ::subsample[1]]
    else:
        raise Exception("Can't execute this kernel.")
 def py_conv_scipy(img, kern, mode, subsample):
    assert img.shape[1] == kern.shape[1]
    if mode == 'valid':
@@ -83,17 +95,20 @@ def py_conv_scipy(img, kern, mode, subsample):
    for b in xrange(out.shape[0]):
        for k in xrange(out.shape[1]):
            for s in xrange(img.shape[1]):
-                out[b,k,:,:] += convolve2d(img[b,s,:,:]
+                out[b, k, :, :] += convolve2d(img[b, s, :, :],
-                        , kern[k,s,:,:]
+                                              kern[k, s, :, :],
-                        , mode)
+                                              mode)
-    return out[:,:,::subsample[0], ::subsample[1]]
+    return out[:, :, ::subsample[0], ::subsample[1]]
 def _params_allgood_header():
    print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
-def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
-        kern_stride=(1,1), version=-1, verbose=0, random=True, print_=None,
+def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
-        id=None, rtol=1e-5, atol = 1e-8, nb_iter=0, ones=False, compile_kshp=None):
+                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
+                    print_=None, id=None, rtol=1e-5, atol=1e-8,
+                    nb_iter=0, ones=False, compile_kshp=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
@@ -111,22 +126,27 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
-        npy_img = theano._asarray(numpy.random.rand(*ishape)+1, dtype='float32')
+        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
-        npy_kern = theano._asarray(numpy.random.rand(*kshape)-2, dtype='float32')
+                                  dtype='float32')
+        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
+                                   dtype='float32')
    else:
-        npy_img = theano._asarray(numpy.arange(numpy.prod(ishape)).reshape(ishape), dtype='float32')+1
+        npy_img = theano._asarray(numpy.arange(
-        npy_kern = -(theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape), dtype='float32')+1)
+                numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
+        npy_kern = -(theano._asarray(numpy.arange(
+                    numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)
-    #we take the stride after the transfert as we make c_contiguous data on the GPU.
+    #we take the stride after the transfer as we make c_contiguous
-    if img_stride!=(1,1):
+    #data on the GPU.
-        img=img[:,:,::img_stride[0],::img_stride[1]]
+    if img_stride != (1, 1):
-        npy_img = npy_img[:,:,::img_stride[0],::img_stride[1]]
+        img = img[:, :, ::img_stride[0], ::img_stride[1]]
-    if kern_stride!=(1,1):
+        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
-        kern=kern[:,:,::kern_stride[0],::kern_stride[1]]
+    if kern_stride != (1, 1):
-        npy_kern = npy_kern[:,:,::kern_stride[0],::kern_stride[1]]
+        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
+        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]
    t2 = None
    rval = True
@@ -139,20 +159,23 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
        op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
                                              subsample=subsample,
                                              version=version,
-                                              verbose=verbose, kshp=compile_kshp)(i,k)
+                                              verbose=verbose,
-        f=theano.function([i,k],op, mode=theano_mode)
+                                              kshp=compile_kshp)(i, k)
-        gpuval = f(img,kern)
+        f = theano.function([i, k], op, mode=theano_mode)
+        gpuval = f(img, kern)
        t2 = time.time()
        for i in range(nb_iter):
-            gpuval2 = f(img,kern)
+            gpuval2 = f(img, kern)
-            assert numpy.allclose(numpy.asarray(gpuval),numpy.asarray(gpuval2))
+            assert numpy.allclose(numpy.asarray(gpuval),
-            assert (numpy.asarray(gpuval)==numpy.asarray(gpuval2)).all()
+                                  numpy.asarray(gpuval2))
+            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
-            print >> sys.stdout, "ERROR: shape mismatch", gpuval.shape, cpuval.shape
+            print >> sys.stdout, "ERROR: shape mismatch",
+            print >> sys.stdout, gpuval.shape, cpuval.shape
            rval = False
        if rval:
-            rval = numpy.allclose(cpuval, gpuval, rtol = rtol)
+            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError, e:
        print >> sys.stdout, '_params_allgood Failed allclose', e
@@ -164,49 +187,52 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
        else:
            approx_fp = ishape[0] * kshape[0] * kshape[1] * kshape[2] * kshape[3] * ishape[2] * ishape[3] * 2
        approx_fp /= 1e6
-        cpu_mflops = approx_fp / (t1-t0)
+        cpu_mflops = approx_fp / (t1 - t0)
-        gpu_mflops = approx_fp / (t2-t1)
+        gpu_mflops = approx_fp / (t2 - t1)
-        if verbose>0:
+        if verbose > 0:
-            print >> sys.stdout, '%15s'% str(ishape), '%15s'% str(kshape),
+            print >> sys.stdout, '%15s' % str(ishape), '%15s' % str(kshape),
            print >> sys.stdout, '%12.5f  %7.2f %7.2f %7.1f' % (approx_fp,
-                    cpu_mflops, gpu_mflops,(t1-t0)/(t2-t1))
+                    cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1))
    if not rval:
        print >> sys.stdout, 'test_'+mode+' id='+str(id)+' FAILED for ishape, kshape, mode, subsample, img_stride, kern_stride, version', ishape, kshape, mode, subsample, img_stride, kern_stride, version
-        diff=cpuval-gpuval
+        diff = cpuval - gpuval
-        diffabs=numpy.absolute(diff)
+        diffabs = numpy.absolute(diff)
-        pr_diff=diffabs/numpy.absolute(cpuval)
+        pr_diff = diffabs / numpy.absolute(cpuval)
-        nb_close=(diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
+        nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
        print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs)
        print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size
        print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff)
-    if not rval and print_!=False:
+    if not rval and print_ != False:
-        if npy_img.shape[0]>5:
+        if npy_img.shape[0] > 5:
-            print "img",npy_img[0]
+            print "img", npy_img[0]
-            print "kern",npy_kern[0]
+            print "kern", npy_kern[0]
-            print "gpu",gpuval[0][0]
+            print "gpu", gpuval[0][0]
-            print "cpu",cpuval[0][0]
+            print "cpu", cpuval[0][0]
-            print "diff",diff[0][0]
+            print "diff", diff[0][0]
        else:
-            print "img",npy_img
+            print "img", npy_img
-            print "kern",npy_kern
+            print "kern", npy_kern
-            print "gpu",gpuval
+            print "gpu", gpuval
-            print "cpu",cpuval
+            print "cpu", cpuval
-            print "diff",diff
+            print "diff", diff
    return rval
 def exec_conv(version, shapes, verbose, random, mode,
              print_=None, rtol=1e-5, ones=False):
-    if verbose>0:
+    if verbose > 0:
        _params_allgood_header()
    nb_failed = 0
    nb_tests = 0
-    failed_version=set()
+    failed_version = set()
-    failed_id=[]
+    failed_id = []
-    for ver in version:# I put -1 in case we forget to add version in the test to.
+    # I put -1 in case we forget to add a version in the test too.
-        for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
+    for ver in version:
-            ret=False
+        for id, (ishape, kshape, subshape,
+                 istride, kstride) in enumerate(shapes):
+            ret = False
            try:
                ret = _params_allgood(ishape,
                        kshape,
@@ -222,19 +248,21 @@ def exec_conv(version, shapes, verbose, random, mode,
                        rtol=rtol,
                        ones=ones)
            except Exception, e:
-                print ver, id,(ishape, kshape, subshape, istride, kstride)
+                print ver, id, (ishape, kshape, subshape, istride, kstride)
                print e
                pass
            if not ret:
                failed_version.add(ver)
                failed_id.append(id)
-                nb_failed+=1
+                nb_failed += 1
-            nb_tests+=1
+            nb_tests += 1
-    if nb_failed>0:
+    if nb_failed > 0:
-        print "nb_failed",nb_failed,"on",nb_tests, "failed_version",failed_version, "failed_id",failed_id
+        print "nb_failed", nb_failed, "on", nb_tests,
-        assert nb_failed==0, nb_failed
+        print "failed_version", failed_version, "failed_id", failed_id
+        assert nb_failed == 0, nb_failed
    else:
-        print 'Executed',nb_tests,'different shapes'
+        print 'Executed', nb_tests, 'different shapes'
 def get_basic_shapes():
    return [
@@ -249,8 +277,12 @@ def get_basic_shapes():
            , ((1, 1, 4, 4), (1, 1, 3, 2), (1,1), (1,1), (1,1))
            , ((1, 1, 4, 4), (1, 1, 2, 3), (1,1), (1,1), (1,1))]
-def get_shapes(imshp=(1,1), kshp=(1,1), subsample=(1,1), img_stride=(1,1), kern_stride=(1,1)):
-    """ all possible case if we one or more of stack size, batch size, nkern. We use the gived image shape, kernel shape and subsmaple shape."""
+def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
+               img_stride=(1, 1), kern_stride=(1, 1)):
+    """ all possible case if we one or more of stack size, batch size,
+    nkern. We use the given image shape, kernel shape and subsample
+    shape."""
    return [  ((1, 2)+imshp, (1, 2)+kshp,subsample, img_stride, kern_stride)#stack only
            , ((3, 1)+imshp, (1, 1)+kshp,subsample, img_stride, kern_stride)#batch only
            , ((1, 1)+imshp, (2, 1)+kshp,subsample, img_stride, kern_stride)#nkern only
@@ -260,7 +292,10 @@ def get_shapes(imshp=(1,1), kshp=(1,1), subsample=(1,1), img_stride=(1,1), kern_
            , ((2, 2)+imshp, (2, 2)+kshp,subsample, img_stride, kern_stride)#batch, nkern and stack
            , ((3, 2)+imshp, (4, 2)+kshp,subsample, img_stride, kern_stride)#batch, nkern and stack
            ]
-def get_shapes2(scales_img=(1,1), scales_kern=(1,1), subsample=(1,1), img_stride=(1,1), kern_stride=(1,1)):
+def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
+                img_stride=(1, 1), kern_stride=(1, 1)):
    #basic test of stack, batch and nkern paramter
    shapes =get_shapes((1*scales_img[0],1*scales_img[1]),
                       (1*scales_kern[0],1*scales_kern[1]),subsample, img_stride, kern_stride)
@@ -284,19 +319,20 @@ def get_shapes2(scales_img=(1,1), scales_kern=(1,1), subsample=(1,1), img_stride
                        (2*scales_kern[0],3*scales_kern[1]),subsample, img_stride, kern_stride)
    return shapes
 def get_valid_shapes():
    #          img shape,     kern shape, subsample shape
    shapes = get_basic_shapes()
-    shapes +=get_shapes2()
+    shapes += get_shapes2()
    #test image stride
-    shapes += get_shapes2(scales_img=(2,2),img_stride=(1,2))
+    shapes += get_shapes2(scales_img=(2, 2), img_stride=(1, 2))
-    shapes += get_shapes2(scales_img=(2,2),img_stride=(2,1))
+    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 1))
-    shapes += get_shapes2(scales_img=(2,2),img_stride=(2,2))
+    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
-    shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
+    shapes += get_shapes2(scales_img=(2, 2), img_stride=(-1, -1))
-    shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
+    shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
    #test subsample done in a separate fct
@@ -333,161 +369,192 @@ def get_valid_shapes():
            ]
    return shapes
 def test_valid_0_2():
    shapes = get_valid_shapes()
-    version=[0,2]
+    version = [0, 2]
-    verbose=0
+    verbose = 0
    random = True
    print_ = False
    ones = False
    if ones:
        random = False
-    shapes2=[]
+    shapes2 = []
-    for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
+    for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
-        oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1]))
+        oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
-        if oshape[3]> 512:
+                                                  numpy.asarray(kshape[2:]) +
+                                                  numpy.asarray([1, 1]))
+        if oshape[3] > 512:
            continue
-        if ishape[1]>1:
+        if ishape[1] > 1:
            continue
-        if (numpy.prod(ishape[2:])+numpy.prod(kshape[2:]))*4>(16*1024-150):
+        if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
+            (16 * 1024 - 150)):
            continue
-        if subshape==(1,1):
+        if subshape == (1, 1):
            shapes2.append((ishape, kshape, subshape, istride, kstride))
    shapes = shapes2
-    exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5)
+    exec_conv(version, shapes, verbose, random, 'valid',
+              print_=print_, ones=ones, rtol=1.1e-5)
 def test_valid_1_3_11_12():
    shapes = get_valid_shapes()
-    version=[1,3,11,12]
+    version = [1, 3, 11, 12]
-    verbose=0
+    verbose = 0
    random = True
    print_ = False
    ones = False
    if ones:
        random = False
-    shapes2=[]
+    shapes2 = []
-    for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
+    for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
-        oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1]))
+        oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
-        if oshape[3]> 512:
+                                                  numpy.asarray(kshape[2:]) +
+                                                  numpy.asarray([1, 1]))
+        if oshape[3] > 512:
            continue
-        if (numpy.prod(ishape[2:])+numpy.prod(kshape[2:]))*4>(16*1024-150):
+        if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
+            (16 * 1024 - 150)):
            continue
-        if subshape==(1,1):
+        if subshape == (1, 1):
            shapes2.append((ishape, kshape, subshape, istride, kstride))
    shapes = shapes2
-    exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5)
+    exec_conv(version, shapes, verbose, random, 'valid',
+              print_=print_, ones=ones, rtol=1.1e-5)
 def test_valid_4():
    shapes = get_valid_shapes()
-    version=[4]
+    version = [4]
-    verbose=0
+    verbose = 0
    random = True
    print_ = False
    ones = False
    if ones:
        random = False
-    shapes2=[]
+    shapes2 = []
-    for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
+    for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
-        oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1]))
+        oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
-        if oshape[3]> 512:
+                                                  numpy.asarray(kshape[2:]) +
+                                                  numpy.asarray([1, 1]))
+        if oshape[3] > 512:
            continue
-        if ishape[1]>1:
+        if ishape[1] > 1:
            continue
-        if (kshape[2]*ishape[3]*4+numpy.prod(kshape[2:])*4)>(16*1024-150):
+        if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
+            (16 * 1024 - 150)):
            continue
-        if subshape==(1,1):
+        if subshape == (1, 1):
            shapes2.append((ishape, kshape, subshape, istride, kstride))
    shapes = shapes2
-    exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5)
+    exec_conv(version, shapes, verbose, random, 'valid',
+              print_=print_, ones=ones, rtol=1.1e-5)
 def test_valid_5():
    shapes = get_valid_shapes()
-    version=[5]
+    version = [5]
-    verbose=0
+    verbose = 0
    random = True
    print_ = False
    ones = False
    if ones:
        random = False
-    shapes2=[]
+    shapes2 = []
    print len(shapes)
-    for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
+    for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
-        oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1]))
+        oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
-        if oshape[3]> 512:
+                                                  numpy.asarray(kshape[2:]) +
+                                                  numpy.asarray([1, 1]))
+        if oshape[3] > 512:
            continue
-        if (kshape[2]*ishape[3]*4+numpy.prod(kshape[2:])*4)>(16*1024-150):
+        if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
+            (16 * 1024 - 150)):
            continue
-        if subshape==(1,1):
+        if subshape == (1, 1):
            shapes2.append((ishape, kshape, subshape, istride, kstride))
    shapes = shapes2
    print len(shapes2)
-    exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5)
+    exec_conv(version, shapes, verbose, random, 'valid',
+              print_=print_, ones=ones, rtol=1.1e-5)
 def test_valid_7_8_13():
    shapes = get_valid_shapes()
    # This is to test the "new" lower shared memory usage.
-    shapes.append(((10,30,60,60),(20,30,40,40), (1,1), (1,1), (1,1)))
+    shapes.append(((10, 30, 60, 60), (20, 30, 40, 40),
-    version=[7,8,13]
+                   (1, 1), (1, 1), (1, 1)))
-    verbose=0
+    version = [7, 8, 13]
+    verbose = 0
    random = True
    print_ = False
    ones = False
    if ones:
        random = False
-    shapes2=[]
+    shapes2 = []
    print len(shapes)
-    for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
+    for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
-        oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1]))
+        oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
-        if oshape[2]*oshape[3]>512:
+                                                  numpy.asarray(kshape[2:]) +
+                                                  numpy.asarray([1, 1]))
+        if oshape[2] * oshape[3] > 512:
            continue
-        if max(numpy.prod(ishape[2:])*4+2*kshape[3]*4, oshape[2]*oshape[3]*4*2)>(16*1024-150):
+        if max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4,
+               oshape[2] * oshape[3] * 4 * 2) > (16 * 1024 - 150):
            continue
-        if subshape==(1,1):
+        if subshape == (1, 1):
            shapes2.append((ishape, kshape, subshape, istride, kstride))
    shapes = shapes2
    print len(shapes2)
-    exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5)
+    exec_conv(version, shapes, verbose, random, 'valid',
+              print_=print_, ones=ones, rtol=1.1e-5)
 def test_valid_9_10():
    shapes = get_valid_shapes()
-    version=[9,10]
+    version = [9, 10]
-    verbose=0
+    verbose = 0
    random = True
    print_ = False
    ones = False
    if ones:
        random = False
-    shapes2=[]
+    shapes2 = []
    print len(shapes)
-    for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
+    for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
-        oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1]))
+        oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
-        if oshape[3]> 512:
+                                                  numpy.asarray(kshape[2:]) +
+                                                  numpy.asarray([1, 1]))
+        if oshape[3] > 512:
            continue
-        if (kshape[3]*4+ishape[3])>(16*1024-150):
+        if (kshape[3] * 4 + ishape[3]) > (16 * 1024 - 150):
            continue
-        if subshape==(1,1):
+        if subshape == (1, 1):
            shapes2.append((ishape, kshape, subshape, istride, kstride))
    shapes = shapes2
    print len(shapes2)
-    exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5)
+    exec_conv(version, shapes, verbose, random, 'valid',
+              print_=print_, ones=ones, rtol=1.1e-5)
 def test_valid():
    shapes = get_valid_shapes()
@@ -495,8 +562,8 @@ def test_valid():
    #shapes=shapes[400:426]
    # I put -1 in case we forget to add version in the test to.
    # I put -2 to test the reference version.
-    version=[-2,-1,6]
+    version = [-2, -1, 6]
-    verbose=0
+    verbose = 0
 #    version=[1]
    random = True
@@ -505,17 +572,19 @@ def test_valid():
    if ones:
        random = False
-    exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5)
+    exec_conv(version, shapes, verbose, random, 'valid',
+              print_=print_, ones=ones, rtol=1.1e-5)
 def test_full():
    shapes = get_basic_shapes()
-    shapes +=get_shapes2()
+    shapes += get_shapes2()
    #test image stride
-    shapes += get_shapes2(scales_img=(2,2),img_stride=(1,2))
+    shapes += get_shapes2(scales_img=(2, 2), img_stride=(1, 2))
-    shapes += get_shapes2(scales_img=(2,2),img_stride=(2,1))
+    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 1))
-    shapes += get_shapes2(scales_img=(2,2),img_stride=(2,2))
+    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
-    shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
+    shapes += get_shapes2(scales_img=(2, 2), img_stride=(-1, -1))
-    shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
+    shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
    #test subsample done in a separate fct
@@ -557,13 +626,14 @@ def test_full():
            ]
 #    shapes=shapes[:277]
-    version=[-2,-1,0,1,2,3,4,5]
+    version = [-2, -1, 0, 1, 2, 3, 4, 5]
-    verbose=0
+    verbose = 0
 #    version=[4]
-    random=True
+    random = True
    exec_conv(version, shapes, verbose, random, 'full')
 def test_subsample():
    # implement when
    shapes = [
@@ -573,14 +643,14 @@ def test_subsample():
            , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1,1), (1,1))
            , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1,1), (1,1))
            ]
-    shapes += get_shapes2(scales_img=(2,2),subsample=(1,1))
+    shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 1))
-    shapes += get_shapes2(scales_img=(2,2),subsample=(1,2))
+    shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 2))
-    shapes += get_shapes2(scales_img=(2,2),subsample=(2,1))
+    shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 1))
-    shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
+    shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 2))
 #We put only the version that implement the subsample to make the test faster.
-    version_valid = [-2,-1,1,3,11,12]
+    version_valid = [-2, -1, 1, 3, 11, 12]
-    version_full = [-2,-1]
+    version_full = [-2, -1]
    verbose = 0
    random = True
    print_ = False
@@ -588,8 +658,10 @@ def test_subsample():
    if ones:
        random = False
-    exec_conv(version_valid, shapes, verbose, random, 'valid', print_=print_, ones=ones)
+    exec_conv(version_valid, shapes, verbose, random, 'valid',
-    exec_conv(version_full, shapes, verbose, random, 'full', print_=print_, ones=ones)
+              print_=print_, ones=ones)
+    exec_conv(version_full, shapes, verbose, random, 'full',
+              print_=print_, ones=ones)
 ## See #616
 #def test_logical_shapes():
@@ -614,7 +686,8 @@ class TestConv2DGPU(unittest.TestCase):
        theano_mode_orig = theano_mode
        try:
            if theano.config.mode in ['DebugMode', 'DEBUG_MODE']:
-                theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
+                theano_mode = theano.compile.mode.get_mode(
+                    'FAST_RUN').including('gpu')
                for mode in ['valid', 'full']:
                    for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
                                   ((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
@@ -622,16 +695,21 @@ class TestConv2DGPU(unittest.TestCase):
                                   # We use only the number of columns.
                                   ]:
-                        self.assertRaises(ValueError, _params_allgood, shapes[0], shapes[1],
+                        self.assertRaises(ValueError, _params_allgood,
-                                          verbose=verbose, random=random, mode=mode,
+                                          shapes[0], shapes[1],
-                                          print_=print_, ones=ones, compile_kshp=shapes[2])
+                                          verbose=verbose, random=random,
+                                          mode=mode,
+                                          print_=print_, ones=ones,
+                                          compile_kshp=shapes[2])
        finally:
            theano_mode = theano_mode_orig
 def _test_dummy():
    ishape = (1, 1, 5, 5)
    kshape = (1, 1, 3, 3)
    mode = 'valid'
-    subsample = (1,1)
+    subsample = (1, 1)
    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
@@ -696,14 +774,14 @@ def benchmark():
         ,((2, 30,116,116), (20, 30, 9,9), (1,1), (1,1), (1,1))#full conv_reference_full
            ]
 #    shapes_valid=shapes_valid[-1:]
 #    shapes_full=shapes_full[-1:]
-    version=[-1]
+    version = [-1]
-    verbose=1
+    verbose = 1
-    random=True
+    random = True
-    exec_conv(version, shapes_valid, verbose, random, 'valid', print_=None, rtol=1e-3)
+    exec_conv(version, shapes_valid, verbose, random, 'valid',
+              print_=None, rtol=1e-3)
    exec_conv(version, shapes_full, verbose, random, 'full')
@@ -719,5 +797,3 @@ def test_stack_rows_segfault_070312():
            nkern=1, bsize=1)
    f = theano.function([], [], updates={out: op(img, kern)})
    f()