提交 2d1cdeaa authored 作者: James Bergstra's avatar James Bergstra

GpuConv - new refcounting rules, and rewrite of valid version=13 aka conv_patch_stack_reduce

- fixes errors in conv_patch_stack_reduce when the entire kernel doesn't fit into shared memory - lowers the shared memory requirement of conv_patch_stack_reduce
上级 616089ff
......@@ -363,9 +363,10 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self):
return (0,8)
return (0,9) # raise this whenever modifying any of the support_code_files
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read()
......@@ -405,8 +406,7 @@ class GpuConv(Op):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s,
mode, dx, dy, version, verbose);
if(%(out)s && %(out)s==out2)
Py_DECREF(out2);//CudaNdarray_Conv incremented the count to out
Py_XDECREF(%(out)s);
%(out)s = out2;
"""%sub
......
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
enum { ConvMode_FULL, ConvMode_VALID };
PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
......@@ -425,7 +427,10 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
}
}
//version 8 is the same but we force the split. The split is needed in case we have too many threads. This happens frequently if the kernel length is big. Big kernels are frequent in the gradient.
//version 8 is the same but we force the split.
// The split is needed in case we have too many threads.
// This happens frequently if the kernel length is big.
// Big kernels are frequent in the gradient.
//version 8 needs a minimum kernel length as we force the split.
//version 8 is needed to test this kernel template parameter more easily.
//version 13 loads only 1 kernel row at a time.
......@@ -434,8 +439,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
out_size<512 &&//Maximum of 512 threads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 needs a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 needs a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //there is only 16k of shared memory, and if we can't fit the output at least twice in shared mem, we won't have any reduce!
//version 13 needs a minimal kernel length as big as the split.
(version!=13||kern_len>1) &&
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
......@@ -443,83 +448,99 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
while(ceil_intdiv(kern_len,nb_split)>64)nb_split++;//devices of compute capability 1.3 have a max of 64 threads in z
while(out_size*ceil_intdiv(kern_len,nb_split)>512)nb_split++;
int shared_size=(img_size + kern_size + out_size*kern_len)*sizeof(float);
if(shared_size>=shared_avail){
//if we can't fit the kernel in shared memory, we can split it more.
full_kern=false;
assert((img_size+kern_wid*2+out_size*2)*sizeof(float)<=shared_avail);
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
while(shared_size>=shared_avail || ceil_intdiv(kern_len,nb_split)>64){
nb_split++;
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
}
}
int thread_z=ceil_intdiv(kern_len,nb_split);
assert(thread_z>0);//should not happen, but in case...
assert(shared_size<=shared_avail);
if(!full_kern)
assert(thread_z!=kern_len);
//thread_z is going to be ceil_intdiv(kern_len, nb_split)
// we need enough splits so that
// a) thread_z fits in the 'z' threadIdx (i.e. is less than 64)
// b) thread_z * out_len * out_wid fits in the thread count
// c) the kernel doesn't need too much shared memory
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
// constraint (a)
// devices of compute capability 1.3 have a max of 64 threads in z
while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
// constraint (b)
// (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We are always in split mode when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) printf("INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n",
kern_flipped,ccontig,nb_split,full_kern);
if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
// tentative estimates (prior to constraint (c))
int thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split);
shared_size=sizeof(float)*std::max(
img_size + kern_wid*thread_z,
out_size*thread_z);
}
else
if (nb_split <= kern_len)
{
if (verbose) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z);
if (verbose) printf("INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We are always in split mode when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) fprintf(stderr, "INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n",
kern_flipped,ccontig,nb_split,full_kern);
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
}
else
{
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z);
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
} // else no good nb_splits was found
}
if (1 && (version==6||version==-1) &&
......@@ -647,6 +668,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
return -1;
}
}
assert (work_complete);
return 0;
//PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
......@@ -1043,6 +1065,9 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
const int subsample_rows, const int subsample_cols,
const int version, const int verbose)
{
// Re-use the out object if possible. If the out object is not used, then its refcount is not modified.
// If the out object is re-used, then it is returned and its refcount is incremented by 1.
//
if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
......@@ -1062,33 +1087,36 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
}
out_dim[2] = ceil_intdiv(logical_rows, subsample_rows);
out_dim[3] = ceil_intdiv(logical_cols, subsample_cols);
CudaNdarray * rval = out;
if(!(out && out->nd==4 && CudaNdarray_is_c_contiguous(out)
CudaNdarray * rval = NULL;
if ( out
&& out->nd==4
&& CudaNdarray_is_c_contiguous(out)
&& CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
&& CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
&& CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
&& CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])){
if (out)
{
fprintf(stderr, "Warning: Conv is ignoring 'out' argument with wrong structure.\n");
Py_DECREF(out);
}
&& CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])
{
rval = out;
Py_INCREF(rval);
}
else
{
if (verbose) fprintf(stderr, "INFO: Conv is ignoring 'out' argument with wrong structure.\n");
rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
//rval might be null
}
if ((rval==NULL)
if ((rval==NULL)
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
)
{
// if rval is something we just allocated,
// and there was a problem, then we have to free it.
if (rval != out) Py_XDECREF(rval);
Py_XDECREF(rval);
return NULL;
}
//TODO: Get refcount story clearer!
// This function does a weird thing as work-around with Conv_VARARGS
if (rval == out) Py_INCREF(rval);
return (PyObject*)rval;
}
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
//implement the valid convolution only
/*
......@@ -38,6 +40,8 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
#define BS(i, j) Bs[i][j]
#endif
*/
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)<(b)?(a):(b))
const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
#define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float))
......@@ -45,8 +49,9 @@ const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the tr
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64)
{
if(flipped)
//TODO: very slow on devices before compute capability 1.3. Make access to kern sequential and access to d_kern flipped.
if(flipped)
//TODO: very slow on devices before compute capability 1.3.
// Make access to kern sequential and access to d_kern flipped.
for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[N - 1 - i];
//dst[N-1-i]=src[i];
......@@ -88,10 +93,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
const bool flipped=false, const bool c_contiguous=true){
if(flipped && ! c_contiguous){
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
dst[nb_row*nb_col-1-i]=src[i/nb_col*stride_row+i%nb_col*stride_col];
dst[nb_row*nb_col-1-i]=src[(i/nb_col)*stride_row+(i%nb_col)*stride_col];
}else if(c_contiguous){
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
}else if(flipped){//c_contiguous==true
//TODO: very slow on devices before compute capability 1.3. Make access to kern sequential and access to d_kern flipped.
int N=nb_col*nb_row;
......@@ -440,10 +444,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
//int __shared__ out_len, out_wid, nb_thread_id;
//out_len = img_len - kern_len + 1;
//out_wid = img_wid - kern_wid + 1;
const int out_wid = blockDim.x;
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
......@@ -458,9 +464,16 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int out_row = ty;//output row
const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID];
float * d_reduce=&s_data[img_len*img_wid+(preload_full_kern?kern_len:blockDim.z)*kern_wid];
//d_img size [IMAGE_LEN * IMAGE_WID];
float * d_img=&s_data[0];
//d_kern size[(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]
float * d_kern=&s_data[img_len * img_wid];
//d_reduce size [n_threads]
//N.B. this overlaps with d_img and d_kern!
float * d_reduce=&s_data[0];
float sum = 0.0f;
kern+=kern_stride_nkern*blockIdx.y;//the good nkern
......@@ -471,30 +484,31 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
__syncthreads();
load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
img_stride_col, img_stride_row, false, c_contiguous);
if(!(split && ! preload_full_kern))
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
if(split && ! preload_full_kern){
for(int first_row=0, row=tz;first_row<kern_len;row+=blockDim.z, first_row+=blockDim.z){
int idx3;
//TODO: test/check for flipped_kern
if(flipped_kern)
idx3=(kern_len-(first_row)-blockDim.z);//the current last row flipped
else
idx3=first_row;
for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
//N.B. - Jan 30, 2011: with CUDA 3.2 I found that without the explicit cast to
// (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
// details, but that seemed really weird. A tricky bug to find, too.
int idx3 = flipped_kern
? max((kern_len - (int)blockDim.z - first_row),0)
: first_row;
int len3 = min(blockDim.z, kern_len - first_row);
__syncthreads();
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, blockDim.z,
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
const float* idx_kern=&d_kern[tz*kern_stride_row];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
const float* idx_kern=&d_kern[tz*kern_wid];
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0;
if(row<kern_len)
if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2;
}
}else if(split){
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
......@@ -504,18 +518,21 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int row = tz;//The row of the kernel.
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
//reduce
//reduce; no sync needed here because the previous loop ends with a sync
d_reduce[thread_id]=sum;
__syncthreads();
if(thread_id<out_len*out_wid){
sum=0;
for(int i=0;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*blockDim.x*blockDim.y];
if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
//sum=0;
for(int i=1;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*out_wid*out_len];
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论