提交 42927b2b authored 作者: James Bergstra's avatar James Bergstra

merge

......@@ -441,7 +441,7 @@ class GpuSum(Op):
j = 0
for i in xrange(nd_in):
if not self.reduce_mask[i]:
if not self.reduce_mask[i]:
print >> sio, 'new_dims[%(j)s] = CudaNdarray_HOST_DIMS(%(x)s)[%(i)s];' % locals()
j += 1
......@@ -453,11 +453,10 @@ class GpuSum(Op):
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
%(fail)s;
}
}
""" %locals()
# \begin bracket the reduction in a check that there is actually work to do
# \begin bracket the reduction in a check that there is actually work to do
print >> sio, """
if (CudaNdarray_SIZE(%(z)s))
{
......@@ -472,12 +471,10 @@ class GpuSum(Op):
#TODO: check if we are ccontiguous when we un-dimshuffle
#TODO: if only some dims are ccontiguous, call version with less dims.
print >> sio, 'if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals()
self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print >> sio, "}else{"
getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
print >> sio, "}"
else:
getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
......@@ -826,8 +823,16 @@ class GpuSum(Op):
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1,CudaNdarray_HOST_DIMS(%(x)s)[1]);
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
dim3 n_blocks(1,
std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_BLOCKS));
if (verbose) {
fprintf(stderr,
"running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x,
n_blocks.y);
}
assert( CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]);
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1,
......@@ -843,7 +848,7 @@ class GpuSum(Op):
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_010_%(name)s",
......@@ -1175,9 +1180,7 @@ class GpuSum(Op):
""" %locals()
def c_code_cache_version(self):
#return ()
return (19,)
return (20,)
def c_support_code_apply(self, node, nodename):
sio = StringIO.StringIO()
......
......@@ -363,9 +363,10 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self):
return (0,8)
return (0,9) # raise this whenever modifying any of the support_code_files
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read()
......@@ -405,8 +406,7 @@ class GpuConv(Op):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s,
mode, dx, dy, version, verbose);
if(%(out)s && %(out)s==out2)
Py_DECREF(out2);//CudaNdarray_Conv incremented the count to out
Py_XDECREF(%(out)s);
%(out)s = out2;
"""%sub
......
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
enum { ConvMode_FULL, ConvMode_VALID };
PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
......@@ -5,7 +7,7 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray *
* version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation
*/
int
int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0)
......@@ -38,8 +40,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
assert (CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1]);
// we now search through a few implementations until one applies to our arguments.
//TODO: make separate version as if all fill this is slower.
//TODO: make separate version as if all fill this is slower.
//TODO: Make a switch with power of 2 max size as template
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
......@@ -99,11 +101,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose>1)
{
printf("INFO: Running conv_valid version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID);
printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
fprintf(stderr, "INFO: Running conv_valid version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID);
fprintf(stderr, "INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]);
printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
fprintf(stderr, "INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]);
}
......@@ -149,17 +151,17 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
img_len, img_wid, kern_len, kern_wid, nkern, nstack);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split);
if (verbose) fprintf(stderr, "INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split);
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split);
if (verbose) printf("INFO: impl 'conv_patch' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split);
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
}
if (!subsample &&
out_contiguous &&
......@@ -218,31 +220,33 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
if (cudaSuccess == sts)
{
if (verbose>1)
printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel);
if (verbose) printf("INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i\n",
if (verbose) fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i\n",
nb_split,preload_full_kernel);
work_complete = true;
}
else
{
if (verbose)
printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel);
if (verbose) printf("INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
}
if (!subsample && out_contiguous &&
......@@ -277,17 +281,17 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose) printf("INFO: used 'conv_rows' version\n");
if (verbose) fprintf(stderr, "INFO: used 'conv_rows' version\n");
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_rows' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
}
if (!subsample && out_contiguous &&
(version==5||version==-1) &&
......@@ -306,7 +310,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
......@@ -332,18 +336,18 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\n");
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
}
if (!subsample && out_contiguous &&
......@@ -410,20 +414,23 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n",
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
}
//version 8 is the same but we force the split. The split is need in case we have too much threads. This happen frequently if the kernel length is big. Big kernel is frequent in the gradient.
//version 8 is the same but we force the split.
// The split is need in case we have too much threads.
// This happen frequently if the kernel length is big.
// Big kernel is frequent in the gradient.
//version 8 need a minimum of kernel length as we force the split.
//version 8 is needed to test more easily this kernel template parameter.
//version 13 load only 1 kernel row at a time.
......@@ -432,8 +439,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
out_size<512 &&//Maximum of 512 theads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 need a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
//version 13 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) &&
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
......@@ -441,83 +448,99 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
while(ceil_intdiv(kern_len,nb_split)>64)nb_split++;//device 1.3 have a max of 64 thread in z
while(out_size*ceil_intdiv(kern_len,nb_split)>512)nb_split++;
int shared_size=(img_size + kern_size + out_size*kern_len)*sizeof(float);
if(shared_size>=shared_avail){
//if we can't fit the kernel in shared memory, we can split it more.
full_kern=false;
assert((img_size+kern_wid*2+out_size*2)*sizeof(float)<=shared_avail);
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
while(shared_size>=shared_avail || ceil_intdiv(kern_len,nb_split)>64){
nb_split++;
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
}
}
int thread_z=ceil_intdiv(kern_len,nb_split);
assert(thread_z>0);//should not happen, but in case...
assert(shared_size<=shared_avail);
if(!full_kern)
assert(thread_z!=kern_len);
//thread_z is going to be ceil_intdiv(kern_len, nb_split)
// we need enough splits so that
// a) thread_z fits in the 'z' threadIdx (i.e. is less than 64)
// b) thread_z * out_len * out_wid fits in the thread count
// c) the kernel doesn't need too much shared memory
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
// constraint (a)
// device 1.3 have a max of 64 thread in z
while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
// constraint (b)
// (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) printf("INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n",
kern_flipped,ccontig,nb_split,full_kern);
if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
// tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split);
shared_size=sizeof(float)*std::max(
img_size + kern_wid*thread_z,
out_size*thread_z);
}
else
if (nb_split <= kern_len)
{
if (verbose) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z);
if (verbose) printf("INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) fprintf(stderr, "INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n",
kern_flipped,ccontig,nb_split,full_kern);
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
}
else
{
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z);
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
} // else no good nb_splits was found
}
if (1 && (version==6||version==-1) &&
......@@ -589,12 +612,12 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose) printf("INFO: used 'conv_valid_row_reduce' version\n");
if (verbose) fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, shared_size=%i, nb_threads=%i\n", n_threads.x, n_threads.y, n_blocks, n_reduce_buf, n_threads.x * n_threads.y);
if (verbose) printf("INFO: impl 'conv_valid_row_reduce' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, shared_size=%i, nb_threads=%i\n", n_threads.x, n_threads.y, n_blocks, n_reduce_buf, n_threads.x * n_threads.y);
if (verbose) fprintf(stderr, "INFO: impl 'conv_valid_row_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
......@@ -604,23 +627,23 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (0)
if (1)
{
if (verbose) printf("INFO: launching conv_reference_valid\n");
if (verbose) printf(" img : %i %i %i %i %p %i %i %i %i\n",
if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose) fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose) printf(" kern: %i %i %i %i %p %i %i %i %i\n",
if (verbose) fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
nkern, nstack, kern_len, kern_wid,
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3]
);
if (verbose) printf(" out : %i %i %i %i %p %i %i %i %i\n",
if (verbose) fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose) printf(" launch params: %i %i %i\n", outsize, n_blocks, n_threads);
if (verbose) fprintf(stderr, " launch params: %i %i %i\n", outsize, n_blocks, n_threads);
}
conv_reference_valid<<<n_blocks, n_threads>>>( nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid,
......@@ -633,10 +656,10 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose) printf("INFO: used 'conv_reference_valid' version\n");
if (verbose) fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
}
else
{
......@@ -645,13 +668,14 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
return -1;
}
}
assert (work_complete);
return 0;
//PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
//return -1;
}
int
int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
{
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
......@@ -770,7 +794,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
if(kern_len==1 && version==5){
//version 5 don't support kern_len==1 as 1%0 return -1.
version=-1;
if(verbose)printf("WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
}
if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
......@@ -833,16 +857,16 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
if (cudaSuccess == sts)
{
if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
if (verbose) printf("INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false"));
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false"));
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
if (verbose) printf("INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n",
version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts));
}
......@@ -872,13 +896,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_patch' version\n");
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
......@@ -922,13 +946,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_load_everything' version\n");
if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_load_everything' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_load_everything' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
......@@ -968,41 +992,41 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_patch_stack' version\n");
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (1 && !work_complete) //conv_reference_full
{
if(verbose>1)printf("INFO: will start conv_reference_full\n");
if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (0)
{
if (verbose) printf("INFO: launching conv_reference_valid\n");
if (verbose) printf(" img : %i %i %i %i %p %i %i %i %i\n",
if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose) fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose) printf(" kern: %i %i %i %i %p %i %i %i %i\n",
if (verbose) fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3]
);
if (verbose) printf(" out : %i %i %i %i %p %i %i %i %i\n",
if (verbose) fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose) printf(" launch params: %i %i %i\n", outsize, n_blocks, n_threads);
if (verbose) printf(" subsample params: %i %i\n", subsample_rows, subsample_cols);
if (verbose) fprintf(stderr, " launch params: %i %i %i\n", outsize, n_blocks, n_threads);
if (verbose) fprintf(stderr, " subsample params: %i %i\n", subsample_rows, subsample_cols);
}
conv_reference_full<<<n_blocks, n_threads>>>(CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
......@@ -1017,15 +1041,15 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_reference_full' version ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d nkern=%d nstack=%d subsample=%d\n",
if (verbose) fprintf(stderr, "INFO: used 'conv_reference_full' version ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d nkern=%d nstack=%d subsample=%d\n",
img_len,img_wid, kern_len, kern_wid,
out_len, out_wid, nbatch, nkern, nstack, subsample);
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads);
if (verbose) printf("INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads);
if (verbose) fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts));
......@@ -1035,12 +1059,15 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
return 0;
}
PyObject *
PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
CudaNdarray * out, const int mode,
const int subsample_rows, const int subsample_cols,
const int version, const int verbose)
{
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1.
//
if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
......@@ -1060,33 +1087,36 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
}
out_dim[2] = ceil_intdiv(logical_rows, subsample_rows);
out_dim[3] = ceil_intdiv(logical_cols, subsample_cols);
CudaNdarray * rval = out;
if(!(out && out->nd==4 && CudaNdarray_is_c_contiguous(out)
CudaNdarray * rval = NULL;
if ( out
&& out->nd==4
&& CudaNdarray_is_c_contiguous(out)
&& CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
&& CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
&& CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
&& CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])){
if (out)
{
fprintf(stderr, "Warning: Conv is ignoring 'out' argument with wrong structure.\n");
Py_DECREF(out);
}
&& CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])
{
rval = out;
Py_INCREF(rval);
}
else
{
if (verbose) fprintf(stderr, "INFO: Conv is ignoring 'out' argument with wrong structure.\n");
rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
//rval might be null
}
if ((rval==NULL)
if ((rval==NULL)
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
)
{
// if rval is something we just allocated,
// and there was a problem, then we have to free it.
if (rval != out) Py_XDECREF(rval);
Py_XDECREF(rval);
return NULL;
}
//TODO: Get refcount story clearer!
// This function does a weird thing as work-around with Conv_VARARGS
if (rval == out) Py_INCREF(rval);
return (PyObject*)rval;
}
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
//implement the valid convolution only
/*
......@@ -38,6 +40,8 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
#define BS(i, j) Bs[i][j]
#endif
*/
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)<(b)?(a):(b))
const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
#define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float))
......@@ -45,8 +49,9 @@ const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the tr
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64)
{
if(flipped)
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
if(flipped)
//TODO very slow on device before 1.3.
// make access to kern sequential and access to d_kern flipped.
for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[N - 1 - i];
//dst[N-1-i]=src[i];
......@@ -88,10 +93,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
const bool flipped=false, const bool c_contiguous=true){
if(flipped && ! c_contiguous){
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
dst[nb_row*nb_col-1-i]=src[i/nb_col*stride_row+i%nb_col*stride_col];
dst[nb_row*nb_col-1-i]=src[(i/nb_col)*stride_row+(i%nb_col)*stride_col];
}else if(c_contiguous){
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
}else if(flipped){//c_contiguous==true
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
int N=nb_col*nb_row;
......@@ -440,10 +444,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
//int __shared__ out_len, out_wid, nb_thread_id;
//out_len = img_len - kern_len + 1;
//out_wid = img_wid - kern_wid + 1;
const int out_wid = blockDim.x;
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
......@@ -458,9 +464,16 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int out_row = ty;//output row
const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID];
float * d_reduce=&s_data[img_len*img_wid+(preload_full_kern?kern_len:blockDim.z)*kern_wid];
//d_img size [IMAGE_LEN * IMAGE_WID];
float * d_img=&s_data[0];
//d_kern size[(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]
float * d_kern=&s_data[img_len * img_wid];
//d_reduce size [n_threads]
//N.B. this overlaps with d_img and d_kern!
float * d_reduce=&s_data[0];
float sum = 0.0f;
kern+=kern_stride_nkern*blockIdx.y;//the good nkern
......@@ -471,30 +484,31 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
__syncthreads();
load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
img_stride_col, img_stride_row, false, c_contiguous);
if(!(split && ! preload_full_kern))
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
if(split && ! preload_full_kern){
for(int first_row=0, row=tz;first_row<kern_len;row+=blockDim.z, first_row+=blockDim.z){
int idx3;
//TODO: test/check for flipped_kern
if(flipped_kern)
idx3=(kern_len-(first_row)-blockDim.z);//the current last row flipped
else
idx3=first_row;
for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
//N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
// (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
// details, but that seemed really weird. tricky bug to find too.
int idx3 = flipped_kern
? max((kern_len - (int)blockDim.z - first_row),0)
: first_row;
int len3 = min(blockDim.z, kern_len - first_row);
__syncthreads();
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, blockDim.z,
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
const float* idx_kern=&d_kern[tz*kern_stride_row];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
const float* idx_kern=&d_kern[tz*kern_wid];
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0;
if(row<kern_len)
if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2;
}
}else if(split){
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
......@@ -504,18 +518,21 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int row = tz;//The row of the kernel.
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
//reduce
//reduce no sync because previous loop ends with sync
d_reduce[thread_id]=sum;
__syncthreads();
if(thread_id<out_len*out_wid){
sum=0;
for(int i=0;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*blockDim.x*blockDim.y];
if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
//sum=0;
for(int i=1;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*out_wid*out_len];
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
......
......@@ -134,7 +134,9 @@ CudaNdarray_uninit(CudaNdarray*self)
assert(self->devdata);
if (device_free(self->devdata))
{
std::cerr << "!!!! error freeing device memory\n";
fprintf(stderr,
"!!!! error freeing device memory %p (self=%p)\n",
self->devdata, self);
rval = -1;
}
self->devdata = NULL;
......@@ -144,7 +146,9 @@ CudaNdarray_uninit(CudaNdarray*self)
{
if (device_free(self->dev_structure))
{
std::cerr << "!!!! error freeing device memory\n";
fprintf(stderr,
"!!!! error freeing dev_structure memory %p (self=%p)\n",
self->dev_structure, self);
rval = -1;
}
self->dev_structure = NULL;
......@@ -1848,6 +1852,8 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
}
get_gpu_ptr_size<<<1,1>>>(gpu_data);
if (cudaSuccess != cublasGetError()){
device_free(gpu_data);
return PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_ptr_int_size: error when calling the gpu code.");
}
......
......@@ -403,6 +403,11 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
self->devdata = 0;
return -1;
}
if (0)
fprintf(stderr,
"Allocated devdata %p (self=%p)\n",
self->devdata,
self);
self->data_allocated = size;
}
return 0;
......
......@@ -84,14 +84,36 @@ def py_conv_scipy(img, kern, mode, subsample):
def _params_allgood_header():
    """Print the column header for the benchmark table emitted by _params_allgood."""
    print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), kern_stride=(1,1), version=-1, verbose=0, random=True, print_=None, id=None, rtol=1e-5, atol = 1e-8, nb_iter=0, ones=False):
def test_example():
    """Regression test for one specific convolution configuration.

    The shapes and options below were copied from a 'FAIL' line printed
    by _params_allgood during a nosetest run of the big unit-tests, so
    the failing case can now be exercised directly via:
        nosetests test_conv_cuda_ndarray.py:test_example
    """
    ishape = (1, 1, 4, 4)
    kshape = (1, 1, 3, 2)
    ok = _params_allgood(ishape, kshape, 'valid', version=13, random=False)
    assert ok
def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
kern_stride=(1,1), version=-1, verbose=0, random=True, print_=None,
id=None, rtol=1e-5, atol = 1e-8, nb_iter=0, ones=False):
#
# This function is the core of several of the big unit-test drivers,
# but it can also be used very directly on its own to test a specific
# kind of convolution.
#
# See `test_example` (above) for an example of how to use this directly.
#
if ones:
assert not random
npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
elif random:
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
npy_img = theano._asarray(numpy.random.rand(*ishape)+1, dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape)-2, dtype='float32')
else:
npy_img = theano._asarray(numpy.arange(numpy.prod(ishape)).reshape(ishape), dtype='float32')+1
npy_kern = -(theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape), dtype='float32')+1)
......@@ -155,8 +177,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker
print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs)
print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size
print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff)
print rval
if not rval and print_!=False:
if npy_img.shape[0]>5:
print "img",npy_img[0]
......@@ -185,9 +205,19 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
ret=False
try:
ret = _params_allgood(ishape, kshape, mode,
subsample=subshape, img_stride=istride, kern_stride=kstride,
version=ver, verbose=verbose, random=random, id=id,print_=print_,rtol=rtol,ones=ones)
ret = _params_allgood(ishape,
kshape,
mode,
subsample=subshape,
img_stride=istride,
kern_stride=kstride,
version=ver,
verbose=verbose,
random=random,
id=id,
print_=print_,
rtol=rtol,
ones=ones)
except Exception, e:
print ver, id,(ishape, kshape, subshape, istride, kstride)
print e
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论