gpu conv - template-unrolling depth logarithmic in kernel size rather than linear.

9e4f9e45 · James Bergstra · 8b9676e3 · 9e4f9e45 · 9e4f9e45
--- a/theano/sandbox/cuda/conv_full_kernel.cu
+++ b/theano/sandbox/cuda/conv_full_kernel.cu
@@ -348,12 +348,17 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,

 template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy) 
 { 
-    return x[0] * y[0] + everything_dot<i-1>(x+sx, sx, y+sy, sy);
+    return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sx*(i/2), sx, y+sy*(i/2), sy) ;
+    // Strided dot product of i terms, split into i/2 + (i+1)/2 terms so template recursion depth is logarithmic in i; each pointer advances by its own stride (x by sx, y by sy). Linear form was: x[0] * y[0] + everything_dot<i-1>(x+sx, sx, y+sy, sy)
 }
 template <> __device__ float everything_dot<0>(const float * x, const int sx, const float * y, const int sy)
 { 
    return 0;
 }
+template <> __device__ float everything_dot<1>(const float * x, const int sx, const float * y, const int sy)
+{ 
+    return x[0] * y[0]; // single-term base case: one product, the strides sx and sy are not used
+}
 template<int NSTACK>
 __global__ void
 conv_full_load_everything( float* img, float* kern, float* out,

--- a/theano/sandbox/cuda/conv_kernel.cu
+++ b/theano/sandbox/cuda/conv_kernel.cu
@@ -156,9 +156,14 @@ __device__ void load_padded_col_to_shared(float * dst, const float * src,

 template<int i> __device__ float convolutionRowNoFlip(const float *data,
 						      const float *kern){
-  return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern);
+    return convolutionRowNoFlip<i/2>(data, kern)+ convolutionRowNoFlip<(i+1)/2>(data+i/2, kern+i/2) ;
+  // Sums data[k]*kern[k] for k in [0, i): the first i/2 terms plus the remaining (i+1)/2 terms starting at offset i/2. Linear form was: data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern)
 }

+template<> __device__ float convolutionRowNoFlip<1>(const float *data,
+						    const float *kern){
+    return data[0]*kern[0]; // single-term base case: one product of the first elements
+}
 template<> __device__ float convolutionRowNoFlip<0>(const float *data,
 						    const float *kern){
    return 0;