提交 0d5cffbe authored 作者: Sean Lee's avatar Sean Lee

Force instantiate kernel templates

上级 89f584bc
......@@ -122,7 +122,7 @@ conv_full_patch( const float* img, const float* kern, float* out,
//template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void
__device__ inline void
conv_full_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
......@@ -182,6 +182,31 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
/* C-linkage wrappers that force instantiation of the conv_full_patch_stack
 * device template for every <img_c_contiguous_2d, kern_c_contiguous_2d>
 * combination, exporting un-mangled __global__ symbols — presumably so the
 * kernels can be looked up by name at load time (TODO confirm against caller).
 * Suffix encoding (see the instantiation list below):
 *   suffix = img_c_contiguous_2d * 2 + kern_c_contiguous_2d
 * NOTE(review): macro renamed from __INSTANTIATE_CONV_FULL_PATCH_STACK —
 * identifiers starting with a double underscore are reserved for the
 * implementation in C++ ([lex.name]). The macro is #undef'd at the end of
 * this block, and the generated kernel names are unchanged, so callers are
 * unaffected.
 */
extern "C" {
#define INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
    const float *img, const float *kern, float *out, \
    int img_len, int img_wid, \
    int kern_len, int kern_wid, int nkern, int nstack, \
    int img_stride_col, int img_stride_row, \
    int kern_stride_col, int kern_stride_row, \
    int kern_stride_stack, int kern_stride_nkern) \
{ \
    conv_full_patch_stack<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}
INSTANTIATE_CONV_FULL_PATCH_STACK(0, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK(1, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK(2, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
#undef INSTANTIATE_CONV_FULL_PATCH_STACK
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
......@@ -200,8 +225,8 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
* template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
* if you set split and low_mem to true, we will use the low_mem version!
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__device__ inline void
conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
......@@ -257,7 +282,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -292,7 +317,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -340,7 +365,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
if(out_row<out_len)
......@@ -351,6 +376,42 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
}
}
/* C-linkage wrappers that force instantiation of the
 * conv_full_patch_stack_padded device template for each supported
 * <flipped_kern, c_contiguous, split, low_mem> combination, exporting
 * un-mangled __global__ symbols — presumably so the kernels can be looked
 * up by name at load time (TODO confirm against caller).
 * Suffix encoding (bit-packed template arguments):
 *   suffix = flipped_kern*8 + c_contiguous*4 + split*2 + low_mem
 * Suffixes 3, 7, 11 and 15 (split && low_mem) are intentionally omitted:
 * per the template's own doc comment, when both flags are set the low_mem
 * path is used, so those combinations are redundant.
 * NOTE(review): macro renamed from __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED —
 * identifiers starting with a double underscore are reserved for the
 * implementation in C++ ([lex.name]). The macro is #undef'd at the end of
 * this block, and the generated kernel names are unchanged, so callers are
 * unaffected.
 */
extern "C" {
#define INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
    const float *img, const float *kern, float *out, \
    const int img_len, const int img_wid, \
    const int kern_len, const int kern_wid, \
    const int nkern, const int nstack, \
    const int img_stride_col, const int img_stride_row, \
    const int img_stride_stack, const int img_stride_batch, \
    const int kern_stride_col, const int kern_stride_row, \
    const int kern_stride_stack, const int kern_stride_nkern) \
{ \
    conv_full_patch_stack_padded<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0, false, false, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1, false, false, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2, false, false, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4, false, true, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5, false, true, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6, false, true, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8, true, false, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9, true, false, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10, true, false, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12, true, true, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13, true, true, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
#undef INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
......@@ -364,7 +425,6 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{
return x[0] * y[0];
}
template<int NSTACK>
__global__ void
conv_full_load_everything( const float* img, const float* kern, float* out,
int img_len, int img_wid,
......@@ -423,9 +483,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
{
int icol = out_col - kern_wid+1+col;
if (icol < 0 || icol > img_wid) continue;
if (NSTACK > 0)
if (THEANO_KERN_WID > 0)
{
sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid,
sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
d_kern + row*kern_wid+col, kern_len*kern_wid);
}
else
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论