提交 5fc89c03 authored 作者: Frederic's avatar Frederic

new GpuConv compiles, but gives the wrong version in some cases!

上级 baf12f54
import copy
import os
import theano
from theano import gof
from theano import config, gof
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox.gpuarray.type import GpuArrayType
class GpuConv(gof.Op):
......@@ -114,6 +119,9 @@ class GpuConv(gof.Op):
str(self.kshp))
def make_node(self, img, kern):
if img.dtype != "float32" or kern.dtype != "float32":
raise NotImplementedError("GpuConv currently only work"
" with float32 dtype")
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
......@@ -121,7 +129,8 @@ class GpuConv(gof.Op):
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
out = GpuArrayType(img.dtype, broadcastable)()
return gof.Apply(self, [img, kern], [out])
def flops(self, inputs, outputs):
""" Useful with the hack in profilemode to print the MFlops"""
......@@ -145,6 +154,8 @@ class GpuConv(gof.Op):
def make_thunk(self, node, storage_map, compute_map, no_recycling):
node_ = copy.copy(node)
assert node.op is node_.op
if config.gpuarray.sync:
raise NotImplementedError("GpuConv do not implement gpuarray.sync Theano flag")
if node_.op.max_threads_dim0 is None:
cuda = theano.sandbox.cuda
device_id = cuda.use.device_number
......@@ -169,20 +180,30 @@ class GpuConv(gof.Op):
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
def c_headers(self):
    """Return the C headers needed by the code generated for this op.

    ``cuda.h`` and the compyte compatibility headers are required by the
    gpuarray backend; ``<stdio.h>`` is used by the verbose/debug printfs
    in the kernel support code.
    """
    # The stale header list from the old CudaNdarray backend
    # ('cuda_ndarray.cuh') was left above the updated return by a bad
    # merge, shadowing it; only the gpuarray headers are returned now.
    return ['<stdio.h>', 'cuda.h',
            '<compyte/extension.h>', '<compyte/numpy_compat.h>']
def c_code_cache_version(self):
    """Version tag for Theano's compiled-code cache.

    Bump this tuple whenever any of the support code files is modified,
    so that previously compiled modules are invalidated.
    """
    version = (0, 20)
    return version
def c_init_code(self):
    """C statements executed once at module initialization.

    Fetches the ``cuda_get_ptr`` extension from compyte so the support
    code can translate a ``gpudata`` handle into a raw CUDA device
    pointer.
    """
    init_stmt = ('cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))'
                 'compyte_get_extension("cuda_get_ptr");')
    return [init_stmt]
def c_support_code_apply(self, node, nodename):
    """Return the C support code (pointer helpers + CUDA kernels).

    REMEMBER TO RAISE c_code_cache_version when changing any of
    the support code files read below.
    """
    # Helpers that turn a PyGpuArrayObject into a raw float*, using the
    # compyte "cuda_get_ptr" extension fetched in c_init_code().
    codes = [
        "CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);",
        "float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}",
        "const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}",
    ]
    # Kernel sources shipped alongside this module.  A merge artifact
    # previously read these files twice (into a `codes` list that was
    # immediately overwritten); they are now read exactly once, and the
    # file handles are closed deterministically.
    here = os.path.split(__file__)[0]
    for fname in ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']:
        with open(os.path.join(here, fname)) as src:
            codes.append(src.read())
    # ''.join is linear and portable; reduce(str.__add__, ...) was
    # quadratic and relied on the Python-2-only `reduce` builtin.
    return ''.join(codes)
def c_compiler(self):
    """Select the nvcc-based compiler, required to build the CUDA
    kernels embedded in the support code."""
    compiler = NVCC_compiler
    return compiler
def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp
out, = out_
......@@ -226,7 +247,8 @@ class GpuConv(gof.Op):
}
// TODO, make out be decref before we alloc out2!
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
PyGpuArrayObject * out2 = (PyGpuArrayObject *)PyGpuArray_Conv(
%(img)s, %(kern)s,
%(out)s, mode,
dx, dy,
version, verbose,
......
......@@ -4,7 +4,8 @@
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
conv_full_patch_split(const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{
int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1;
......@@ -60,7 +61,7 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch( float* img, float* kern, float* out,
conv_full_patch( const float* img, const float* kern, float* out,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack)
{
......@@ -122,7 +123,7 @@ conv_full_patch( float* img, float* kern, float* out,
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void
conv_full_patch_stack( float* img, float* kern, float* out,
conv_full_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -133,7 +134,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
float __shared__ *kern_, *img_;
const float __shared__ *kern_, *img_;
extern __shared__ float s_data[];
const int batch_id = blockIdx.x;
......@@ -201,7 +202,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
conv_full_patch_stack_padded( float* img, float* kern, float* out,
conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
const int nkern, const int nstack,
......@@ -365,7 +366,7 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
}
template<int NSTACK>
__global__ void
conv_full_load_everything( float* img, float* kern, float* out,
conv_full_load_everything( const float* img, const float* kern, float* out,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
......
......@@ -221,7 +221,7 @@ __device__ void store_or_accumulate(float& dst,const float value ){
*/
template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void
conv_patch( float* img, float* kern, float* out,
conv_patch( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack)
{
......@@ -304,7 +304,7 @@ conv_patch( float* img, float* kern, float* out,
*/
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void
conv_patch_stack( float* img, float* kern, float* out,
conv_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
......@@ -375,7 +375,7 @@ conv_patch_stack( float* img, float* kern, float* out,
out_row*out_wid+out_col],sum);
}else{
float __shared__ *kern_, *img_;
const float __shared__ *kern_, *img_;
int __shared__ out_len_max;
kern_=kern+kern_stride_nkern*kern_id;//the good nkern
......@@ -456,7 +456,7 @@ conv_patch_stack( float* img, float* kern, float* out,
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void
conv_patch_stack_reduce( float* img, float* kern, float* out,
conv_patch_stack_reduce( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch,
......@@ -572,7 +572,7 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
*/
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows( float* img, float* kern, float* out,
conv_rows( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -633,7 +633,7 @@ conv_rows( float* img, float* kern, float* out,
*/
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows_stack( float* img, float* kern, float* out,
conv_rows_stack( const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
......@@ -731,7 +731,7 @@ conv_rows_stack( float* img, float* kern, float* out,
*/
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void
conv_rows_stack2( float* img, float* kern, float* out,
conv_rows_stack2(const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
......@@ -831,8 +831,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
......@@ -859,8 +859,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
float sum = 0.0f;
if(stack_loop){
for (; ss < stacklen; ss+=blockDim.x){
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
......@@ -869,8 +869,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
}
}
}else{
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
......@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols)
{
......@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols)
{
......
......@@ -25,6 +25,7 @@ from theano.tests.unittest_tools import seed_rng
from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.conv import GpuConv
import pygpu
gftensor4 = GpuArrayType('float32', [False] * 4)
......@@ -159,11 +160,11 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
t1 = time.time()
i = gftensor4()
k = gftensor4()
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
subsample=subsample,
version=version,
verbose=verbose,
kshp=compile_kshp)(i, k)
op = GpuConv(border_mode=mode,
subsample=subsample,
version=version,
verbose=verbose,
kshp=compile_kshp)(i, k)
f = theano.function([i, k], op, mode=mode_with_gpu)
gpuval = f(img, kern)
t2 = time.time()
......@@ -731,7 +732,7 @@ class TestConv2DGPU(unittest.TestCase):
func = theano.function([a, A], image_estimate, mode=mode_with_gpu)
#theano.printing.debugprint(func,)
assert any([isinstance(node.op, theano.sandbox.cuda.blas.GpuConv)
assert any([isinstance(node.op, GpuConv)
for node in func.maker.fgraph.toposort()])
a_in = numpy.random.randn(*featshp).astype("float32")
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论