Commit 5fc89c03 authored by Frederic

New GpuConv compiles, but gives the wrong version in some cases!

Parent baf12f54
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
//TODO detect SHARED_SIZE dynamically
#define SHARED_SIZE (16*1024)
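// Note (assumption, not from this commit): 16*1024 matches the per-block
// shared memory of compute capability 1.x GPUs; newer devices have more,
// which is why the TODO above suggests querying it at runtime
// (e.g. via cudaDeviceProp.sharedMemPerBlock).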
enum { ConvMode_FULL, ConvMode_VALID };

PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern, PyGpuArrayObject * out, const int mode,
                           const size_t subsample_rows, const size_t subsample_cols, const int version, const int verbose);
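// Usage sketch (illustrative only, not part of this commit): a VALID-mode
// convolution with no subsampling, letting version=-1 pick a kernel
// heuristically; passing out=NULL lets the function allocate the output:
//   PyObject *res = PyGpuArray_Conv(img, kern, NULL, ConvMode_VALID,
//                                   1, 1, /*version=*/-1, /*verbose=*/0);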
template <typename T>
static T ceil_intdiv(T a, T b)
{
    return (a/b) + ((a % b) ? 1: 0);
}
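// e.g. ceil_intdiv(5, 2) == 3 whereas plain integer division gives 5/2 == 2;
// it is used below so subsampled output dimensions round up instead of
// truncating.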
/*
 * version: -1, autodetect, >=0 a specific version to use.
 *          If it can't be executed, we revert to the reference implementation
 */
int
PyGpuArray_conv_valid(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
                      PyGpuArrayObject * out, size_t subsample_rows, size_t subsample_cols,
                      int version = -1, int verbose=0,
                      int max_threads_dim0 = 512
                      )
{
    int work_complete = 0;
    const int shared_avail = SHARED_SIZE-150;//150 is a bit more than the biggest static shared size (144) used when compiling this file.
    if (PyGpuArray_NDIM(img) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required img of 4D");
        return -1;
    }
    if (PyGpuArray_NDIM(kern) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required kern of 4D");
        return -1;
    }
    if (PyGpuArray_NDIM(out) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required out of 4D");
        return -1;
@@ -40,40 +50,40 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
                version, THEANO_KERN_WID);
        fprintf(stderr,
                "INFO: img dim: %zu %zu %zu %zu img stride: %zd %zd %zd %zd\n",
                PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(img)[1],
                PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3],
                PyGpuArray_STRIDES(img)[0]/4,
                PyGpuArray_STRIDES(img)[1]/4,
                PyGpuArray_STRIDES(img)[2]/4,
                PyGpuArray_STRIDES(img)[3]/4);
        fprintf(stderr,
                "INFO: kern dim: %zu %zu %zu %zu kern stride: %zd %zd %zd %zd\n",
                PyGpuArray_DIMS(kern)[0], PyGpuArray_DIMS(kern)[1],
                PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3],
                PyGpuArray_STRIDES(kern)[0]/4,
                PyGpuArray_STRIDES(kern)[1]/4,
                PyGpuArray_STRIDES(kern)[2]/4,
                PyGpuArray_STRIDES(kern)[3]/4);
        fprintf(stderr,
                "INFO: out dim: %zu %zu %zu %zu out stride: %zd %zd %zd %zd\n",
                PyGpuArray_DIMS(out)[0], PyGpuArray_DIMS(out)[1],
                PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3],
                PyGpuArray_STRIDES(out)[0]/4,
                PyGpuArray_STRIDES(out)[1]/4,
                PyGpuArray_STRIDES(out)[2]/4,
                PyGpuArray_STRIDES(out)[3]/4);
        fprintf(stderr,
                "INFO: subsample_rows=%zu, subsample_cols=%zu\n",
                subsample_rows, subsample_cols);
    }
    //Check the output size is valid
    assert (PyGpuArray_DIMS(out)[2] == ceil_intdiv(PyGpuArray_DIMS(img)[2] - PyGpuArray_DIMS(kern)[2] + 1, subsample_rows));
    assert (PyGpuArray_DIMS(out)[3] == ceil_intdiv(PyGpuArray_DIMS(img)[3] - PyGpuArray_DIMS(kern)[3] + 1, subsample_cols));
    assert (PyGpuArray_DIMS(out)[0] == PyGpuArray_DIMS(img)[0]);
    assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
    assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);

    // we now search through a few implementations until one applies to our arguments.
@@ -82,24 +92,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    //TODO: make the number of divisions a parameter
    //TODO: Should we make them in separate grid block instead?
    const int nstack = PyGpuArray_DIMS(kern)[1];
    const int nbatch = PyGpuArray_DIMS(img)[0];
    const int nkern = PyGpuArray_DIMS(kern)[0];
    const int img_wid = PyGpuArray_DIMS(img)[3];
    const int img_len = PyGpuArray_DIMS(img)[2];
    const int kern_wid = PyGpuArray_DIMS(kern)[3];
    const int kern_len = PyGpuArray_DIMS(kern)[2];
    const int out_wid = PyGpuArray_DIMS(out)[3];
    const int out_len = PyGpuArray_DIMS(out)[2];
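    // Note (our reading of the change): unlike CudaNdarray_HOST_STRIDES,
    // PyGpuArray_STRIDES returns strides in *bytes*; dividing by 4 converts
    // them to float32 element strides, which is what the kernels expect.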
    const int img_stride_col = PyGpuArray_STRIDES(img)[3]/4;
    const int img_stride_row = PyGpuArray_STRIDES(img)[2]/4;
    const int img_stride_stack = PyGpuArray_STRIDES(img)[1]/4;
    const int img_stride_batch = PyGpuArray_STRIDES(img)[0]/4;
    const int kern_stride_col = PyGpuArray_STRIDES(kern)[3]/4;
    const int kern_stride_row = PyGpuArray_STRIDES(kern)[2]/4;
    const int kern_stride_stack = PyGpuArray_STRIDES(kern)[1]/4;
    const int kern_stride_nkern = PyGpuArray_STRIDES(kern)[0]/4;

    const int img_size = img_len*img_wid;
    const int kern_size = kern_len*kern_wid;
@@ -107,17 +117,17 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    const int img_size_byte = img_size*sizeof(float);
    const int kern_size_byte = kern_size*sizeof(float);
    const int out_size_byte = out_size*sizeof(float);
    if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
        PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
                     " %d kernel columns, but the kernel we received had %zu columns!",
                     THEANO_KERN_WID, PyGpuArray_DIMS(kern)[3]);
        return -1;
    }
    bool subsample = subsample_rows!=1 || subsample_cols!=1;

    bool img_contiguous = img->ga.flags & GA_C_CONTIGUOUS;
    bool kern_contiguous = kern->ga.flags & GA_C_CONTIGUOUS;
    bool out_contiguous = out->ga.flags & GA_C_CONTIGUOUS;
    bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;

    bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
@@ -130,7 +140,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    //we don't need to unflip it, but have the new value when we unflip it.
    bool kern_flipped=true;
    bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
    const float * kern_data_unflipped = cuda_get_ptr(kern);
    int kern_stride_col_unflipped=kern_stride_col;
    int kern_stride_row_unflipped=kern_stride_row;
    if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
@@ -139,7 +149,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        kern_stride_row_unflipped=kern_wid;
        kern_flipped=false;
        kern_contiguous_2d_unflipped = true;
        kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
    }

    //if we remove the restriction
@@ -173,7 +183,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        dim3 grid(nbatch, nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);
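        // Sizing logic (our reading, not stated in the commit): this variant
        // preloads one whole image patch plus one whole kernel into dynamic
        // shared memory, hence (img_size + kern_size) floats per block.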
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int);
@@ -184,9 +194,9 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        CONV_PATCH_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid, nkern, nstack);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -234,7 +244,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
@@ -277,14 +287,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid,
             out_len, out_wid, nkern, nstack,
             img_stride_col, img_stride_row, img_stride_stack,
             img_stride_batch, kern_stride_col, kern_stride_row,
             kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -346,7 +355,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        dim3 threads(out_wid);
        dim3 grid(out_len, nbatch*nkern);
        int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
@@ -358,14 +367,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        CONV_ROWS_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size >>>
            (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid, nkern, nstack,
             img_stride_col, img_stride_row,
             img_stride_stack, img_stride_batch,
             kern_stride_col, kern_stride_row,
             kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -408,7 +416,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
@@ -430,16 +438,15 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        }
        f<<< grid, threads, shared_size >>>
            (cuda_get_ptr(img),
             cuda_get_ptr(kern),
             cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid, nkern, nstack,
             img_stride_col, img_stride_row,
             img_stride_stack, img_stride_batch,
             kern_stride_col, kern_stride_row,
             kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -503,7 +510,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
@@ -518,16 +525,15 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size >>>
            (cuda_get_ptr(img),
             cuda_get_ptr(kern),
             cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid, nkern, nstack,
             img_stride_col, img_stride_row,
             img_stride_stack, img_stride_batch,
             kern_stride_col, kern_stride_row,
             kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -626,7 +632,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        dim3 threads(out_wid, out_len, thread_z);
        dim3 grid(nbatch,nkern);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int,
@@ -657,13 +663,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
        CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
                                           img_len, img_wid, kern_len, kern_wid,
                                           nkern, nstack,
                                           img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
                                           kern_stride_col_unflipped, kern_stride_row_unflipped,
                                           kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -705,8 +711,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        kern_len<=320 &&
        !work_complete) //conv_valid_row_reduce
    {
        int outsize = PyGpuArray_SIZE(out);
        int n_blocks = std::min(outsize, 4096);
        int block_nstack=nstack;
        //Max of 512 threads per blocks.
@@ -736,8 +742,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        void (*f)(int, int, int, int,
                  int, int, int, int, int, int,
                  const float*, int, int, int, int,
                  const float*, int, int, int, int,
                  float*, int, int, int, int,
                  int, int, int);
@@ -749,23 +755,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        else
            f=conv_valid_row_reduce<true>;
        f<<<n_blocks, n_threads, n_reduce_buf>>>(
            nbatch, nkern, PyGpuArray_DIMS(img)[1],
            img_len, img_wid,
            kern_len, kern_wid,
            out_len, out_wid,
            cuda_get_ptr(img),
            PyGpuArray_STRIDES(img)[0]/4, PyGpuArray_STRIDES(img)[1]/4,
            img_stride_row, img_stride_col,
            cuda_get_ptr(kern),
            PyGpuArray_STRIDES(kern)[0]/4, PyGpuArray_STRIDES(kern)[1]/4,
            PyGpuArray_STRIDES(kern)[2]/4, PyGpuArray_STRIDES(kern)[3]/4,
            cuda_get_ptr(out),
            PyGpuArray_STRIDES(out)[0]/4, PyGpuArray_STRIDES(out)[1]/4,
            PyGpuArray_STRIDES(out)[2]/4, PyGpuArray_STRIDES(out)[3]/4,
            subsample_rows, subsample_cols, initial_reduce_boundary);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -791,65 +795,64 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    if (1 && !work_complete) //conv_reference_valid
    {
        int outsize = PyGpuArray_SIZE(out);
        int n_blocks = std::min(outsize, 4096);
        int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
                                 256);
        if (1)
        {
            if (verbose)
                fprintf(stderr, "INFO: launching conv_reference_valid\n");
            if (verbose>1)
fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid, nbatch, PyGpuArray_DIMS(img)[1], img_len, img_wid,
img->devdata, cuda_get_ptr(img),
CudaNdarray_HOST_STRIDES(img)[0], PyGpuArray_STRIDES(img)[0]/4,
CudaNdarray_HOST_STRIDES(img)[1], PyGpuArray_STRIDES(img)[1]/4,
CudaNdarray_HOST_STRIDES(img)[2], PyGpuArray_STRIDES(img)[2]/4,
CudaNdarray_HOST_STRIDES(img)[3]); PyGpuArray_STRIDES(img)[3]/4);
if (verbose>1) if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
nkern, nstack, kern_len, kern_wid, nkern, nstack, kern_len, kern_wid,
kern->devdata, cuda_get_ptr(kern),
CudaNdarray_HOST_STRIDES(kern)[0], PyGpuArray_STRIDES(kern)[0]/4,
CudaNdarray_HOST_STRIDES(kern)[1], PyGpuArray_STRIDES(kern)[1]/4,
CudaNdarray_HOST_STRIDES(kern)[2], PyGpuArray_STRIDES(kern)[2]/4,
CudaNdarray_HOST_STRIDES(kern)[3]); PyGpuArray_STRIDES(kern)[3]/4);
if (verbose>1) if (verbose>1)
fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], PyGpuArray_DIMS(out)[0],
CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid, PyGpuArray_DIMS(out)[1], out_len, out_wid,
out->devdata, cuda_get_ptr(out),
CudaNdarray_HOST_STRIDES(out)[0], PyGpuArray_STRIDES(out)[0]/4,
CudaNdarray_HOST_STRIDES(out)[1], PyGpuArray_STRIDES(out)[1]/4,
CudaNdarray_HOST_STRIDES(out)[2], PyGpuArray_STRIDES(out)[2]/4,
CudaNdarray_HOST_STRIDES(out)[3]); PyGpuArray_STRIDES(out)[3]/4);
if (verbose>1) if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n", fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads); outsize, n_blocks, n_threads);
} }
        conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
                PyGpuArray_DIMS(img)[1],
                img_len, img_wid,
                kern_len, kern_wid,
                out_len, out_wid,
                cuda_get_ptr(img),
                PyGpuArray_STRIDES(img)[0]/4,
                PyGpuArray_STRIDES(img)[1]/4,
                PyGpuArray_STRIDES(img)[2]/4,
                PyGpuArray_STRIDES(img)[3]/4,
                cuda_get_ptr(kern),
                PyGpuArray_STRIDES(kern)[0]/4,
                PyGpuArray_STRIDES(kern)[1]/4,
                PyGpuArray_STRIDES(kern)[2]/4,
                PyGpuArray_STRIDES(kern)[3]/4,
                cuda_get_ptr(out),
                PyGpuArray_STRIDES(out)[0]/4,
                PyGpuArray_STRIDES(out)[1]/4,
                PyGpuArray_STRIDES(out)[2]/4,
                PyGpuArray_STRIDES(out)[3]/4,
                subsample_rows, subsample_cols);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
@@ -864,7 +867,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
            fprintf(stderr, "INFO: 'conv_reference_valid' failed\n");
            PyErr_Format(PyExc_RuntimeError,
                         "ERROR: all implementations failed for"
                         " PyGpuArray_conv_valid! (%s)",
                         cudaGetErrorString(sts));
            return -1;
        }
@@ -873,7 +876,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    {
        PyErr_Format(PyExc_RuntimeError,
                     "ERROR: no implementation(s) worked for"
                     " PyGpuArray_conv_valid!"
                     " Version asked(%d) (-1 means use a heuristic)",
                     version);
        return -1;
@@ -882,56 +885,56 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    }
int
PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
                     PyGpuArrayObject * out, size_t subsample_rows,
                     size_t subsample_cols, int version = -1, int verbose=0,
                     int max_threads_dim0=512)
{
    //150 is a bit more than the biggest static shared size (144) used when compiling this file.
    const int shared_avail = SHARED_SIZE - 150;
    int work_complete = 0;
    if (PyGpuArray_NDIM(img) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required img of 4D");
        return -1;
    }
    if (PyGpuArray_NDIM(kern) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required kern of 4D");
        return -1;
    }
    if (PyGpuArray_NDIM(out) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required out of 4D");
        return -1;
    }

    // check the size of the output matrix
    assert (PyGpuArray_DIMS(out)[2] == ceil_intdiv(PyGpuArray_DIMS(img)[2] + PyGpuArray_DIMS(kern)[2] - 1, subsample_rows));
    assert (PyGpuArray_DIMS(out)[3] == ceil_intdiv(PyGpuArray_DIMS(img)[3] + PyGpuArray_DIMS(kern)[3] - 1, subsample_cols));
    assert (PyGpuArray_DIMS(out)[0] == PyGpuArray_DIMS(img)[0]);
    assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
    assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);
    const int nstack = PyGpuArray_DIMS(kern)[1];
    const int nbatch = PyGpuArray_DIMS(img)[0];
    const int nkern = PyGpuArray_DIMS(kern)[0];
    const int img_wid = PyGpuArray_DIMS(img)[3];
    const int img_len = PyGpuArray_DIMS(img)[2];
    const int kern_wid = PyGpuArray_DIMS(kern)[3];
    const int kern_len = PyGpuArray_DIMS(kern)[2];
    const int out_wid = PyGpuArray_DIMS(out)[3];
    const int out_len = PyGpuArray_DIMS(out)[2];

    const int img_stride_col = PyGpuArray_STRIDES(img)[3]/4;
    const int img_stride_row = PyGpuArray_STRIDES(img)[2]/4;
    const int img_stride_stack = PyGpuArray_STRIDES(img)[1]/4;
    const int img_stride_batch = PyGpuArray_STRIDES(img)[0]/4;
    const int kern_stride_col = PyGpuArray_STRIDES(kern)[3]/4;
    const int kern_stride_row = PyGpuArray_STRIDES(kern)[2]/4;
    const int kern_stride_stack = PyGpuArray_STRIDES(kern)[1]/4;
    const int kern_stride_nkern = PyGpuArray_STRIDES(kern)[0]/4;

    const int img_size = img_len*img_wid;
    const int kern_size = kern_len*kern_wid;
@@ -946,20 +949,20 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
    //const int out_size_byte = out_size*sizeof(float); // unused
    if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) ||
          (THEANO_KERN_WID == 0))){
        PyErr_Format(PyExc_ValueError,
                     "ERROR: This GpuConv code was compiled for"
                     " %d kernel columns, but the kernel we received"
                     " had %zu columns!",
                     THEANO_KERN_WID, PyGpuArray_DIMS(kern)[3]);
        return -1;
    }
    bool subsample = subsample_rows!=1 || subsample_cols!=1;

    bool img_contiguous = img->ga.flags & GA_C_CONTIGUOUS;
    bool kern_contiguous = kern->ga.flags & GA_C_CONTIGUOUS;
    bool out_contiguous = out->ga.flags & GA_C_CONTIGUOUS;
    bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;

    bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
@@ -974,7 +977,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
    //we don't need to unflip it, but have the new value when we unflip it.
    bool kern_flipped=true;
    bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
    const float * kern_data_unflipped = cuda_get_ptr(kern);
    int kern_stride_col_unflipped=kern_stride_col;
    int kern_stride_row_unflipped=kern_stride_row;
    if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
@@ -983,7 +986,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        kern_stride_row_unflipped=kern_wid;
        kern_flipped=false;
        kern_contiguous_2d_unflipped = true;
        kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
    }

    if (verbose>1)
@@ -991,26 +994,26 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        printf("INFO: Running conv_full version=%d,"
               " MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
        printf("INFO: img dim: %zu %zu %zu %zu img stride: %zd %zd %zd %zd\n",
               PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(img)[1],
               PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3],
               PyGpuArray_STRIDES(img)[0]/4,
               PyGpuArray_STRIDES(img)[1]/4,
               PyGpuArray_STRIDES(img)[2]/4,
               PyGpuArray_STRIDES(img)[3]/4);
        printf("INFO: kern dim: %zu %zu %zu %zu kern stride: %zd %zd %zd %zd\n",
               PyGpuArray_DIMS(kern)[0], PyGpuArray_DIMS(kern)[1],
               PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3],
               PyGpuArray_STRIDES(kern)[0]/4,
               PyGpuArray_STRIDES(kern)[1]/4,
               PyGpuArray_STRIDES(kern)[2]/4,
               PyGpuArray_STRIDES(kern)[3]/4);
        printf("INFO: out dim: %zu %zu %zu %zu out stride: %zd %zd %zd %zd\n",
               PyGpuArray_DIMS(out)[0], PyGpuArray_DIMS(out)[1],
               PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3],
               PyGpuArray_STRIDES(out)[0]/4,
               PyGpuArray_STRIDES(out)[1]/4,
               PyGpuArray_STRIDES(out)[2]/4,
               PyGpuArray_STRIDES(out)[3]/4);
    }
    if (!subsample &&
@@ -1063,7 +1066,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        int shared_size=img_size_padded_byte + kern_size_byte;
        if(version==5)
            shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
@@ -1087,13 +1090,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid, nkern, nstack,
             img_stride_col, img_stride_row, img_stride_stack,
             img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
             kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -1147,14 +1149,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        //TODO assert c_contiguous for img, kern and out in the 2 inner dimensions.
        conv_full_patch<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img),
             cuda_get_ptr(kern),
             cuda_get_ptr(out),
             img_len, img_wid,
             kern_len, kern_wid,
             nkern, nstack);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -1189,30 +1190,29 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        //TODO assert c_contiguous for img, kern and out in the 2 inner dimensions.
        //typeof(conv_full_load_everything<0>) f = ;
        void (*f)(const float*, const float*, float*,
                  int, int, int, int, int, int,
                  int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
        f = conv_full_load_everything<THEANO_KERN_WID>;
        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img),
             cuda_get_ptr(kern),
             cuda_get_ptr(out),
             img_len, img_wid,
             kern_len, kern_wid,
             nkern, nstack,
             PyGpuArray_STRIDES(img)[3]/4,
             PyGpuArray_STRIDES(img)[2]/4,
             PyGpuArray_STRIDES(img)[1]/4,
             PyGpuArray_STRIDES(img)[0]/4,
             PyGpuArray_STRIDES(kern)[3]/4,
             PyGpuArray_STRIDES(kern)[2]/4,
             PyGpuArray_STRIDES(kern)[1]/4,
             PyGpuArray_STRIDES(kern)[0]/4
             );
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -1246,7 +1246,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int);
@@ -1257,15 +1257,15 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>;
        f<<< grid, threads, shared_size>>>(
            cuda_get_ptr(img),
            cuda_get_ptr(kern),
            cuda_get_ptr(out),
            img_len, img_wid,
            kern_len, kern_wid,
            nkern, nstack, img_stride_col, img_stride_row,
            kern_stride_col, kern_stride_row,
            kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -1290,48 +1290,48 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
    {
        if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");

        int outsize = PyGpuArray_SIZE(out);
        int n_blocks = std::min(outsize, 4096);
        int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
                                 256);
        if (0)
        {
            if (verbose)
                fprintf(stderr, "INFO: launching conv_reference_full\n");
            if (verbose)
                fprintf(stderr, " img : %zu %zu %zu %zu %p %zd %zd %zd %zd\n",
                        PyGpuArray_DIMS(img)[0],
                        PyGpuArray_DIMS(img)[1],
                        PyGpuArray_DIMS(img)[2],
                        PyGpuArray_DIMS(img)[3],
                        cuda_get_ptr(img),
                        PyGpuArray_STRIDES(img)[0]/4,
                        PyGpuArray_STRIDES(img)[1]/4,
                        PyGpuArray_STRIDES(img)[2]/4,
                        PyGpuArray_STRIDES(img)[3]/4);
            if (verbose)
                fprintf(stderr, " kern: %zu %zu %zu %zu %p %zd %zd %zd %zd\n",
                        PyGpuArray_DIMS(kern)[0],
                        PyGpuArray_DIMS(kern)[1],
                        PyGpuArray_DIMS(kern)[2],
                        PyGpuArray_DIMS(kern)[3],
                        cuda_get_ptr(kern),
                        PyGpuArray_STRIDES(kern)[0]/4,
                        PyGpuArray_STRIDES(kern)[1]/4,
                        PyGpuArray_STRIDES(kern)[2]/4,
                        PyGpuArray_STRIDES(kern)[3]/4
                        );
            if (verbose)
                fprintf(stderr, " out : %zu %zu %zu %zu %p %zd %zd %zd %zd\n",
                        PyGpuArray_DIMS(out)[0],
                        PyGpuArray_DIMS(out)[1],
                        PyGpuArray_DIMS(out)[2],
                        PyGpuArray_DIMS(out)[3],
                        cuda_get_ptr(out),
                        PyGpuArray_STRIDES(out)[0]/4,
                        PyGpuArray_STRIDES(out)[1]/4,
                        PyGpuArray_STRIDES(out)[2]/4,
                        PyGpuArray_STRIDES(out)[3]/4);
            if (verbose)
                fprintf(stderr, " launch params: %i %i %i\n",
                        outsize, n_blocks, n_threads);
@@ -1340,25 +1340,24 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
                subsample_rows, subsample_cols);
        }
        conv_reference_full<<<n_blocks, n_threads>>>(
            PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(kern)[0],
            PyGpuArray_DIMS(img)[1],
            PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3],
            PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3],
            PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3],
            cuda_get_ptr(img), PyGpuArray_STRIDES(img)[0]/4,
            PyGpuArray_STRIDES(img)[1]/4,
            PyGpuArray_STRIDES(img)[2]/4,
            PyGpuArray_STRIDES(img)[3]/4,
            cuda_get_ptr(kern), PyGpuArray_STRIDES(kern)[0]/4,
            PyGpuArray_STRIDES(kern)[1]/4,
            PyGpuArray_STRIDES(kern)[2]/4,
            PyGpuArray_STRIDES(kern)[3]/4,
            cuda_get_ptr(out), PyGpuArray_STRIDES(out)[0]/4,
            PyGpuArray_STRIDES(out)[1]/4,
            PyGpuArray_STRIDES(out)[2]/4,
            PyGpuArray_STRIDES(out)[3]/4,
            subsample_rows, subsample_cols);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
@@ -1392,9 +1391,9 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
}

PyObject *
PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
                PyGpuArrayObject * out, const int mode,
                const size_t subsample_rows, const size_t subsample_cols,
                const int version, const int verbose,
                const int max_threads_dim0 = 512
                )
@@ -1402,43 +1401,43 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
    // Re-use the out object if possible. If the out object is not used, then its refcount is not modified.
    // If the out object is re-used then it is returned, and its refcount is incremented by 1.
    //
    if (PyGpuArray_NDIM(img) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "PyGpuArray 4-D tensor required");
        return NULL;
    }
    if (PyGpuArray_NDIM(kern) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "PyGpuArray 4-D tensor required");
        return NULL;
    }

    size_t out_dim[4];
    out_dim[0] = PyGpuArray_DIMS(img)[0];
    out_dim[1] = PyGpuArray_DIMS(kern)[0];
    size_t logical_rows, logical_cols;
    if (mode == ConvMode_VALID)
    {
        logical_rows = PyGpuArray_DIMS(img)[2] - PyGpuArray_DIMS(kern)[2] + 1;
        logical_cols = PyGpuArray_DIMS(img)[3] - PyGpuArray_DIMS(kern)[3] + 1;
    }
    else
    {
        logical_rows = PyGpuArray_DIMS(img)[2] + PyGpuArray_DIMS(kern)[2] - 1;
        logical_cols = PyGpuArray_DIMS(img)[3] + PyGpuArray_DIMS(kern)[3] - 1;
    }
    out_dim[2] = ceil_intdiv(logical_rows, subsample_rows);
    out_dim[3] = ceil_intdiv(logical_cols, subsample_cols);
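    // Worked example (illustrative): VALID mode with a 32x32 image, a 5x5
    // kernel and subsample 2x2 gives logical_rows = 32-5+1 = 28 and
    // out_dim[2] = ceil_intdiv(28, 2) = 14.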
    PyGpuArrayObject * rval = NULL;

    if ( out
        && PyGpuArray_NDIM(out)==4
        && out->ga.flags & GA_C_CONTIGUOUS
        && PyGpuArray_DIMS(out)[0]==out_dim[0]
        && PyGpuArray_DIMS(out)[1]==out_dim[1]
        && PyGpuArray_DIMS(out)[2]==out_dim[2]
        && PyGpuArray_DIMS(out)[3]==out_dim[3])
    {
        rval = out;
        Py_INCREF(rval);
@@ -1458,20 +1457,22 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
                    "INFO: Conv doesn't have an 'out' argument"
                    " structure.\n");
        rval = pygpu_zeros(4, out_dim,
                           img->ga.typecode, GA_C_ORDER,
                           pygpu_default_context(), Py_None);
        //rval might be null
    }
    if ((rval==NULL)
        || ((mode==ConvMode_VALID) && PyGpuArray_conv_valid(img, kern, rval,
                                                            subsample_rows,
                                                            subsample_cols,
                                                            version, verbose,
                                                            max_threads_dim0))
        || ((mode==ConvMode_FULL) && PyGpuArray_conv_full(img, kern, rval,
                                                          subsample_rows,
                                                          subsample_cols,
                                                          version, verbose,
                                                          max_threads_dim0))
        )
    {
        // if rval is something we just allocated,
...
import copy
import os
import theano
from theano import config, gof
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox.gpuarray.type import GpuArrayType


class GpuConv(gof.Op):
@@ -114,6 +119,9 @@ class GpuConv(gof.Op):
                str(self.kshp))

    def make_node(self, img, kern):
        if img.dtype != "float32" or kern.dtype != "float32":
            raise NotImplementedError("GpuConv currently only works"
                                      " with float32 dtype")
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
@@ -121,7 +129,8 @@ class GpuConv(gof.Op):
        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False]
        out = GpuArrayType(img.dtype, broadcastable)()
        return gof.Apply(self, [img, kern], [out])

    def flops(self, inputs, outputs):
        """ Useful with the hack in profilemode to print the MFlops"""
@@ -145,6 +154,8 @@ class GpuConv(gof.Op):
    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        node_ = copy.copy(node)
        assert node.op is node_.op
        if config.gpuarray.sync:
            raise NotImplementedError("GpuConv does not implement the"
                                      " gpuarray.sync Theano flag")
        if node_.op.max_threads_dim0 is None:
            cuda = theano.sandbox.cuda
            device_id = cuda.use.device_number
@@ -169,20 +180,30 @@ class GpuConv(gof.Op):
        return ['-DTHEANO_KERN_WID=' + str(nb)]  # ,'-g','-G']

    def c_headers(self):
        return ['<stdio.h>', 'cuda.h',
                '<compyte/extension.h>', '<compyte/numpy_compat.h>']

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (0, 20)
    def c_init_code(self):
        return ['cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
        codes = ["CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);",
                 "float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}",
                 "const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}"]
codes += [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files]
return reduce(str.__add__, codes) return reduce(str.__add__, codes)
def c_compiler(self):
return NVCC_compiler
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp img, kern = inp
out, = out_ out, = out_
...@@ -226,7 +247,8 @@ class GpuConv(gof.Op): ...@@ -226,7 +247,8 @@ class GpuConv(gof.Op):
} }
// TODO, make out be decref before we alloc out2! // TODO, make out be decref before we alloc out2!
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, PyGpuArrayObject * out2 = (PyGpuArrayObject *)PyGpuArray_Conv(
%(img)s, %(kern)s,
%(out)s, mode, %(out)s, mode,
dx, dy, dx, dy,
version, verbose, version, verbose,
......
@@ -4,7 +4,8 @@
 //grid block size=batch_id
 //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
 __global__ void
-conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
+conv_full_patch_split(const float* img, const float* kern, float* out,
+                      int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
 {
     int __shared__ out_len, out_wid, nb_thread_id;
     out_len = img_len + kern_len - 1;
@@ -60,7 +61,7 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
 //grid block size=batch_id, nkern
 //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
 __global__ void
-conv_full_patch( float* img, float* kern, float* out,
+conv_full_patch( const float* img, const float* kern, float* out,
                  int img_len, int img_wid,
                  int kern_len, int kern_wid, int nkern, int nstack)
 {
@@ -122,7 +123,7 @@ conv_full_patch( float* img, float* kern, float* out,
 template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
 __global__ void
-conv_full_patch_stack( float* img, float* kern, float* out,
+conv_full_patch_stack( const float* img, const float* kern, float* out,
                        int img_len, int img_wid,
                        int kern_len, int kern_wid, int nkern, int nstack,
                        int img_stride_col, int img_stride_row,
@@ -133,7 +134,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
     out_len = img_len + kern_len - 1;
     out_wid = img_wid + kern_wid - 1;
     nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
-    float __shared__ *kern_, *img_;
+    const float __shared__ *kern_, *img_;
     extern __shared__ float s_data[];
     const int batch_id = blockIdx.x;
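One change in this hunk is easy to misread: in `const float __shared__ *kern_, *img_;`, the __shared__ qualifier binds to the pointer variables themselves (they live in shared memory), while const binds to the floats they point at. The same change recurs in conv_patch_stack below. A tiny self-contained illustration (kernel name and contents are hypothetical, not from this commit):

__global__ void qualifier_demo(const float* g_in, float* g_out)
{
    // Pointer object stored in shared memory, pointing at read-only floats.
    const float __shared__ *p;
    if (threadIdx.x == 0)
        p = g_in;            // repointing is fine: the pointer is not const
    __syncthreads();
    // p[0] = 1.0f;          // would not compile: the pointee is const
    g_out[threadIdx.x] = p[threadIdx.x];   // reading through it is fine
}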
@@ -201,7 +202,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
  */
 template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
 __global__ void
-conv_full_patch_stack_padded( float* img, float* kern, float* out,
+conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
                               const int img_len, const int img_wid,
                               const int kern_len, const int kern_wid,
                               const int nkern, const int nstack,
@@ -365,7 +366,7 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
 }
 template<int NSTACK>
 __global__ void
-conv_full_load_everything( float* img, float* kern, float* out,
+conv_full_load_everything( const float* img, const float* kern, float* out,
                            int img_len, int img_wid,
                            int kern_len, int kern_wid, int nkern, int nstack,
                            int img_stride_col, int img_stride_row,
...
@@ -221,7 +221,7 @@ __device__ void store_or_accumulate(float& dst,const float value ){
  */
 template<bool flipped_kern, int KERN_WIDTH, bool split>
 __global__ void
-conv_patch( float* img, float* kern, float* out,
+conv_patch( const float* img, const float* kern, float* out,
             int img_len, int img_wid, int kern_len, int kern_wid,
             int nkern, int nstack)
 {
@@ -304,7 +304,7 @@ conv_patch( float* img, float* kern, float* out,
  */
 template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
 __global__ void
-conv_patch_stack( float* img, float* kern, float* out,
+conv_patch_stack( const float* img, const float* kern, float* out,
                   int img_len, int img_wid, int kern_len, int kern_wid,
                   int out_len, int out_wid,
                   int nkern, int nstack, int img_stride_col,int img_stride_row,
@@ -375,7 +375,7 @@ conv_patch_stack( float* img, float* kern, float* out,
                             out_row*out_wid+out_col],sum);
     }else{
-        float __shared__ *kern_, *img_;
+        const float __shared__ *kern_, *img_;
         int __shared__ out_len_max;
         kern_=kern+kern_stride_nkern*kern_id;//the good nkern
@@ -456,7 +456,7 @@ conv_patch_stack( float* img, float* kern, float* out,
  */
 template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
 __global__ void
-conv_patch_stack_reduce( float* img, float* kern, float* out,
+conv_patch_stack_reduce( const float* img, const float* kern, float* out,
                          int img_len, int img_wid, int kern_len, int kern_wid,
                          int nkern, int nstack, int img_stride_col,int img_stride_row,
                          int img_stride_stack, int img_stride_batch,
@@ -572,7 +572,7 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
  */
 template<int KERN_WIDTH, bool c_contiguous>
 __global__ void
-conv_rows( float* img, float* kern, float* out,
+conv_rows( const float* img, const float* kern, float* out,
            int img_len, int img_wid, int kern_len, int kern_wid,
            int nkern, int nstack,
            int img_stride_col, int img_stride_row,
@@ -633,7 +633,7 @@ conv_rows( float* img, float* kern, float* out,
  */
 template<int KERN_WIDTH, bool c_contiguous>
 __global__ void
-conv_rows_stack( float* img, float* kern, float* out,
+conv_rows_stack( const float* img, const float* kern, float* out,
                  const int img_len, const int img_wid, const int kern_len, const int kern_wid,
                  const int nkern, const int nstack,
                  const int img_stride_col, const int img_stride_row,
@@ -731,7 +731,7 @@ conv_rows_stack( float* img, float* kern, float* out,
  */
 template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
 __global__ void
-conv_rows_stack2( float* img, float* kern, float* out,
+conv_rows_stack2(const float* img, const float* kern, float* out,
                  const int img_len, const int img_wid, const int kern_len, const int kern_wid,
                  const int nkern, const int nstack,
                  const int img_stride_col, const int img_stride_row,
@@ -831,8 +831,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
                       int img_len, int img_wid,
                       int kern_len, int kern_wid,
                       int out_len, int out_wid, //physical
-                      float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
-                      float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
+                      const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
+                      const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
                       float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
                       int subsample_rows, int subsample_cols,
                       const int initial_reduce_boundary)
@@ -859,8 +859,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
             float sum = 0.0f;
             if(stack_loop){
                 for (; ss < stacklen; ss+=blockDim.x){
-                    float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
-                    float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
+                    const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
+                    const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
                     for (int cc = 0; cc < kern_wid; ++cc)
                     {
                         sum += kk_0[0] * ii_0[0];
@@ -869,8 +869,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
                     }
                 }
             }else{
-                float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
-                float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
+                const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
+                const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
                 for (int cc = 0; cc < kern_wid; ++cc)
                 {
                     sum += kk_0[0] * ii_0[0];
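The loop bodies above are truncated by the diff; the stepping of kk_0 and ii_0 between iterations is elided. Given that ii_0 starts at the offset (iC_logical + kern_wid - 1)*img_str_C, a plausible reading is that the kernel pointer walks forward by its column stride while the image pointer walks backward, which is what makes this a true, kernel-flipping convolution. A sketch of the inner product under that assumption (the helper is illustrative, not in the source):

// Illustrative helper: the per-element reduction performed by
// conv_valid_row_reduce's inner loop, assuming forward kernel /
// backward image stepping.
__device__ float strided_conv_dot(const float* kk_0, int kern_str_C,
                                  const float* ii_0, int img_str_C,
                                  int kern_wid)
{
    float sum = 0.0f;
    for (int cc = 0; cc < kern_wid; ++cc)
    {
        sum += kk_0[0] * ii_0[0];
        kk_0 += kern_str_C;   // next kernel column
        ii_0 -= img_str_C;    // previous image column (flipped kernel)
    }
    return sum;
}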
...@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen, ...@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C , float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols) int subsample_rows, int subsample_cols)
{ {
...@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen, ...@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions int out_len, int out_wid, //physical dimensions
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols) int subsample_rows, int subsample_cols)
{ {
......
@@ -25,6 +25,7 @@ from theano.tests.unittest_tools import seed_rng
 from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
                                                           mode_without_gpu)
 from theano.sandbox.gpuarray.type import GpuArrayType
+from theano.sandbox.gpuarray.conv import GpuConv

 import pygpu

 gftensor4 = GpuArrayType('float32', [False] * 4)
@@ -159,11 +160,11 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
     t1 = time.time()
     i = gftensor4()
     k = gftensor4()
-    op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
-                                          subsample=subsample,
-                                          version=version,
-                                          verbose=verbose,
-                                          kshp=compile_kshp)(i, k)
+    op = GpuConv(border_mode=mode,
+                 subsample=subsample,
+                 version=version,
+                 verbose=verbose,
+                 kshp=compile_kshp)(i, k)
     f = theano.function([i, k], op, mode=mode_with_gpu)
     gpuval = f(img, kern)
     t2 = time.time()
@@ -731,7 +732,7 @@ class TestConv2DGPU(unittest.TestCase):
         func = theano.function([a, A], image_estimate, mode=mode_with_gpu)
         #theano.printing.debugprint(func,)
-        assert any([isinstance(node.op, theano.sandbox.cuda.blas.GpuConv)
+        assert any([isinstance(node.op, GpuConv)
                     for node in func.maker.fgraph.toposort()])

         a_in = numpy.random.randn(*featshp).astype("float32")
...