Commit c032ac66, authored by Arnaud Bergeron

Attempt at fixing convolution. It does not crash but gives wrong results.

Parent 4f1c2697
...@@ -164,7 +164,7 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -164,7 +164,7 @@ class GpuConv(GpuKernelBase, gof.Op):
node_ = copy.copy(node) node_ = copy.copy(node)
assert node.op is node_.op assert node.op is node_.op
if node_.op.max_threads_dim0 is None: if node_.op.max_threads_dim0 is None:
node_.op.max_threads_dim0 = node._inputs[0].type.context.maxlsize node_.op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
return super(GpuConv, node_.op).make_thunk(node_, storage_map, return super(GpuConv, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling) compute_map, no_recycling)
...@@ -179,7 +179,7 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -179,7 +179,7 @@ class GpuConv(GpuKernelBase, gof.Op):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 22) return (0, 23)
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
if node.inputs[0].type.context.kind != "cuda": if node.inputs[0].type.context.kind != "cuda":
...@@ -251,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -251,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
""" % locals() """ % locals()
code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read() code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in ["conv_kernel.cu", "conv_full_kernel.cu"]]) for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
kname = "conv_full_load_everything"
gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags) gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
bin = gk._binary bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin) bcode = ','.join(hex(ord(c)) for c in bin)
...@@ -262,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -262,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
static const char conv_bcode[] = {%(bcode)s}; static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s"; static const char *conv_code = "%(code)s";
""" % locals() """ % locals()
for k in kernels: return mod
mod += "static GpuKernel " + k.name + '_' + name + ";\n"
mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read() def c_support_code_struct(self, node, name):
mod = GpuKernelBase.c_support_code_struct(self, node, name)
with open(os.path.join(os.path.split(__file__)[0], "conv.cu")) as f:
mod += f.read()
return mod return mod
@utils.memoize @utils.memoize
......
...@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64 //Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit. //bit, it is 32 bit.
const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){ __device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64) if (nb_thread < 64)
...@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
if (thread_id < nb_thread) if (thread_id < nb_thread)
{ {
const float * my_src_ptr = (const float *)( const float * my_src_ptr = (const float *)(
((uintptr_t)src) & COALESCED_ALIGN); ((size_t)src) & COALESCED_ALIGN);
my_src_ptr += thread_id; my_src_ptr += thread_id;
while (my_src_ptr < src + N) while (my_src_ptr < src + N)
{ {
......
...@@ -837,8 +837,7 @@ def local_gpu_conv(node, context_name): ...@@ -837,8 +837,7 @@ def local_gpu_conv(node, context_name):
return return
out = gpu_conv(GpuFromHost(context_name)(img), out = gpu_conv(GpuFromHost(context_name)(img),
GpuFromHost(context_name)(kern)) GpuFromHost(context_name)(kern))
# op_lifter want the output on the GPU. assert isinstance(out.type, GpuArrayType)
out = GpuFromHost(context_name)(out)
out.values_eq_approx = values_eq_approx out.values_eq_approx = values_eq_approx
return [out] return [out]
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment