Attempt at fixing convolution. It does not crash but gives wrong results.

c032ac66 · Arnaud Bergeron · 4f1c2697 · c032ac66 · c032ac66 · c032ac66
--- a/theano/sandbox/gpuarray/conv.cu
+++ b/theano/sandbox/gpuarray/conv.cu
@@ -134,7 +134,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
    const int out_size_byte = out_size*sizeof(float);
    if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
     PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
-                   " %d kernel columns, but the kernel we received had %llud columns!",
+                   " %d kernel columns, but the kernel we received had %llu columns!",
                   THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]);
      return -1;
    }
@@ -217,13 +217,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i,"
-                      " n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i, nb_split=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1], nb_split);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_patch' failed (%s),"
@@ -307,21 +300,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,

        if (err == GA_NO_ERROR)
        {
-            if (verbose>1)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i,"
-                      " kern_flipped=true, accumulate=false, kern_width=%i,"
-                      " img_c_contiguous_2d=%i,"
-                      " kern_c_contiguous_2d=%i, nb_split=%i,"
-                      " preload_full_kernel=%i,"
-                      " subsample_rows=%llu, subsample_cols=%llu\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1],
-                      THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
-                      nb_split, preload_full_kernel,
-                      (unsigned long long)subsample_rows,
-                      (unsigned long long)subsample_cols);
            if (verbose)
              fprintf(stderr,
                      "INFO: used 'conv_patch_stack' version with nb_split=%i"
@@ -334,21 +312,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i,"
-                      " kern_flipped=true, accumulate=false,"
-                      " kern_width=%i, img_c_contiguous_2d=%i,"
-                      " kern_c_contiguous_2d=%i, nb_split=%i,"
-                      " preload_full_kernel=%i,"
-                      " subsample_rows=%llu, subsample_cols=%llu\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1],
-                      THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
-                      nb_split, preload_full_kernel,
-                      (unsigned long long)subsample_rows,
-                      (unsigned long long)subsample_cols);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_patch_stack' failed (%s),"
@@ -394,12 +357,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_rows' failed (%s),"
@@ -428,19 +385,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,

        size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);

-        if (0)
-          fprintf(stderr,
-                  "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
-                  img_contiguous_2d, kern_contiguous_2d,
-                  threads_per_block[0], threads_per_block[1], threads_per_block[2],
-                  n_blocks[0], n_blocks[1], n_blocks[2]);
-
        GpuKernel *k = NULL;
        if(!img_contiguous_2d || !kern_contiguous_2d) {
-            //fprintf(stderr, "using false version\n");
            k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
        } else {
-            //fprintf(stderr, "using true version\n");
            k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
        }

@@ -460,23 +408,11 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        if (err == GA_NO_ERROR)
        {
            work_complete = true;
-            if (verbose>1)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_rows_stack' failed (%s),"
@@ -543,12 +479,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        if (err == GA_NO_ERROR)
        {
            work_complete = true;
-            if (verbose>1)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr,
                      "INFO: used 'conv_rows_stack2' version %s with"
@@ -558,12 +488,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i version=%d\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1],(version==9?2:3));
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_rows_stack2' failed (%s),"
@@ -680,13 +604,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,

            if (err == GA_NO_ERROR)
            {
-                if (verbose>1)
-                    fprintf(stderr,
-                            "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i, "
-                            "n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i,"
-                            " nb_threads=%i\n",
-                            threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1],
-                            shmem_sz, threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
                if (verbose)
                    fprintf(stderr,
                            "INFO: used 'conv_patch_stack_reduce' version"
@@ -697,14 +614,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
            }
            else
            {
-                if (verbose)
-                  fprintf(stderr,
-                          "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
-                          " n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i,"
-                          " nb_threads=%i\n",
-                          threads_per_block[0], threads_per_block[1], threads_per_block[2],
-                          n_blocks[0], n_blocks[1], shmem_sz,
-                          threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
                if (verbose)
                  fprintf(stderr,
                          "INFO: impl 'conv_patch_stack_reduce' failed (%s),"
@@ -714,7 +623,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        } // else no good nb_splits was found
    }

-    if (1 && (version==6||version==-1) &&
+    if ((version==6||version==-1) &&
        kern_len<=320 &&
        !work_complete) //conv_valid_row_reduce
    {
@@ -782,12 +691,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0],
-                      n_reduce_buf, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_valid_row_reduce' failed (%s),"
@@ -805,43 +708,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
                                        (size_t)256),
                               (size_t)1, (size_t)1};

-        if (1)
-        {
-            if (verbose)
-              fprintf(stderr, "INFO: launching conv_reference_valid\n");
-            if (verbose>1)
-              fprintf(stderr, "      img : %i %llu %i %i %p  "
-                      "%lld %lld %lld %lld\n",
-                      nbatch, (unsigned long long)stack_len, img_len, img_wid,
-                      (void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
-                      (long long)img_stride_batch,
-                      (long long)img_stride_stack,
-                      (long long)img_stride_row,
-                      (long long)img_stride_col);
-            if (verbose>1)
-              fprintf(stderr, "      kern: %i %i %i %i %p  "
-                      "%lld %lld %lld %lld\n",
-                      nkern, nstack, kern_len, kern_wid,
-                      (void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
-                      (long long)kern_stride_nkern,
-                      (long long)kern_stride_stack,
-                      (long long)kern_stride_row,
-                      (long long)kern_stride_col);
-            if (verbose>1)
-                fprintf(stderr, "      out : %llu %llu %i %i %p  "
-                        "%lld %lld %lld %lld\n",
-                      (unsigned long long)PyGpuArray_DIMS(out)[0],
-                      (unsigned long long)PyGpuArray_DIMS(out)[1],
-                      out_len, out_wid,
-                      (void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
-                      (long long)out_stride_batch,
-                      (long long)out_stride_nkern,
-                      (long long)out_stride_row,
-                      (long long)out_stride_col);
-            if (verbose>1)
-              fprintf(stderr, "   launch params: %i %i %i\n",
-                      outsize, n_blocks[0], threads_per_block[0]);
-        }
+        if (verbose)
+            fprintf(stderr, "INFO: launching conv_reference_valid\n");

        void *kernel_params[] = {
            (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
@@ -1113,15 +981,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,

        if (err == GA_NO_ERROR)
        {
-          if (verbose>1)
-            fprintf(stderr,
-                    "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
-                    " n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i, nb_threads=%i,"
-                    " out_len=%i, nb_split=%i, version=%i\n",
-                    threads_per_block[0], threads_per_block[1], threads_per_block[2],
-                    n_blocks[0], n_blocks[1], shmem_sz,
-                    threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
-                    out_len, nb_split, version);
            if (verbose)
              fprintf(stderr,
                      "INFO: used 'conv_full_patch_stack_padded'"
@@ -1131,15 +990,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        }
        else
        {
-          if (verbose)
-            fprintf(stderr,
-                    "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
-                    " n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i, nb_threads=%i,"
-                    " out_len=%i, nb_split=%i, version=%i\n",
-                    threads_per_block[0], threads_per_block[1], threads_per_block[2],
-                    n_blocks[0], n_blocks[1], shmem_sz,
-                    threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
-                    out_len, nb_split, version);
          if (verbose)
            fprintf(stderr,
                    "INFO: impl 'conv_full_patch_stack_padded' %s %s"
@@ -1179,12 +1029,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
-                      threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_full_patch' failed (%s),"
@@ -1225,12 +1069,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
-                      threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
                      " failed (%s), trying next implementation\n",
@@ -1276,12 +1114,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
                      GpuKernel_error(k, err));
@@ -1298,55 +1130,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
                                        (size_t)256),
                               (size_t)1, (size_t)1};

-        if (0)
-        {
-            if (verbose)
-              fprintf(stderr, "INFO: launching conv_reference_valid\n");
-            if (verbose)
-              fprintf(stderr, "      img : %llu %llu %llu %llu %p  "
-                      "%lld %lld %lld %lld\n",
-                      (unsigned long long)nbatch,
-                      (unsigned long long)stack_len,
-                      (unsigned long long)img_len,
-                      (unsigned long long)img_wid,
-                      (void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
-                      (long long)img_stride_batch,
-                      (long long)img_stride_stack,
-                      (long long)img_stride_row,
-                      (long long)img_stride_col);
-            if (verbose)
-              fprintf(stderr, "      kern: %llu %llu %llu %llu %p  "
-                      "%lld %lld %lld %lld\n",
-                      (unsigned long long)nkern,
-                      (unsigned long long)nstack,
-                      (unsigned long long)kern_len,
-                      (unsigned long long)kern_wid,
-                      (void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
-                      (long long)kern_stride_nkern,
-                      (long long)kern_stride_stack,
-                      (long long)kern_stride_row,
-                      (long long)kern_stride_col);
-            if (verbose)
-                fprintf(stderr, "      out : %llu %llu %llu %llu %p  "
-                        "%lld %lld %lld %lld\n",
-                      (unsigned long long)PyGpuArray_DIMS(out)[0],
-                      (unsigned long long)PyGpuArray_DIMS(out)[1],
-                      (unsigned long long)out_len,
-                      (unsigned long long)out_wid,
-                      (void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
-                      (long long)out_stride_batch,
-                      (long long)out_stride_nkern,
-                      (long long)out_stride_row,
-                      (long long)out_stride_col);
-            if (verbose)
-              fprintf(stderr, "   launch params: %i %i %i\n",
-                      outsize, n_blocks[0], threads_per_block[0]);
-            if (verbose)
-                fprintf(stderr, "   subsample params: %llu %llu\n",
-                        (unsigned long long)subsample_rows,
-                        (unsigned long long)subsample_cols);
-        }
-
        void *kernel_params[] = {
            (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
            (void *)&img_len, (void *)&img_wid,
@@ -1377,11 +1160,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        }
        else
        {
-          if (verbose)
-            fprintf(stderr, "threads_per_block[0]=%i, threads_per_block[1]=%i,"
-                    " n_blocks[0]=%i, n_blocks[1]=%i,"
-                    " shmem_sz=%i, nb_threads=%i\n",
-                    threads_per_block[0], 1, n_blocks[0], 1, 0, threads_per_block[0]);
          if (verbose)
            fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
                    " trying next implementation\n",
@@ -1465,7 +1243,7 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,

      rval = pygpu_zeros(4, out_dim,
                         img->ga.typecode, GA_C_ORDER,
-                         img->ctx, Py_None);
+                         img->context, Py_None);
      //rval might be null
    }
    if ((rval==NULL)
@@ -1488,14 +1266,3 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
    }
    return (PyObject*)rval;
 }
-
-/*
-  Local Variables:
-  mode:c++
-  c-basic-offset:4
-  c-file-style:"stroustrup"
-  indent-tabs-mode:nil
-  fill-column:79
-  End:
-*/
-// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
--- a/theano/sandbox/gpuarray/conv.py
+++ b/theano/sandbox/gpuarray/conv.py
@@ -164,7 +164,7 @@ class GpuConv(GpuKernelBase, gof.Op):
        node_ = copy.copy(node)
        assert node.op is node_.op
        if node_.op.max_threads_dim0 is None:
-            node_.op.max_threads_dim0 = node._inputs[0].type.context.maxlsize
+            node_.op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
        return super(GpuConv, node_.op).make_thunk(node_, storage_map,
                                                   compute_map, no_recycling)

@@ -179,7 +179,7 @@ class GpuConv(GpuKernelBase, gof.Op):

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 22)
+        return (0, 23)

    def c_code(self, node, nodename, inp, out_, sub):
        if node.inputs[0].type.context.kind != "cuda":
@@ -251,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
        """ % locals()
        code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
                           for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
-        kname = "conv_full_load_everything"
        gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
        bin = gk._binary
        bcode = ','.join(hex(ord(c)) for c in bin)
@@ -262,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
        static const char conv_bcode[] = {%(bcode)s};
        static const char *conv_code = "%(code)s";
        """ % locals()
-        for k in kernels:
-            mod += "static GpuKernel " + k.name + '_' + name + ";\n"
-        mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read()
+        return mod
+
+    def c_support_code_struct(self, node, name):
+        mod = GpuKernelBase.c_support_code_struct(self, node, name)
+        with open(os.path.join(os.path.split(__file__)[0], "conv.cu")) as f:
+            mod += f.read()
        return mod

    @utils.memoize

--- a/theano/sandbox/gpuarray/conv_kernel.cu
+++ b/theano/sandbox/gpuarray/conv_kernel.cu
@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {

 //Must be the same size as a ptr. We can't use unsigned long as on Windows 64
 //bit, it is 32 bit.
-const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
+const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers

 __device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
  if (nb_thread < 64)
@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
      if (thread_id < nb_thread)
        {
          const float * my_src_ptr = (const float *)(
-                  ((uintptr_t)src) & COALESCED_ALIGN);
+                  ((size_t)src) & COALESCED_ALIGN);
          my_src_ptr += thread_id;
          while (my_src_ptr < src + N)
          {

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -837,8 +837,7 @@ def local_gpu_conv(node, context_name):
        return
    out = gpu_conv(GpuFromHost(context_name)(img),
                   GpuFromHost(context_name)(kern))
-    # op_lifter want the output on the GPU.
-    out = GpuFromHost(context_name)(out)
+    assert isinstance(out.type, GpuArrayType)
    out.values_eq_approx = values_eq_approx
    return [out]