Commit 5840d042 authored by Arnaud Bergeron

Switch gs and ls to follow libgpuarray.

Parent: c0b24762
...@@ -349,23 +349,23 @@ int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared, ...@@ -349,23 +349,23 @@ int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared,
{args}) {{ {args}) {{
{setargs} {setargs}
return GpuKernel_call(&{kname}, _nd, _ldim, _gdim, _shared, NULL); return GpuKernel_call(&{kname}, _nd, _gdim, _ldim, _shared, NULL);
}} }}
int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{ int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
size_t _ls = 0;
size_t _gs = 0; size_t _gs = 0;
size_t _ls = 0;
int _err; int _err;
if (_nd != 1) return GA_UNSUPPORTED_ERROR; if (_nd != 1) return GA_UNSUPPORTED_ERROR;
_err = GpuKernel_sched(&{kname}, _n[0], &_ls, &_gs); _err = GpuKernel_sched(&{kname}, _n[0], &_gs, &_ls);
if (_err != GA_NO_ERROR) if (_err != GA_NO_ERROR)
return _err; return _err;
{setargs} {setargs}
return GpuKernel_call(&{kname}, 1, &_ls, &_gs, _shared, NULL); return GpuKernel_call(&{kname}, 1, &_gs, &_ls, _shared, NULL);
}} }}
""".format(args=args, fname=k.fname, setargs=setargs, sname=k.sname, """.format(args=args, fname=k.fname, setargs=setargs, sname=k.sname,
kname=k.objvar) kname=k.objvar)
......
...@@ -786,7 +786,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -786,7 +786,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0}; (void *)&stride_Z0};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
%(err_check)s %(err_check)s
""" """
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
...@@ -852,7 +852,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -852,7 +852,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
n_blocks[0],n_blocks[1],n_blocks[2], n_blocks[0],n_blocks[1],n_blocks[2],
n_blocks[0]*n_blocks[1]*n_blocks[2], n_blocks[0]*n_blocks[1]*n_blocks[2],
n_shared, %(shapes_data)s); n_shared, %(shapes_data)s);
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
%(err_check)s %(err_check)s
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1254,7 +1254,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1254,7 +1254,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
n_threads, numEls, n_threads, numEls,
PyGpuArray_NDIM(%(x)s)); PyGpuArray_NDIM(%(x)s));
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params); int err = GpuKernel_call(&%(k_var)s, 1, &n_blocks, &n_threads, n_shared, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -1424,7 +1424,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1424,7 +1424,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params); int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
}else{ }else{
...@@ -1453,7 +1453,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1453,7 +1453,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2, (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset, (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params); int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -1528,7 +1528,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1528,7 +1528,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -1662,7 +1662,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1662,7 +1662,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -2849,7 +2849,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2849,7 +2849,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (gs == 0) gs = 1; if (gs == 0) gs = 1;
n /= gs; n /= gs;
ls = %(ls)s; ls = %(ls)s;
err = GpuKernel_call(&%(k_var)s, 1, &ls, &gs, 0, args); err = GpuKernel_call(&%(k_var)s, 1, &gs, &ls, 0, args);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: GpuCAReduceCPY: %%s.", "gpuarray error: GpuCAReduceCPY: %%s.",
......
...@@ -396,7 +396,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -396,7 +396,7 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &offsetZ, (void*) &offsetZ,
(void*) deviceBlockSum->ga.data (void*) deviceBlockSum->ga.data
}; };
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params); int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed"); PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
return -1; return -1;
...@@ -421,7 +421,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -421,7 +421,7 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &offsetY, (void*) &offsetY,
(void*) &offsetZ (void*) &offsetZ
}; };
int err = GpuKernel_call(&k_finalCumOp_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params); int err = GpuKernel_call(&k_finalCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed"); PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
return -1; return -1;
...@@ -446,7 +446,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -446,7 +446,7 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &(tmp0), (void*) &(tmp0),
(void*) &(tmp1) (void*) &(tmp1)
}; };
int err = GpuKernel_call(&k_cumadd_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params); int err = GpuKernel_call(&k_cumadd_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "cumadd call failed"); PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
return -1; return -1;
......
...@@ -224,7 +224,7 @@ int gemm16(PyGpuArrayObject *C, float alpha, ...@@ -224,7 +224,7 @@ int gemm16(PyGpuArrayObject *C, float alpha,
params[11] = &beta; params[11] = &beta;
params[12] = &flags; params[12] = &flags;
if (GpuKernel_call(gk, 2, threads, grid, 0, params) != GA_NO_ERROR) { if (GpuKernel_call(gk, 2, grid, threads, 0, params) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call"); PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call");
res = 1; res = 1;
} }
......
...@@ -205,7 +205,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -205,7 +205,7 @@ KERNEL void k_multi_warp_multinomial(
args[8] = (void*)&strides[3]; args[8] = (void*)&strides[3];
args[9] = (void*)&strides[4]; args[9] = (void*)&strides[4];
err = GpuKernel_call(&%(kname)s, 1, &nb_threads, &nb_blocks, 0, args); err = GpuKernel_call(&%(kname)s, 1, &nb_blocks, &nb_threads, 0, args);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
...@@ -455,7 +455,7 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -455,7 +455,7 @@ KERNEL void k_multi_warp_multinomial_wor(
nb_blocks2[0] = nb_blocks; nb_blocks2[0] = nb_blocks;
nb_blocks2[1] = 1; nb_blocks2[1] = 1;
err = GpuKernel_call(&%(kname)s, 2, nb_threads2, nb_blocks2, 0, args); err = GpuKernel_call(&%(kname)s, 2, nb_blocks2, nb_threads2, 0, args);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
......
...@@ -470,7 +470,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -470,7 +470,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
(void *)&stride_Z1, (void *)&stride_Z1,
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset}; (void *)&%(z)s->ga.offset};
err = GpuKernel_call(fptr, 3, threads_per_block, n_blocks, 0, kernel_params); err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} // END NESTED SCOPE } // END NESTED SCOPE
......
...@@ -411,7 +411,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -411,7 +411,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
(void *)&stride_YIDX0, (void *)&stride_YIDX0,
(void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset, (void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
(void *)&stride_DX0, (void *)&stride_DX1}; (void *)&stride_DX0, (void *)&stride_DX1};
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -587,13 +587,13 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -587,13 +587,13 @@ class GpuSoftmax(GpuKernelBase, Op):
//TODO: read the information from the card. //TODO: read the information from the card.
if(shmem_sz < (32 * 1024 - 500)){ if(shmem_sz < (32 * 1024 - 500)){
err = GpuKernel_call(&kSoftmax_%(nodename)s, 3, err = GpuKernel_call(&kSoftmax_%(nodename)s, 3,
threads_per_block, n_blocks, shmem_sz, n_blocks, threads_per_block, shmem_sz,
kernel_params); kernel_params);
fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s"; fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmax_%(nodename)s, err); msg = GpuKernel_error(&kSoftmax_%(nodename)s, err);
}else{ }else{
err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3, err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3,
threads_per_block, n_blocks, n_blocks, threads_per_block,
threads_per_block[0] * sizeof(npy_%(work_x)s), threads_per_block[0] * sizeof(npy_%(work_x)s),
kernel_params); kernel_params);
fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s"; fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s";
...@@ -801,13 +801,13 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -801,13 +801,13 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
{ {
if(shmem_sz < (32 * 1024 - 500)){ if(shmem_sz < (32 * 1024 - 500)){
err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3, err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3,
threads_per_block, n_blocks, shmem_sz, n_blocks, threads_per_block, shmem_sz,
kernel_params); kernel_params);
fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s"; fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err); msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err);
}else{ }else{
err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s, err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s,
3, threads_per_block, n_blocks, 3, n_blocks, threads_per_block,
threads_per_block[0] * sizeof(npy_%(work_x)s), threads_per_block[0] * sizeof(npy_%(work_x)s),
kernel_params); kernel_params);
fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s"; fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s";
......
...@@ -1055,7 +1055,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1055,7 +1055,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
(void *)&indices_arr->ga.offset, (void *)&indices_arr->ga.offset,
(void *)&set_instead_of_inc, (void *)&set_instead_of_inc,
(void *)errbuf}; (void *)errbuf};
err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params); err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.", "gpuarray error: %(k_var)s: %%s.",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论