Commit 5840d042 authored by Arnaud Bergeron

Switch gs and ls to follow libgpuarray.

Parent: c0b24762
...@@ -349,23 +349,23 @@ int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared, ...@@ -349,23 +349,23 @@ int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared,
{args}) {{ {args}) {{
{setargs} {setargs}
return GpuKernel_call(&{kname}, _nd, _ldim, _gdim, _shared, NULL); return GpuKernel_call(&{kname}, _nd, _gdim, _ldim, _shared, NULL);
}} }}
int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{ int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
size_t _ls = 0;
size_t _gs = 0; size_t _gs = 0;
size_t _ls = 0;
int _err; int _err;
if (_nd != 1) return GA_UNSUPPORTED_ERROR; if (_nd != 1) return GA_UNSUPPORTED_ERROR;
_err = GpuKernel_sched(&{kname}, _n[0], &_ls, &_gs); _err = GpuKernel_sched(&{kname}, _n[0], &_gs, &_ls);
if (_err != GA_NO_ERROR) if (_err != GA_NO_ERROR)
return _err; return _err;
{setargs} {setargs}
return GpuKernel_call(&{kname}, 1, &_ls, &_gs, _shared, NULL); return GpuKernel_call(&{kname}, 1, &_gs, &_ls, _shared, NULL);
}} }}
""".format(args=args, fname=k.fname, setargs=setargs, sname=k.sname, """.format(args=args, fname=k.fname, setargs=setargs, sname=k.sname,
kname=k.objvar) kname=k.objvar)
......
...@@ -786,7 +786,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -786,7 +786,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0}; (void *)&stride_Z0};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
%(err_check)s %(err_check)s
""" """
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
...@@ -852,7 +852,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -852,7 +852,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
n_blocks[0],n_blocks[1],n_blocks[2], n_blocks[0],n_blocks[1],n_blocks[2],
n_blocks[0]*n_blocks[1]*n_blocks[2], n_blocks[0]*n_blocks[1]*n_blocks[2],
n_shared, %(shapes_data)s); n_shared, %(shapes_data)s);
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
%(err_check)s %(err_check)s
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1254,7 +1254,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1254,7 +1254,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
n_threads, numEls, n_threads, numEls,
PyGpuArray_NDIM(%(x)s)); PyGpuArray_NDIM(%(x)s));
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params); int err = GpuKernel_call(&%(k_var)s, 1, &n_blocks, &n_threads, n_shared, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -1424,7 +1424,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1424,7 +1424,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params); int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
}else{ }else{
...@@ -1453,7 +1453,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1453,7 +1453,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2, (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset, (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params); int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -1528,7 +1528,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1528,7 +1528,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -1662,7 +1662,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1662,7 +1662,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -2849,7 +2849,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2849,7 +2849,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (gs == 0) gs = 1; if (gs == 0) gs = 1;
n /= gs; n /= gs;
ls = %(ls)s; ls = %(ls)s;
err = GpuKernel_call(&%(k_var)s, 1, &ls, &gs, 0, args); err = GpuKernel_call(&%(k_var)s, 1, &gs, &ls, 0, args);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: GpuCAReduceCPY: %%s.", "gpuarray error: GpuCAReduceCPY: %%s.",
......
...@@ -396,7 +396,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -396,7 +396,7 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &offsetZ, (void*) &offsetZ,
(void*) deviceBlockSum->ga.data (void*) deviceBlockSum->ga.data
}; };
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params); int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed"); PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
return -1; return -1;
...@@ -421,7 +421,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -421,7 +421,7 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &offsetY, (void*) &offsetY,
(void*) &offsetZ (void*) &offsetZ
}; };
int err = GpuKernel_call(&k_finalCumOp_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params); int err = GpuKernel_call(&k_finalCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed"); PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
return -1; return -1;
...@@ -446,7 +446,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -446,7 +446,7 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &(tmp0), (void*) &(tmp0),
(void*) &(tmp1) (void*) &(tmp1)
}; };
int err = GpuKernel_call(&k_cumadd_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params); int err = GpuKernel_call(&k_cumadd_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "cumadd call failed"); PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
return -1; return -1;
......
...@@ -224,7 +224,7 @@ int gemm16(PyGpuArrayObject *C, float alpha, ...@@ -224,7 +224,7 @@ int gemm16(PyGpuArrayObject *C, float alpha,
params[11] = &beta; params[11] = &beta;
params[12] = &flags; params[12] = &flags;
if (GpuKernel_call(gk, 2, threads, grid, 0, params) != GA_NO_ERROR) { if (GpuKernel_call(gk, 2, grid, threads, 0, params) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call"); PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call");
res = 1; res = 1;
} }
......
...@@ -205,7 +205,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -205,7 +205,7 @@ KERNEL void k_multi_warp_multinomial(
args[8] = (void*)&strides[3]; args[8] = (void*)&strides[3];
args[9] = (void*)&strides[4]; args[9] = (void*)&strides[4];
err = GpuKernel_call(&%(kname)s, 1, &nb_threads, &nb_blocks, 0, args); err = GpuKernel_call(&%(kname)s, 1, &nb_blocks, &nb_threads, 0, args);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
...@@ -455,7 +455,7 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -455,7 +455,7 @@ KERNEL void k_multi_warp_multinomial_wor(
nb_blocks2[0] = nb_blocks; nb_blocks2[0] = nb_blocks;
nb_blocks2[1] = 1; nb_blocks2[1] = 1;
err = GpuKernel_call(&%(kname)s, 2, nb_threads2, nb_blocks2, 0, args); err = GpuKernel_call(&%(kname)s, 2, nb_blocks2, nb_threads2, 0, args);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
......
...@@ -470,7 +470,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -470,7 +470,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
(void *)&stride_Z1, (void *)&stride_Z1,
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset}; (void *)&%(z)s->ga.offset};
err = GpuKernel_call(fptr, 3, threads_per_block, n_blocks, 0, kernel_params); err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} // END NESTED SCOPE } // END NESTED SCOPE
......
...@@ -411,7 +411,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -411,7 +411,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
(void *)&stride_YIDX0, (void *)&stride_YIDX0,
(void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset, (void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
(void *)&stride_DX0, (void *)&stride_DX1}; (void *)&stride_DX0, (void *)&stride_DX1};
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -587,13 +587,13 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -587,13 +587,13 @@ class GpuSoftmax(GpuKernelBase, Op):
//TODO: read the information from the card. //TODO: read the information from the card.
if(shmem_sz < (32 * 1024 - 500)){ if(shmem_sz < (32 * 1024 - 500)){
err = GpuKernel_call(&kSoftmax_%(nodename)s, 3, err = GpuKernel_call(&kSoftmax_%(nodename)s, 3,
threads_per_block, n_blocks, shmem_sz, n_blocks, threads_per_block, shmem_sz,
kernel_params); kernel_params);
fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s"; fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmax_%(nodename)s, err); msg = GpuKernel_error(&kSoftmax_%(nodename)s, err);
}else{ }else{
err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3, err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3,
threads_per_block, n_blocks, n_blocks, threads_per_block,
threads_per_block[0] * sizeof(npy_%(work_x)s), threads_per_block[0] * sizeof(npy_%(work_x)s),
kernel_params); kernel_params);
fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s"; fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s";
...@@ -801,13 +801,13 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -801,13 +801,13 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
{ {
if(shmem_sz < (32 * 1024 - 500)){ if(shmem_sz < (32 * 1024 - 500)){
err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3, err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3,
threads_per_block, n_blocks, shmem_sz, n_blocks, threads_per_block, shmem_sz,
kernel_params); kernel_params);
fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s"; fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err); msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err);
}else{ }else{
err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s, err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s,
3, threads_per_block, n_blocks, 3, n_blocks, threads_per_block,
threads_per_block[0] * sizeof(npy_%(work_x)s), threads_per_block[0] * sizeof(npy_%(work_x)s),
kernel_params); kernel_params);
fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s"; fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s";
......
...@@ -1055,7 +1055,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1055,7 +1055,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
(void *)&indices_arr->ga.offset, (void *)&indices_arr->ga.offset,
(void *)&set_instead_of_inc, (void *)&set_instead_of_inc,
(void *)errbuf}; (void *)errbuf};
err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params); err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.", "gpuarray error: %(k_var)s: %%s.",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论