提交 d5701e07 authored 作者: Pierre Luc Carrier's avatar Pierre Luc Carrier

Convert x->nCudaNdarray_HOST_DIMS to PyGpuArray_DIMS in ops GpuSoftmax and GpuSoftmaxWithBias.

上级 cc8b517b
...@@ -482,16 +482,16 @@ class GpuSoftmax (Op): ...@@ -482,16 +482,16 @@ class GpuSoftmax (Op):
%(fail)s; %(fail)s;
} }
if ((NULL == %(z)s) || if ((NULL == %(z)s) ||
(CudaNdarray_HOST_DIMS(%(z)s)[0] != (PyGpuArray_DIMS(%(z)s)[0] !=
CudaNdarray_HOST_DIMS(%(x)s)[0]) || PyGpuArray_DIMS(%(x)s)[0]) ||
(CudaNdarray_HOST_DIMS(%(z)s)[1] != (PyGpuArray_DIMS(%(z)s)[1] !=
CudaNdarray_HOST_DIMS(%(x)s)[1])) PyGpuArray_DIMS(%(x)s)[1]))
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*)CudaNdarray_New(); %(z)s = (CudaNdarray*)CudaNdarray_New();
if ((NULL == %(z)s) if ((NULL == %(z)s)
|| CudaNdarray_alloc_contiguous(%(z)s, 2, || CudaNdarray_alloc_contiguous(%(z)s, 2,
CudaNdarray_HOST_DIMS(%(x)s))) PyGpuArray_DIMS(%(x)s)))
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = NULL; %(z)s = NULL;
...@@ -499,14 +499,14 @@ class GpuSoftmax (Op): ...@@ -499,14 +499,14 @@ class GpuSoftmax (Op):
} }
} }
{ {
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0],
32 * 1024); 32 * 1024);
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512); int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], 512);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(dtype)s); 2 * sizeof(npy_%(dtype)s);
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
//Those numbers are based on not too recent GPU //Those numbers are based on not too recent GPU
//to make them compatible with more GPU. //to make them compatible with more GPU.
...@@ -518,8 +518,8 @@ class GpuSoftmax (Op): ...@@ -518,8 +518,8 @@ class GpuSoftmax (Op):
n_threads, n_threads,
n_shared_bytes n_shared_bytes
>>>( >>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
...@@ -536,8 +536,8 @@ class GpuSoftmax (Op): ...@@ -536,8 +536,8 @@ class GpuSoftmax (Op):
n_threads, n_threads,
n_threads * sizeof(npy_%(dtype)s) n_threads * sizeof(npy_%(dtype)s)
>>>( >>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
...@@ -658,27 +658,27 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -658,27 +658,27 @@ class GpuSoftmaxWithBias (GpuOp):
PyErr_SetString(PyExc_ValueError, "rank error for the bias"); PyErr_SetString(PyExc_ValueError, "rank error for the bias");
%(fail)s; %(fail)s;
} }
if ((CudaNdarray_HOST_DIMS(%(x)s)[1] != if ((PyGpuArray_DIMS(%(x)s)[1] !=
CudaNdarray_HOST_DIMS(%(b)s)[0])) PyGpuArray_DIMS(%(b)s)[0]))
{ {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"number of columns in x (%%ld)" "number of columns in x (%%ld)"
" does not match length of b (%%ld)", " does not match length of b (%%ld)",
(long int)CudaNdarray_HOST_DIMS(%(x)s)[1], (long int)PyGpuArray_DIMS(%(x)s)[1],
(long int)CudaNdarray_HOST_DIMS(%(b)s)[0]); (long int)PyGpuArray_DIMS(%(b)s)[0]);
%(fail)s; %(fail)s;
} }
if ((NULL == %(z)s) if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != || (PyGpuArray_DIMS(%(z)s)[0] !=
CudaNdarray_HOST_DIMS(%(x)s)[0]) PyGpuArray_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != || (PyGpuArray_DIMS(%(z)s)[1] !=
CudaNdarray_HOST_DIMS(%(x)s)[1])) PyGpuArray_DIMS(%(x)s)[1]))
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*)CudaNdarray_New(); %(z)s = (CudaNdarray*)CudaNdarray_New();
if ((NULL == %(z)s) if ((NULL == %(z)s)
|| CudaNdarray_alloc_contiguous(%(z)s, 2, || CudaNdarray_alloc_contiguous(%(z)s, 2,
CudaNdarray_HOST_DIMS(%(x)s))) PyGpuArray_DIMS(%(x)s)))
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = NULL; %(z)s = NULL;
...@@ -686,12 +686,12 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -686,12 +686,12 @@ class GpuSoftmaxWithBias (GpuOp):
} }
} }
{ {
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024); int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0],32*1024);
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512); int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], 512);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(dtype)s); 2 * sizeof(npy_%(dtype)s);
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
if(n_shared_bytes < (32 * 1024 - 500)){ if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmaxWithBias_%(nodename)s kSoftmaxWithBias_%(nodename)s
...@@ -700,8 +700,8 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -700,8 +700,8 @@ class GpuSoftmaxWithBias (GpuOp):
n_threads, n_threads,
n_shared_bytes n_shared_bytes
>>>( >>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
...@@ -721,8 +721,8 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -721,8 +721,8 @@ class GpuSoftmaxWithBias (GpuOp):
n_threads, n_threads,
n_threads * sizeof(npy_%(dtype)s) n_threads * sizeof(npy_%(dtype)s)
>>>( >>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论