Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
d76b9684
提交
d76b9684
authored
1月 20, 2014
作者:
Frederic
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Convert some of the old GpuCAReduce code to the new interface.
上级
a8c03627
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
104 行增加
和
92 行删除
+104
-92
elemwise.py
theano/sandbox/gpuarray/elemwise.py
+100
-92
test_elemwise.py
theano/sandbox/gpuarray/tests/test_elemwise.py
+4
-0
没有找到文件。
theano/sandbox/gpuarray/elemwise.py
浏览文件 @
d76b9684
...
@@ -658,10 +658,10 @@ class GpuCAReduce(GpuOp):
...
@@ -658,10 +658,10 @@ class GpuCAReduce(GpuOp):
#check input
#check input
print
>>
sio
,
"""
print
>>
sio
,
"""
if (
%(x)
s->nd
!=
%(nd_in)
s)
if (
PyGpuArray_NDIM(
%(x)
s)
!=
%(nd_in)
s)
{
{
PyErr_Format(PyExc_TypeError,
PyErr_Format(PyExc_TypeError,
"required nd=
%(nd_in)
s, got nd=
%%
i",
%(x)
s->nd
);
"required nd=
%(nd_in)
s, got nd=
%%
i",
PyGpuArray_NDIM(
%(x)
s)
);
%(fail)
s;
%(fail)
s;
}
}
"""
%
locals
()
"""
%
locals
()
...
@@ -690,7 +690,7 @@ class GpuCAReduce(GpuOp):
...
@@ -690,7 +690,7 @@ class GpuCAReduce(GpuOp):
# check the basics of out output
# check the basics of out output
print
>>
sio
,
"""
print
>>
sio
,
"""
if ( !
%(z)
s
if ( !
%(z)
s
|| (
%(z)
s->nd
!=
%(nd_out)
s)
|| (
PyGpuArray_NDIM(
%(z)
s)
!=
%(nd_out)
s)
"""
%
locals
()
"""
%
locals
()
#ensure that the output has the right non-reduced dimensions
#ensure that the output has the right non-reduced dimensions
...
@@ -705,19 +705,22 @@ class GpuCAReduce(GpuOp):
...
@@ -705,19 +705,22 @@ class GpuCAReduce(GpuOp):
{
{
"""
%
locals
()
"""
%
locals
()
if
nd_out
>
0
:
if
nd_out
>
0
:
print
>>
sio
,
"
in
t new_dims[
%(nd_out)
s]; "
%
locals
()
print
>>
sio
,
"
size_
t new_dims[
%(nd_out)
s]; "
%
locals
()
else
:
else
:
print
>>
sio
,
"
in
t *new_dims=NULL; "
print
>>
sio
,
"
size_
t *new_dims=NULL; "
j
=
0
j
=
0
for
i
in
xrange
(
nd_in
):
for
i
in
xrange
(
nd_in
):
if
not
self
.
reduce_mask
[
i
]:
if
not
self
.
reduce_mask
[
i
]:
print
>>
sio
,
'new_dims[
%(j)
s] = PyGpuArray_DIMS(
%(x)
s)[
%(i)
s];'
%
locals
()
print
>>
sio
,
'new_dims[
%(j)
s] = PyGpuArray_DIMS(
%(x)
s)[
%(i)
s];'
%
locals
()
j
+=
1
j
+=
1
out_typecode
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
print
>>
sio
,
"""
print
>>
sio
,
"""
Py_XDECREF(
%(z)
s);
Py_XDECREF(
%(z)
s);
%(z)
s = (CudaNdarray*) CudaNdarray_NewDims(
%(nd_out)
s, new_dims);
%(z)
s = pygpu_empty(
%(nd_out)
s, new_dims,
%(out_typecode)
s, GA_C_ORDER,
pygpu_default_context(),
Py_None);
if (NULL ==
%(z)
s)
if (NULL ==
%(z)
s)
{
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
...
@@ -729,7 +732,7 @@ class GpuCAReduce(GpuOp):
...
@@ -729,7 +732,7 @@ class GpuCAReduce(GpuOp):
# \begin bracket the reduction in a check that there is
# \begin bracket the reduction in a check that there is
# actually work to do
# actually work to do
if
getattr
(
self
.
scalar_op
,
'identity'
,
None
)
==
0
:
if
getattr
(
self
.
scalar_op
,
'identity'
,
None
)
==
0
:
zero_shp
=
"cudaMemset(
%(z)
s->devdata, 0, CudaNda
rray_SIZE(
%(z)
s) * sizeof(float))"
%
locals
()
zero_shp
=
"cudaMemset(
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset), 0, PyGpuA
rray_SIZE(
%(z)
s) * sizeof(float))"
%
locals
()
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else
:
else
:
zero_shp
=
"""
zero_shp
=
"""
...
@@ -738,10 +741,10 @@ class GpuCAReduce(GpuOp):
...
@@ -738,10 +741,10 @@ class GpuCAReduce(GpuOp):
%(fail)
s;
%(fail)
s;
"""
%
locals
()
"""
%
locals
()
print
>>
sio
,
"""
print
>>
sio
,
"""
if (
CudaNdarray_SIZE(
%(z)
s) && ! CudaNda
rray_SIZE(
%(x)
s)){
if (
PyGpuArray_SIZE(
%(z)
s) && ! PyGpuA
rray_SIZE(
%(x)
s)){
%(zero_shp)
s;
%(zero_shp)
s;
}
}
else if (
CudaNda
rray_SIZE(
%(z)
s))
else if (
PyGpuA
rray_SIZE(
%(z)
s))
{
{
"""
%
locals
()
"""
%
locals
()
...
@@ -753,7 +756,7 @@ class GpuCAReduce(GpuOp):
...
@@ -753,7 +756,7 @@ class GpuCAReduce(GpuOp):
#check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
#check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
#TODO: check if we are ccontiguous when we un-dimshuffle
#TODO: check if we are ccontiguous when we un-dimshuffle
#TODO: if only some dims are ccontiguous, call version with less dims.
#TODO: if only some dims are ccontiguous, call version with less dims.
print
>>
sio
,
'if(
CudaNdarray_is_c_contiguous(
%(x)
s)
){'
%
locals
()
print
>>
sio
,
'if(
%(x)
s->ga.flags & GA_C_CONTIGUOUS
){'
%
locals
()
self
.
c_code_reduce_ccontig
(
sio
,
node
,
name
,
x
,
z
,
fail
)
self
.
c_code_reduce_ccontig
(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
>>
sio
,
"}else{"
print
>>
sio
,
"}else{"
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
...
@@ -784,11 +787,11 @@ class GpuCAReduce(GpuOp):
...
@@ -784,11 +787,11 @@ class GpuCAReduce(GpuOp):
n_shared>>>(
n_shared>>>(
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[1],
PyGpuArray_DIMS(
%(x)
s)[1],
CudaNdarray_DEV_DATA(
%(x)
s
),
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset
),
CudaNdarray_HOST_STRIDES(
%(x)
s)[0]
,
PyGpuArray_STRIDES(
%(x)
s)[0]/4
,
CudaNdarray_HOST_STRIDES(
%(x)
s)[1]
,
PyGpuArray_STRIDES(
%(x)
s)[1]/4
,
CudaNdarray_DEV_DATA(
%(z)
s
),
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset
),
CudaNdarray_HOST_STRIDES(
%(z)
s)[0]
PyGpuArray_STRIDES(
%(z)
s)[0]/4
);
);
CNDA_THREAD_SYNC;
CNDA_THREAD_SYNC;
if (cudaSuccess != cudaGetLastError())
if (cudaSuccess != cudaGetLastError())
...
@@ -802,8 +805,8 @@ class GpuCAReduce(GpuOp):
...
@@ -802,8 +805,8 @@ class GpuCAReduce(GpuOp):
pattern
=
''
.
join
(
str
(
c
)
for
c
in
self
.
reduce_mask
)
pattern
=
''
.
join
(
str
(
c
)
for
c
in
self
.
reduce_mask
)
ndim
=
len
(
self
.
reduce_mask
)
ndim
=
len
(
self
.
reduce_mask
)
nd_out
=
ndim
-
sum
(
self
.
reduce_mask
)
nd_out
=
ndim
-
sum
(
self
.
reduce_mask
)
shapes_format
=
"shape=(
%
s)"
%
","
.
join
([
"
%
d
"
]
*
node
.
inputs
[
0
]
.
ndim
)
shapes_format
=
"shape=(
%
s)"
%
","
.
join
([
"
%
llu
"
]
*
node
.
inputs
[
0
]
.
ndim
)
shapes_data
=
","
.
join
([
"PyGpuArray_DIMS(
%
s)[
%
d]"
%
(
x
,
i
)
shapes_data
=
","
.
join
([
"
(unsigned long long)
PyGpuArray_DIMS(
%
s)[
%
d]"
%
(
x
,
i
)
for
i
in
range
(
node
.
inputs
[
0
]
.
ndim
)])
for
i
in
range
(
node
.
inputs
[
0
]
.
ndim
)])
print
>>
sio
,
"""
print
>>
sio
,
"""
...
@@ -825,18 +828,18 @@ class GpuCAReduce(GpuOp):
...
@@ -825,18 +828,18 @@ class GpuCAReduce(GpuOp):
PyGpuArray_DIMS(
%(x)
s)[
%(i)
s],
PyGpuArray_DIMS(
%(x)
s)[
%(i)
s],
"""
%
locals
()
"""
%
locals
()
print
>>
sio
,
"""
print
>>
sio
,
"""
CudaNdarray_DEV_DATA(
%(x)
s
)
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset
)
"""
%
locals
()
"""
%
locals
()
for
i
in
xrange
(
ndim
):
for
i
in
xrange
(
ndim
):
print
>>
sio
,
"""
print
>>
sio
,
"""
,
CudaNdarray_HOST_STRIDES(
%(x)
s)[
%(i)
s]
,
PyGpuArray_STRIDES(
%(x)
s)[
%(i)
s]/4
"""
%
locals
()
"""
%
locals
()
print
>>
sio
,
"""
print
>>
sio
,
"""
,
CudaNdarray_DEV_DATA(
%(z)
s
)
,
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset
)
"""
%
locals
()
"""
%
locals
()
for
i
in
xrange
(
nd_out
):
for
i
in
xrange
(
nd_out
):
print
>>
sio
,
"""
print
>>
sio
,
"""
,
CudaNdarray_HOST_STRIDES(
%(z)
s)[
%(i)
s]
,
PyGpuArray_STRIDES(
%(z)
s)[
%(i)
s]/4
"""
%
locals
()
"""
%
locals
()
print
>>
sio
,
"""
print
>>
sio
,
"""
...
@@ -966,8 +969,8 @@ class GpuCAReduce(GpuOp):
...
@@ -966,8 +969,8 @@ class GpuCAReduce(GpuOp):
dtype
=
x
.
dtype
dtype
=
x
.
dtype
dummy_left
=
scal
.
Scalar
(
dtype
=
dtype
)()
dummy_left
=
Scalar
(
dtype
=
dtype
)()
dummy_right
=
scal
.
Scalar
(
dtype
=
dtype
)()
dummy_right
=
Scalar
(
dtype
=
dtype
)()
dummy_node
=
self
.
scalar_op
.
make_node
(
dummy_left
,
dummy_right
)
dummy_node
=
self
.
scalar_op
.
make_node
(
dummy_left
,
dummy_right
)
...
@@ -1123,7 +1126,7 @@ class GpuCAReduce(GpuOp):
...
@@ -1123,7 +1126,7 @@ class GpuCAReduce(GpuOp):
C contiguous.
C contiguous.
"""
"""
if
getattr
(
self
.
scalar_op
,
'identity'
,
None
)
==
0
:
if
getattr
(
self
.
scalar_op
,
'identity'
,
None
)
==
0
:
zero_shp
=
"cudaMemset(
%(z)
s->devdata, 0, CudaNda
rray_SIZE(
%(z)
s) * sizeof(float))"
%
locals
()
zero_shp
=
"cudaMemset(
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset), 0, PyGpuA
rray_SIZE(
%(z)
s) * sizeof(float))"
%
locals
()
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else
:
else
:
zero_shp
=
"""
zero_shp
=
"""
...
@@ -1132,25 +1135,29 @@ class GpuCAReduce(GpuOp):
...
@@ -1132,25 +1135,29 @@ class GpuCAReduce(GpuOp):
%(fail)
s;
%(fail)
s;
"""
%
locals
()
"""
%
locals
()
sync
=
""
if
config
.
gpuarray
.
sync
:
sync
=
"""GpuArray_sync(&
%(z)
s->ga);"""
%
locals
()
print
>>
sio
,
"""
print
>>
sio
,
"""
{
{
if(
CudaNda
rray_SIZE(
%(x)
s)==0){
if(
PyGpuA
rray_SIZE(
%(x)
s)==0){
%(zero_shp)
s;
%(zero_shp)
s;
}else{
}else{
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(
CudaNda
rray_SIZE(
%(x)
s),
std::min(
PyGpuA
rray_SIZE(
%(x)
s),
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
dim3 n_blocks(1);
dim3 n_blocks(1);
if (verbose) printf("running kernel_reduce_ccontig_
%(name)
s"
if (verbose) printf("running kernel_reduce_ccontig_
%(name)
s"
" n_threads.x=
%%
d, size=
%%
d, ndim=
%%
d
\\
n",
" n_threads.x=
%%
d, size=
%%
d, ndim=
%%
d
\\
n",
n_threads.x,CudaNdarray_SIZE(
%(x)
s),
%(x)
s->nd);
n_threads.x,PyGpuArray_SIZE(
%(x)
s),
PyGpuArray_NDIM(
%(x)
s));
int n_shared = sizeof(float) * n_threads.x;
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_ccontig_
%(name)
s<<<n_blocks, n_threads, n_shared>>>(
kernel_reduce_ccontig_
%(name)
s<<<n_blocks, n_threads, n_shared>>>(
CudaNda
rray_SIZE(
%(x)
s),
PyGpuA
rray_SIZE(
%(x)
s),
CudaNdarray_DEV_DATA(
%(x)
s
),
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset
),
CudaNdarray_DEV_DATA(
%(z)
s
));
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset
));
CNDA_THREAD_SYNC;
%(sync)
s
cudaError_t sts = cudaGetLastError();
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
if (cudaSuccess != sts)
{
{
...
@@ -1177,7 +1184,7 @@ class GpuCAReduce(GpuOp):
...
@@ -1177,7 +1184,7 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
dim3 n_blocks(1);
dim3 n_blocks(1);
%(makecall)
s
%(makecall)
s
}
}
...
@@ -1190,8 +1197,8 @@ class GpuCAReduce(GpuOp):
...
@@ -1190,8 +1197,8 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[1],
std::min(PyGpuArray_DIMS(
%(x)
s)[1],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
while (n_threads.y * n_threads.x <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
) ++n_threads.y;
while (n_threads.y * n_threads.x <=
256
) ++n_threads.y;
n_threads.y -= 1;
n_threads.y -= 1;
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[0])
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[0])
n_threads.y = PyGpuArray_DIMS(
%(x)
s)[0];
n_threads.y = PyGpuArray_DIMS(
%(x)
s)[0];
...
@@ -1212,12 +1219,12 @@ class GpuCAReduce(GpuOp):
...
@@ -1212,12 +1219,12 @@ class GpuCAReduce(GpuOp):
N_pattern
=
''
.
join
([
'1'
]
*
N
)
N_pattern
=
''
.
join
([
'1'
]
*
N
)
param_dim
=
","
.
join
([
"PyGpuArray_DIMS(
%
s)[
%
d]"
%
(
x
,
i
)
param_dim
=
","
.
join
([
"PyGpuArray_DIMS(
%
s)[
%
d]"
%
(
x
,
i
)
for
i
in
xrange
(
N
+
1
)])
for
i
in
xrange
(
N
+
1
)])
strides_dim
=
","
.
join
([
"
CudaNdarray_HOST_STRIDES(
%
s)[
%
d]
"
strides_dim
=
","
.
join
([
"
PyGpuArray_STRIDES(
%
s)[
%
d]/4
"
%
(
x
,
i
)
for
i
in
xrange
(
N
+
1
)])
%
(
x
,
i
)
for
i
in
xrange
(
N
+
1
)])
threads_y
=
"""
threads_y
=
"""
//get as many y threads as we can fit
//get as many y threads as we can fit
while (n_threads.x * (n_threads.y+1) <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
)
while (n_threads.x * (n_threads.y+1) <=
256
)
{
{
if (n_threads.y < PyGpuArray_DIMS(
%(x)
s)[
%(N)
s-1])
if (n_threads.y < PyGpuArray_DIMS(
%(x)
s)[
%(N)
s-1])
n_threads.y += 1;
n_threads.y += 1;
...
@@ -1227,7 +1234,7 @@ class GpuCAReduce(GpuOp):
...
@@ -1227,7 +1234,7 @@ class GpuCAReduce(GpuOp):
threads_z
=
"""
threads_z
=
"""
//get as many z threads as we can fit
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * (n_threads.z+1) <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
)
while (n_threads.x * n_threads.y * (n_threads.z+1) <=
256
)
{
{
if (n_threads.z < PyGpuArray_DIMS(
%(x)
s)[
%(N)
s-2])
if (n_threads.z < PyGpuArray_DIMS(
%(x)
s)[
%(N)
s-2])
n_threads.z += 1;
n_threads.z += 1;
...
@@ -1247,11 +1254,11 @@ class GpuCAReduce(GpuOp):
...
@@ -1247,11 +1254,11 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[
%(N)
s],
std::min(PyGpuArray_DIMS(
%(x)
s)[
%(N)
s],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
%(threads_y)
s
%(threads_y)
s
%(threads_z)
s
%(threads_z)
s
dim3 n_blocks(std::min(PyGpuArray_DIMS(
%(x)
s)[0],
dim3 n_blocks(std::min(PyGpuArray_DIMS(
%(x)
s)[0],
NUM_VECTOR_OP_BLOCKS
));
(size_t) 4096
));
%(makecall)
s
%(makecall)
s
}
}
"""
%
locals
()
"""
%
locals
()
...
@@ -1271,10 +1278,10 @@ class GpuCAReduce(GpuOp):
...
@@ -1271,10 +1278,10 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
dim3 n_blocks(1,
dim3 n_blocks(1,
std::min(PyGpuArray_DIMS(
%(x)
s)[1],
std::min(PyGpuArray_DIMS(
%(x)
s)[1],
NUM_VECTOR_OP_BLOCKS
));
(size_t) 4096
));
if (verbose) {
if (verbose) {
fprintf(stderr,
fprintf(stderr,
"running kernel_reduce_10_
%(name)
s n_blocks=(
%%
i,
%%
i)
\\
n",
"running kernel_reduce_10_
%(name)
s n_blocks=(
%%
i,
%%
i)
\\
n",
...
@@ -1287,13 +1294,13 @@ class GpuCAReduce(GpuOp):
...
@@ -1287,13 +1294,13 @@ class GpuCAReduce(GpuOp):
1,
1,
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[1],
PyGpuArray_DIMS(
%(x)
s)[1],
CudaNdarray_DEV_DATA(
%(x)
s
),
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset
),
1,
1,
CudaNdarray_HOST_STRIDES(
%(x)
s)[0]
,
PyGpuArray_STRIDES(
%(x)
s)[0]/4
,
CudaNdarray_HOST_STRIDES(
%(x)
s)[1]
,
PyGpuArray_STRIDES(
%(x)
s)[1]/4
,
CudaNdarray_DEV_DATA(
%(z)
s
),
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset
),
1,
1,
CudaNdarray_HOST_STRIDES(
%(z)
s)[0]
PyGpuArray_STRIDES(
%(z)
s)[0]/4
);
);
CNDA_THREAD_SYNC;
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
cudaError_t sts = cudaGetLastError();
...
@@ -1350,18 +1357,18 @@ class GpuCAReduce(GpuOp):
...
@@ -1350,18 +1357,18 @@ class GpuCAReduce(GpuOp):
// The gridsize would ideally be (A, D). But we do the following logic to make
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
dim3 n_blocks(A,D);
if (n_blocks.x >
NUM_VECTOR_OP_BLOCKS) n_blocks.x = NUM_VECTOR_OP_BLOCKS
;
if (n_blocks.x >
4096) n_blocks.x = 4096
;
if (n_blocks.x*n_blocks.y >
NUM_VECTOR_OP_BLOCKS) n_blocks.y = NUM_VECTOR_OP_BLOCKS
/n_blocks.x;
if (n_blocks.x*n_blocks.y >
4096) n_blocks.y = 4096
/n_blocks.x;
int n_shared = 0;
int n_shared = 0;
kernel_reduce_010_AD_
%(name)
s<<<n_blocks, n_threads, n_shared>>>(
kernel_reduce_010_AD_
%(name)
s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
A,B,C,D,
CudaNdarray_DEV_DATA(
%(x)
s
),
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset
),
CudaNdarray_HOST_STRIDES(
%(x)
s)[0]
,
PyGpuArray_STRIDES(
%(x)
s)[0]/4
,
CudaNdarray_HOST_STRIDES(
%(x)
s)[1]
,
PyGpuArray_STRIDES(
%(x)
s)[1]/4
,
CudaNdarray_HOST_STRIDES(
%(x)
s)[2]
,
PyGpuArray_STRIDES(
%(x)
s)[2]/4
,
CudaNdarray_DEV_DATA(
%(z)
s
),
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset
),
CudaNdarray_HOST_STRIDES(
%(z)
s)[0]
,
PyGpuArray_STRIDES(
%(z)
s)[0]/4
,
CudaNdarray_HOST_STRIDES(
%(z)
s)[1]
PyGpuArray_STRIDES(
%(z)
s)[1]/4
);
);
CNDA_THREAD_SYNC;
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
cudaError_t sts = cudaGetLastError();
...
@@ -1385,36 +1392,36 @@ class GpuCAReduce(GpuOp):
...
@@ -1385,36 +1392,36 @@ class GpuCAReduce(GpuOp):
int verbose = 2;
int verbose = 2;
dim3 n_threads(std::min(32,PyGpuArray_DIMS(
%(x)
s)[2]));
dim3 n_threads(std::min(32,PyGpuArray_DIMS(
%(x)
s)[2]));
while( (n_threads.x*(n_threads.y+1)<=
NUM_VECTOR_OP_THREADS_PER_BLOCK
)
while( (n_threads.x*(n_threads.y+1)<=
256
)
&& (n_threads.y<PyGpuArray_DIMS(
%(x)
s)[1])){
&& (n_threads.y<PyGpuArray_DIMS(
%(x)
s)[1])){
n_threads.y++;
n_threads.y++;
}
}
dim3 n_blocks(std::min(PyGpuArray_DIMS(
%(x)
s)[0],
dim3 n_blocks(std::min(PyGpuArray_DIMS(
%(x)
s)[0],
(int)
NUM_VECTOR_OP_BLOCKS
));
(int)
4096
));
n_blocks.y = std::min(
n_blocks.y = std::min(
ceil_intdiv(PyGpuArray_DIMS(
%(x)
s)[2],(int)n_threads.x),
ceil_intdiv(PyGpuArray_DIMS(
%(x)
s)[2],(int)n_threads.x),
(int)(
NUM_VECTOR_OP_BLOCKS
/ n_blocks.x)
(int)(
4096
/ n_blocks.x)
);
);
if(std::min(std::min(
CudaNdarray_HOST_STRIDES(
%(x)
s)[0]
,
if(std::min(std::min(
PyGpuArray_STRIDES(
%(x)
s)[0]/4
,
CudaNdarray_HOST_STRIDES(
%(x)
s)[1]
),
PyGpuArray_STRIDES(
%(x)
s)[1]/4
),
CudaNdarray_HOST_STRIDES(
%(x)
s)[2]
)
PyGpuArray_STRIDES(
%(x)
s)[2]/4
)
==
CudaNdarray_HOST_STRIDES(
%(x)
s)[2]
==
PyGpuArray_STRIDES(
%(x)
s)[2]/4
&& n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(
%(x)
s)[2],(int)n_threads.x)){
&& n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(
%(x)
s)[2],(int)n_threads.x)){
if(verbose>1)
if(verbose>1)
printf("n_block.x.1=
%%
d, n_block.x.2=
%%
d, n_block.y.1=
%%
d, n_block.y.2=
%%
d,
\\
n",
printf("n_block.x.1=
%%
d, n_block.x.2=
%%
d, n_block.y.1=
%%
d, n_block.y.2=
%%
d,
\\
n",
PyGpuArray_DIMS(
%(x)
s)[0],
NUM_VECTOR_OP_BLOCKS
,
PyGpuArray_DIMS(
%(x)
s)[0],
4096
,
ceil_intdiv(PyGpuArray_DIMS(
%(x)
s)[2],(int)n_threads.x),
ceil_intdiv(PyGpuArray_DIMS(
%(x)
s)[2],(int)n_threads.x),
(int)(
NUM_VECTOR_OP_BLOCKS
/ n_blocks.x));
(int)(
4096
/ n_blocks.x));
assert(n_threads.x<=32);
assert(n_threads.x<=32);
%(makecall_inner)
s
%(makecall_inner)
s
}else{
}else{
n_threads.x = std::min(PyGpuArray_DIMS(
%(x)
s)[1],
n_threads.x = std::min(PyGpuArray_DIMS(
%(x)
s)[1],
(int)NUM_VECTOR_OP_THREADS_PER_BLOCK
);
(size_t) 256
);
n_blocks.x = std::min(PyGpuArray_DIMS(
%(x)
s)[0], (int)
NUM_VECTOR_OP_BLOCKS
);
n_blocks.x = std::min(PyGpuArray_DIMS(
%(x)
s)[0], (int)
4096
);
n_blocks.y = std::min(
n_blocks.y = std::min(
PyGpuArray_DIMS(
%(x)
s)[2],
PyGpuArray_DIMS(
%(x)
s)[2],
(int)(
NUM_VECTOR_OP_BLOCKS
/ n_blocks.x)
(int)(
4096
/ n_blocks.x)
);
);
%(makecall)
s
%(makecall)
s
}
}
...
@@ -1443,8 +1450,8 @@ class GpuCAReduce(GpuOp):
...
@@ -1443,8 +1450,8 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[3],
std::min(PyGpuArray_DIMS(
%(x)
s)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
while (n_threads.x * n_threads.y <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
)
while (n_threads.x * n_threads.y <=
256
)
{
{
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[1]) break;
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[1]) break;
n_threads.y += 1;
n_threads.y += 1;
...
@@ -1465,9 +1472,9 @@ class GpuCAReduce(GpuOp):
...
@@ -1465,9 +1472,9 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
dim3 n_blocks(std::min(PyGpuArray_DIMS(
%(x)
s)[1],
NUM_VECTOR_OP_BLOCKS
));
dim3 n_blocks(std::min(PyGpuArray_DIMS(
%(x)
s)[1],
4096
));
while (n_blocks.x * (n_blocks.y+1) <=
NUM_VECTOR_OP_BLOCKS
&& n_blocks.y <= PyGpuArray_DIMS(
%(x)
s)[2])
while (n_blocks.x * (n_blocks.y+1) <=
4096
&& n_blocks.y <= PyGpuArray_DIMS(
%(x)
s)[2])
{
{
n_blocks.y += 1;
n_blocks.y += 1;
}
}
...
@@ -1482,8 +1489,8 @@ class GpuCAReduce(GpuOp):
...
@@ -1482,8 +1489,8 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[1],
std::min(PyGpuArray_DIMS(
%(x)
s)[1],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
while (n_threads.x*n_threads.y <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
)
while (n_threads.x*n_threads.y <=
256
)
{
{
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[0])
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[0])
break;
break;
...
@@ -1503,11 +1510,11 @@ class GpuCAReduce(GpuOp):
...
@@ -1503,11 +1510,11 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[2],
std::min(PyGpuArray_DIMS(
%(x)
s)[2],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
dim3 n_blocks(
dim3 n_blocks(
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
NUM_VECTOR_OP_BLOCKS
));
4096
));
while (n_blocks.x * n_blocks.y <=
NUM_VECTOR_OP_BLOCKS
)
while (n_blocks.x * n_blocks.y <=
4096
)
{
{
if (n_blocks.y > PyGpuArray_DIMS(
%(x)
s)[1])
if (n_blocks.y > PyGpuArray_DIMS(
%(x)
s)[1])
break;
break;
...
@@ -1525,10 +1532,10 @@ class GpuCAReduce(GpuOp):
...
@@ -1525,10 +1532,10 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[2],
std::min(PyGpuArray_DIMS(
%(x)
s)[2],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
//get as many y threads as we can fit
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
)
while (n_threads.x * n_threads.y <=
256
)
{
{
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[1])
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[1])
break;
break;
...
@@ -1537,7 +1544,7 @@ class GpuCAReduce(GpuOp):
...
@@ -1537,7 +1544,7 @@ class GpuCAReduce(GpuOp):
n_threads.y -= 1;
n_threads.y -= 1;
//get as many z threads as we can fit
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
)
while (n_threads.x * n_threads.y * n_threads.z <=
256
)
{
{
if (n_threads.z > PyGpuArray_DIMS(
%(x)
s)[0])
if (n_threads.z > PyGpuArray_DIMS(
%(x)
s)[0])
break;
break;
...
@@ -1558,9 +1565,9 @@ class GpuCAReduce(GpuOp):
...
@@ -1558,9 +1565,9 @@ class GpuCAReduce(GpuOp):
dim3 n_blocks(
dim3 n_blocks(
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
NUM_VECTOR_OP_BLOCKS
));
4096
));
while (n_blocks.x * n_blocks.y <=
NUM_VECTOR_OP_BLOCKS
&&
while (n_blocks.x * n_blocks.y <=
4096
&&
n_blocks.y < PyGpuArray_DIMS(
%(x)
s)[1])
n_blocks.y < PyGpuArray_DIMS(
%(x)
s)[1])
{
{
n_blocks.y += 1;
n_blocks.y += 1;
...
@@ -1568,8 +1575,8 @@ class GpuCAReduce(GpuOp):
...
@@ -1568,8 +1575,8 @@ class GpuCAReduce(GpuOp):
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[3],
std::min(PyGpuArray_DIMS(
%(x)
s)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
while (n_threads.x * n_threads.y <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
while (n_threads.x * n_threads.y <=
256
&& n_threads.y < PyGpuArray_DIMS(
%(x)
s)[2]
&& n_threads.y < PyGpuArray_DIMS(
%(x)
s)[2]
&& n_threads.x * n_threads.y * sizeof(float) <=(15*1024-200))
&& n_threads.x * n_threads.y * sizeof(float) <=(15*1024-200))
{
{
...
@@ -1587,10 +1594,10 @@ class GpuCAReduce(GpuOp):
...
@@ -1587,10 +1594,10 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[2],
std::min(PyGpuArray_DIMS(
%(x)
s)[2],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
//get as many y threads as we can fit
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
)
while (n_threads.x * n_threads.y <=
256
)
{
{
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[1])
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[1])
break;
break;
...
@@ -1599,7 +1606,7 @@ class GpuCAReduce(GpuOp):
...
@@ -1599,7 +1606,7 @@ class GpuCAReduce(GpuOp):
n_threads.y -= 1;
n_threads.y -= 1;
//get as many z threads as we can fit
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
)
while (n_threads.x * n_threads.y * n_threads.z <=
256
)
{
{
if (n_threads.z > PyGpuArray_DIMS(
%(x)
s)[0])
if (n_threads.z > PyGpuArray_DIMS(
%(x)
s)[0])
break;
break;
...
@@ -1622,13 +1629,13 @@ class GpuCAReduce(GpuOp):
...
@@ -1622,13 +1629,13 @@ class GpuCAReduce(GpuOp):
int verbose = 0;
int verbose = 0;
dim3 n_threads(
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[3],
std::min(PyGpuArray_DIMS(
%(x)
s)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK
));
(size_t) 256
));
while (n_threads.x * (n_threads.y+1) <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
) ++n_threads.y;
while (n_threads.x * (n_threads.y+1) <=
256
) ++n_threads.y;
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[2])
if (n_threads.y > PyGpuArray_DIMS(
%(x)
s)[2])
n_threads.y = PyGpuArray_DIMS(
%(x)
s)[2];
n_threads.y = PyGpuArray_DIMS(
%(x)
s)[2];
while (n_threads.x * n_threads.y * (n_threads.z+1) <=
NUM_VECTOR_OP_THREADS_PER_BLOCK
) ++n_threads.z;
while (n_threads.x * n_threads.y * (n_threads.z+1) <=
256
) ++n_threads.z;
if (n_threads.z > 64)
if (n_threads.z > 64)
n_threads.z = 64;
n_threads.z = 64;
if (n_threads.z > PyGpuArray_DIMS(
%(x)
s)[0])
if (n_threads.z > PyGpuArray_DIMS(
%(x)
s)[0])
...
@@ -2235,6 +2242,7 @@ class GpuCAReduce(GpuOp):
...
@@ -2235,6 +2242,7 @@ class GpuCAReduce(GpuOp):
%(reducebuf)
s
%(reducebuf)
s
}
}
"""
%
locals
()
"""
%
locals
()
print
>>
sio
,
"CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
return
sio
.
getvalue
()
return
sio
.
getvalue
()
...
...
theano/sandbox/gpuarray/tests/test_elemwise.py
浏览文件 @
d76b9684
...
@@ -66,6 +66,10 @@ class test_GpuCAReduceCPY(test_CAReduce):
...
@@ -66,6 +66,10 @@ class test_GpuCAReduceCPY(test_CAReduce):
self
.
with_linker
(
gof
.
CLinker
(),
op
,
dtype
=
dtype
,
self
.
with_linker
(
gof
.
CLinker
(),
op
,
dtype
=
dtype
,
test_nan
=
True
)
test_nan
=
True
)
def
test_infer_shape
(
self
):
for
dtype
in
self
.
dtypes
:
test_CAReduce
.
test_infer_shape
(
self
,
dtype
)
class
test_GpuCAReduce
(
test_GpuCAReduceCPY
):
class
test_GpuCAReduce
(
test_GpuCAReduceCPY
):
dtypes
=
[
"float32"
]
dtypes
=
[
"float32"
]
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论