Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
b80a7d12
提交
b80a7d12
authored
8月 08, 2017
作者:
Arnaud Bergeron
提交者:
Frederic Bastien
9月 03, 2017
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add cluda include to all kernels.
上级
28ffda75
显示空白字符变更
内嵌
并排
正在显示
19 个修改的文件
包含
101 行增加
和
44 行删除
+101
-44
basic_ops.py
theano/gpuarray/basic_ops.py
+9
-9
corr3d_gemm.c
theano/gpuarray/c_code/corr3d_gemm.c
+7
-1
corr_gemm.c
theano/gpuarray/c_code/corr_gemm.c
+5
-0
magma_cholesky.c
theano/gpuarray/c_code/magma_cholesky.c
+2
-0
magma_qr.c
theano/gpuarray/c_code/magma_qr.c
+1
-0
pool.c
theano/gpuarray/c_code/pool.c
+4
-0
pool_ave_grad.c
theano/gpuarray/c_code/pool_ave_grad.c
+2
-0
pool_grad_grad.c
theano/gpuarray/c_code/pool_grad_grad.c
+2
-0
pool_max_grad.c
theano/gpuarray/c_code/pool_max_grad.c
+2
-0
pool_max_rop.c
theano/gpuarray/c_code/pool_max_rop.c
+2
-0
elemwise.py
theano/gpuarray/elemwise.py
+31
-17
extra_ops.py
theano/gpuarray/extra_ops.py
+6
-3
kernel_codegen.py
theano/gpuarray/kernel_codegen.py
+3
-1
multinomial.py
theano/gpuarray/multinomial.py
+4
-2
neighbours.py
theano/gpuarray/neighbours.py
+4
-2
nnet.py
theano/gpuarray/nnet.py
+12
-6
rng_mrg.py
theano/gpuarray/rng_mrg.py
+2
-1
subtensor.py
theano/gpuarray/subtensor.py
+2
-2
tstgpueye.c
theano/gpuarray/tests/c_code/tstgpueye.c
+1
-0
没有找到文件。
theano/gpuarray/basic_ops.py
浏览文件 @
b80a7d12
...
@@ -158,7 +158,7 @@ class Kernel(object):
...
@@ -158,7 +158,7 @@ class Kernel(object):
the `params` list consists of C typecodes
the `params` list consists of C typecodes
It can also have the key `cflags` which is a string of C flag
It can also have the key `cflags` which is a string of C flag
values like this `"GA_USE_DOUBLE|GA_USE_
CLUDA
"`.
values like this `"GA_USE_DOUBLE|GA_USE_
SMALL
"`.
Parameters
Parameters
----------
----------
...
@@ -216,7 +216,7 @@ class Kernel(object):
...
@@ -216,7 +216,7 @@ class Kernel(object):
else
:
else
:
raise
TypeError
(
"can't get a dtype from
%
s"
%
(
type
(
t
),))
raise
TypeError
(
"can't get a dtype from
%
s"
%
(
type
(
t
),))
dtypes
=
[
get_dtype
(
t
)
for
t
in
types
]
dtypes
=
[
get_dtype
(
t
)
for
t
in
types
]
flags
=
dict
(
cluda
=
True
)
flags
=
dict
()
if
any
(
d
==
np
.
float64
for
d
in
dtypes
):
if
any
(
d
==
np
.
float64
for
d
in
dtypes
):
flags
[
'have_double'
]
=
True
flags
[
'have_double'
]
=
True
if
any
(
d
.
itemsize
<
4
for
d
in
dtypes
):
if
any
(
d
.
itemsize
<
4
for
d
in
dtypes
):
...
@@ -231,8 +231,6 @@ class Kernel(object):
...
@@ -231,8 +231,6 @@ class Kernel(object):
res
=
[]
res
=
[]
if
self
.
flags
.
get
(
'cflags'
,
''
)
!=
''
:
if
self
.
flags
.
get
(
'cflags'
,
''
)
!=
''
:
res
.
append
(
self
.
flags
[
'cflags'
])
res
.
append
(
self
.
flags
[
'cflags'
])
if
self
.
flags
.
get
(
'cluda'
,
False
):
res
.
append
(
'GA_USE_CLUDA'
)
if
self
.
flags
.
get
(
'have_double'
,
False
):
if
self
.
flags
.
get
(
'have_double'
,
False
):
res
.
append
(
'GA_USE_DOUBLE'
)
res
.
append
(
'GA_USE_DOUBLE'
)
if
self
.
flags
.
get
(
'have_small'
,
False
):
if
self
.
flags
.
get
(
'have_small'
,
False
):
...
@@ -241,15 +239,16 @@ class Kernel(object):
...
@@ -241,15 +239,16 @@ class Kernel(object):
res
.
append
(
'GA_USE_COMPLEX'
)
res
.
append
(
'GA_USE_COMPLEX'
)
if
self
.
flags
.
get
(
'have_half'
,
False
):
if
self
.
flags
.
get
(
'have_half'
,
False
):
res
.
append
(
'GA_USE_HALF'
)
res
.
append
(
'GA_USE_HALF'
)
return
'|'
.
join
(
res
)
res
=
'|'
.
join
(
res
)
if
not
res
:
return
'0'
return
res
def
_get_py_flags
(
self
):
def
_get_py_flags
(
self
):
res
=
dict
(
self
.
flags
)
res
=
dict
(
self
.
flags
)
cflags
=
res
.
pop
(
'cflags'
,
''
)
cflags
=
res
.
pop
(
'cflags'
,
''
)
for
fl
in
cflags
.
split
(
'|'
):
for
fl
in
cflags
.
split
(
'|'
):
fl
=
fl
.
strip
()
fl
=
fl
.
strip
()
if
fl
==
'GA_USE_CLUDA'
:
res
[
'cluda'
]
=
True
if
fl
==
'GA_USE_DOUBLE'
:
if
fl
==
'GA_USE_DOUBLE'
:
res
[
'have_double'
]
=
True
res
[
'have_double'
]
=
True
if
fl
==
'GA_USE_SMALL'
:
if
fl
==
'GA_USE_SMALL'
:
...
@@ -555,7 +554,7 @@ class CGpuKernelBase(COp, GpuKernelBase):
...
@@ -555,7 +554,7 @@ class CGpuKernelBase(COp, GpuKernelBase):
kflags
=
splt2
[
2
]
.
strip
()
kflags
=
splt2
[
2
]
.
strip
()
kcode
=
def_macros
+
'
\n
'
+
kcode
+
'
\n
'
+
undef_macros
kcode
=
def_macros
+
'
\n
'
+
kcode
+
'
\n
'
+
undef_macros
res
.
append
(
Kernel
(
kcode
,
ktypes
,
kname
,
res
.
append
(
Kernel
(
kcode
,
ktypes
,
kname
,
flags
=
dict
(
c
luda
=
True
,
c
flags
=
kflags
)))
flags
=
dict
(
cflags
=
kflags
)))
n
+=
2
n
+=
2
self
.
_cached_kernels
=
res
self
.
_cached_kernels
=
res
return
res
return
res
...
@@ -1619,7 +1618,8 @@ class GpuEye(GpuKernelBase, Op):
...
@@ -1619,7 +1618,8 @@ class GpuEye(GpuKernelBase, Op):
for
i
in
xrange
(
3
)]
for
i
in
xrange
(
3
)]
def
gpu_kernels
(
self
,
node
,
name
):
def
gpu_kernels
(
self
,
node
,
name
):
code
=
"""
code
=
"""#include "cluda.h"
KERNEL void eye(GLOBAL_MEM
%(ctype)
s *a, ga_size a_off,
KERNEL void eye(GLOBAL_MEM
%(ctype)
s *a, ga_size a_off,
ga_size n, ga_size m, ga_ssize k) {
ga_size n, ga_size m, ga_ssize k) {
a = (GLOBAL_MEM
%(ctype)
s *)(((GLOBAL_MEM char *)a) + a_off);
a = (GLOBAL_MEM
%(ctype)
s *)(((GLOBAL_MEM char *)a) + a_off);
...
...
theano/gpuarray/c_code/corr3d_gemm.c
浏览文件 @
b80a7d12
#section kernels
#section kernels
#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// TODO check kernel flags
#include "cluda.h"
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
// the Caffe software.
...
@@ -87,6 +88,8 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
...
@@ -87,6 +88,8 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
}
}
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL
void
im3d2col_kernel
(
const
ga_size
n
,
KERNEL
void
im3d2col_kernel
(
const
ga_size
n
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_im
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_im
,
const
ga_size
offset_im
,
const
ga_size
offset_im
,
...
@@ -139,6 +142,8 @@ KERNEL void im3d2col_kernel(const ga_size n,
...
@@ -139,6 +142,8 @@ KERNEL void im3d2col_kernel(const ga_size n,
// GPU kernel for the case of dilation
// GPU kernel for the case of dilation
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL
void
dilated_col2im3d_kernel
(
const
ga_size
n
,
KERNEL
void
dilated_col2im3d_kernel
(
const
ga_size
n
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_col
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_col
,
const
ga_size
offset_col
,
const
ga_size
offset_col
,
...
@@ -207,6 +212,7 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
...
@@ -207,6 +212,7 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
}
}
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL
void
col2im3d_kernel
(
const
ga_size
n
,
KERNEL
void
col2im3d_kernel
(
const
ga_size
n
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_col
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_col
,
...
...
theano/gpuarray/c_code/corr_gemm.c
浏览文件 @
b80a7d12
#section kernels
#section kernels
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// TODO check kernel flags
// TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// sources are clearly marked. Below we reproduce the original license of
...
@@ -77,6 +78,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
...
@@ -77,6 +78,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
}
}
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL
void
im2col_kernel
(
const
ga_size
n
,
KERNEL
void
im2col_kernel
(
const
ga_size
n
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_im
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_im
,
...
@@ -122,6 +124,8 @@ KERNEL void im2col_kernel(const ga_size n,
...
@@ -122,6 +124,8 @@ KERNEL void im2col_kernel(const ga_size n,
// GPU kernel for the case of dilation
// GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL
void
dilated_col2im_kernel
(
const
ga_size
n
,
KERNEL
void
dilated_col2im_kernel
(
const
ga_size
n
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_col
,
const
ga_size
offset_col
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_col
,
const
ga_size
offset_col
,
const
ga_size
height
,
const
ga_size
width
,
const
ga_size
channels
,
const
ga_size
height
,
const
ga_size
width
,
const
ga_size
channels
,
...
@@ -172,6 +176,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
...
@@ -172,6 +176,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
}
}
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
KERNEL
void
col2im_kernel
(
const
ga_size
n
,
KERNEL
void
col2im_kernel
(
const
ga_size
n
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_col
,
const
ga_size
offset_col
,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_col
,
const
ga_size
offset_col
,
...
...
theano/gpuarray/c_code/magma_cholesky.c
浏览文件 @
b80a7d12
#section kernels
#section kernels
#kernel tril_kernel : size, size, size, *:
#kernel tril_kernel : size, size, size, *:
#include "cluda.h"
KERNEL
void
tril_kernel
(
const
ga_size
nthreads
,
const
ga_size
ncols
,
KERNEL
void
tril_kernel
(
const
ga_size
nthreads
,
const
ga_size
ncols
,
const
ga_size
a_off
,
GLOBAL_MEM
DTYPE_INPUT_0
*
a
)
{
const
ga_size
a_off
,
GLOBAL_MEM
DTYPE_INPUT_0
*
a
)
{
...
@@ -17,6 +18,7 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
...
@@ -17,6 +18,7 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
}
}
#kernel triu_kernel : size, size, size, *:
#kernel triu_kernel : size, size, size, *:
#include "cluda.h"
KERNEL
void
triu_kernel
(
const
ga_size
nthreads
,
const
ga_size
ncols
,
KERNEL
void
triu_kernel
(
const
ga_size
nthreads
,
const
ga_size
ncols
,
const
ga_size
a_off
,
GLOBAL_MEM
DTYPE_INPUT_0
*
a
)
{
const
ga_size
a_off
,
GLOBAL_MEM
DTYPE_INPUT_0
*
a
)
{
...
...
theano/gpuarray/c_code/magma_qr.c
浏览文件 @
b80a7d12
#section kernels
#section kernels
#kernel triu_kernel : size, size, size, *:
#kernel triu_kernel : size, size, size, *:
#include "cluda.h"
KERNEL
void
triu_kernel
(
const
ga_size
nthreads
,
const
ga_size
ncols
,
KERNEL
void
triu_kernel
(
const
ga_size
nthreads
,
const
ga_size
ncols
,
const
ga_size
a_off
,
GLOBAL_MEM
DTYPE_INPUT_0
*
a
)
{
const
ga_size
a_off
,
GLOBAL_MEM
DTYPE_INPUT_0
*
a
)
{
...
...
theano/gpuarray/c_code/pool.c
浏览文件 @
b80a7d12
#section kernels
#section kernels
#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL
void
max_pool2d_kernel
(
const
ga_size
nthreads
,
KERNEL
void
max_pool2d_kernel
(
const
ga_size
nthreads
,
...
@@ -44,6 +45,7 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
...
@@ -44,6 +45,7 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
}
}
#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL
void
max_pool3d_kernel
(
const
ga_size
nthreads
,
KERNEL
void
max_pool3d_kernel
(
const
ga_size
nthreads
,
...
@@ -95,6 +97,7 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
...
@@ -95,6 +97,7 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
}
}
#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size:
#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size:
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL
void
ave_pool2d_kernel
(
const
ga_size
nthreads
,
KERNEL
void
ave_pool2d_kernel
(
const
ga_size
nthreads
,
...
@@ -150,6 +153,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
...
@@ -150,6 +153,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
}
}
#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL
void
ave_pool3d_kernel
(
const
ga_size
nthreads
,
KERNEL
void
ave_pool3d_kernel
(
const
ga_size
nthreads
,
...
...
theano/gpuarray/c_code/pool_ave_grad.c
浏览文件 @
b80a7d12
#section kernels
#section kernels
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size :
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL
void
ave_pool2d_grad_kernel
(
const
ga_size
nthreads
,
KERNEL
void
ave_pool2d_grad_kernel
(
const
ga_size
nthreads
,
...
@@ -50,6 +51,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
...
@@ -50,6 +51,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
}
}
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL
void
ave_pool3d_grad_kernel
(
const
ga_size
nthreads
,
KERNEL
void
ave_pool3d_grad_kernel
(
const
ga_size
nthreads
,
...
...
theano/gpuarray/c_code/pool_grad_grad.c
浏览文件 @
b80a7d12
#section kernels
#section kernels
#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL
void
max_pool2d_grad_grad_kernel
(
const
ga_size
nthreads
,
KERNEL
void
max_pool2d_grad_grad_kernel
(
const
ga_size
nthreads
,
const
ga_size
num
,
const
ga_size
channels
,
const
ga_size
pooled_height
,
const
ga_size
num
,
const
ga_size
channels
,
const
ga_size
pooled_height
,
...
@@ -47,6 +48,7 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
...
@@ -47,6 +48,7 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
}
}
#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
KERNEL
void
max_pool3d_grad_grad_kernel
(
const
ga_size
nthreads
,
KERNEL
void
max_pool3d_grad_grad_kernel
(
const
ga_size
nthreads
,
const
ga_size
num
,
const
ga_size
channels
,
const
ga_size
pooled_depth
,
const
ga_size
num
,
const
ga_size
channels
,
const
ga_size
pooled_depth
,
...
...
theano/gpuarray/c_code/pool_max_grad.c
浏览文件 @
b80a7d12
#section kernels
#section kernels
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL
void
max_pool2d_grad_kernel
(
const
ga_size
nthreads
,
KERNEL
void
max_pool2d_grad_kernel
(
const
ga_size
nthreads
,
...
@@ -43,6 +44,7 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
...
@@ -43,6 +44,7 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
}
}
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL
void
max_pool3d_grad_kernel
(
const
ga_size
nthreads
,
KERNEL
void
max_pool3d_grad_kernel
(
const
ga_size
nthreads
,
...
...
theano/gpuarray/c_code/pool_max_rop.c
浏览文件 @
b80a7d12
#section kernels
#section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL
void
max_pool2d_rop_kernel
(
const
ga_size
nthreads
,
KERNEL
void
max_pool2d_rop_kernel
(
const
ga_size
nthreads
,
...
@@ -50,6 +51,7 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
...
@@ -50,6 +51,7 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
}
}
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL
void
max_pool3d_rop_kernel
(
const
ga_size
nthreads
,
KERNEL
void
max_pool3d_rop_kernel
(
const
ga_size
nthreads
,
...
...
theano/gpuarray/elemwise.py
浏览文件 @
b80a7d12
...
@@ -1743,7 +1743,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1743,7 +1743,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname
=
"kernel_reduce_ccontig"
kname
=
"kernel_reduce_ccontig"
k_var
=
"kernel_reduce_ccontig_"
+
nodename
k_var
=
"kernel_reduce_ccontig_"
+
nodename
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_size d0,
const ga_size d0,
const
%(in_type)
s *A, const ga_size offset_A,
const
%(in_type)
s *A, const ga_size offset_A,
...
@@ -1781,7 +1782,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1781,7 +1782,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname
=
"kernel_reduce_1"
kname
=
"kernel_reduce_1"
k_var
=
"kernel_reduce_1_"
+
nodename
k_var
=
"kernel_reduce_1_"
+
nodename
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_size d0,
const ga_size d0,
const
%(in_type)
s *A, const ga_size offset_A,
const
%(in_type)
s *A, const ga_size offset_A,
...
@@ -1821,7 +1823,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1821,7 +1823,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname
=
"kernel_reduce_11"
kname
=
"kernel_reduce_11"
k_var
=
"kernel_reduce_11_"
+
nodename
k_var
=
"kernel_reduce_11_"
+
nodename
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_size d0, const ga_size d1,
const ga_size d0, const ga_size d1,
const
%(in_type)
s *A, const ga_size offset_A,
const
%(in_type)
s *A, const ga_size offset_A,
...
@@ -1909,7 +1912,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1909,7 +1912,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
load_in
+
"(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])"
,
load_in
+
"(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])"
,
{},
True
)
{},
True
)
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
%(decl)
s{
%(decl)
s{
%(init)
s
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
...
@@ -1943,7 +1947,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1943,7 +1947,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname
=
"kernel_reduce_010"
kname
=
"kernel_reduce_010"
k_var
=
"kernel_reduce_010_"
+
nodename
k_var
=
"kernel_reduce_010_"
+
nodename
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_size d0, const ga_size d1, const ga_size d2,
const ga_size d0, const ga_size d1, const ga_size d2,
const
%(in_type)
s *A, const ga_size offset_A,
const
%(in_type)
s *A, const ga_size offset_A,
...
@@ -1989,7 +1994,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1989,7 +1994,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname
=
"kernel_reduce_010_AD"
kname
=
"kernel_reduce_010_AD"
k_var
=
"kernel_reduce_010_AD_"
+
nodename
k_var
=
"kernel_reduce_010_AD_"
+
nodename
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_size A, const ga_size B, const ga_size C, const ga_size D,
const ga_size A, const ga_size B, const ga_size C, const ga_size D,
const
%(in_type)
s *X, const ga_size offset_X,
const
%(in_type)
s *X, const ga_size offset_X,
...
@@ -2053,7 +2059,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2053,7 +2059,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{},
True
)
{},
True
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[i0 * sA0 + 0 * sA1 + i2 * sA2])"
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[i0 * sA0 + 0 * sA1 + i2 * sA2])"
)
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
%(decl)
s
%(decl)
s
{
{
%(init)
s
%(init)
s
...
@@ -2088,7 +2095,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2088,7 +2095,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname
=
"kernel_reduce_110"
kname
=
"kernel_reduce_110"
k_var
=
"kernel_reduce_110_"
+
nodename
k_var
=
"kernel_reduce_110_"
+
nodename
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_size d0, const ga_size d1, const ga_size d2,
const ga_size d0, const ga_size d1, const ga_size d2,
const
%(in_type)
s *A, const ga_size offset_A,
const
%(in_type)
s *A, const ga_size offset_A,
...
@@ -2133,7 +2141,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2133,7 +2141,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{},
True
)
{},
True
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[i1 * sA1 + i2 * sA2])"
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[i1 * sA1 + i2 * sA2])"
)
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
%(decl)
s
%(decl)
s
{
{
%(init)
s
%(init)
s
...
@@ -2163,7 +2172,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2163,7 +2172,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{},
True
)
{},
True
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[0])"
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[0])"
)
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
%(decl)
s
%(decl)
s
{
{
%(init)
s
%(init)
s
...
@@ -2195,7 +2205,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2195,7 +2205,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname
=
"kernel_reduce_001"
kname
=
"kernel_reduce_001"
k_var
=
"kernel_reduce_001_"
+
nodename
k_var
=
"kernel_reduce_001_"
+
nodename
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""
#include "cluda.h"
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_size d0, const ga_size d1, const ga_size d2,
const ga_size d0, const ga_size d1, const ga_size d2,
const
%(in_type)
s *A, const ga_size offset_A,
const
%(in_type)
s *A, const ga_size offset_A,
...
@@ -2244,7 +2254,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2244,7 +2254,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{},
True
)
{},
True
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[i0 * sA0 + i1 * sA1])"
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[i0 * sA0 + i1 * sA1])"
)
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
%(decl)
s
%(decl)
s
{
{
%(init)
s
%(init)
s
...
@@ -2280,7 +2291,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2280,7 +2291,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{},
True
)
{},
True
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[i0 * sA0 + i2 * sA2])"
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[i0 * sA0 + i2 * sA2])"
)
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
%(decl)
s
%(decl)
s
{
{
%(init)
s
%(init)
s
...
@@ -2314,7 +2326,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2314,7 +2326,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{},
True
)
{},
True
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[0])"
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[0])"
)
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
%(decl)
s
%(decl)
s
{
{
%(init)
s
%(init)
s
...
@@ -2345,7 +2358,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2345,7 +2358,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kname
=
"kernel_reduce_1011"
kname
=
"kernel_reduce_1011"
k_var
=
"kernel_reduce_1011_"
+
nodename
k_var
=
"kernel_reduce_1011_"
+
nodename
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
const
%(in_type)
s *A, const ga_size offset_A,
const
%(in_type)
s *A, const ga_size offset_A,
...
@@ -2502,8 +2516,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2502,8 +2516,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
def
gpu_kernels
(
self
,
node
,
name
):
def
gpu_kernels
(
self
,
node
,
name
):
if
not
any
(
getattr
(
self
,
'redux'
,
[
node
.
inputs
[
0
]
.
ndim
!=
0
])):
if
not
any
(
getattr
(
self
,
'redux'
,
[
node
.
inputs
[
0
]
.
ndim
!=
0
])):
# Some OpenCL compilers do not accept no-arguments kernels
# Some OpenCL compilers do not accept no-arguments
empty
kernels
src
=
"
KERNEL void reduk(GLOBAL_MEM float *a) {
}"
src
=
"
#include
\"
cluda.h
\"\n
KERNEL void reduk(GLOBAL_MEM float *a) { a[0] = 0;
}"
params
=
[
'float32'
]
params
=
[
'float32'
]
else
:
else
:
k
=
self
.
get_kernel_cache
(
node
)
k
=
self
.
get_kernel_cache
(
node
)
...
...
theano/gpuarray/extra_ops.py
浏览文件 @
b80a7d12
...
@@ -74,7 +74,8 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -74,7 +74,8 @@ class GpuCumOp(GpuKernelBase, Op):
k_var
=
"k_cumadd_"
+
nodename
k_var
=
"k_cumadd_"
+
nodename
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
flags
=
Kernel
.
get_flags
(
dtype_x
)
flags
=
Kernel
.
get_flags
(
dtype_x
)
code
=
"""
code
=
"""#include "cluda.h"
KERNEL void
%(kname)
s(float* input, ga_size input_offset,
KERNEL void
%(kname)
s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
...
@@ -112,7 +113,8 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -112,7 +113,8 @@ class GpuCumOp(GpuKernelBase, Op):
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
'int32'
,
'int32'
,
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
]
'int32'
,
'int32'
,
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
]
code
=
"""
code
=
"""#include "cluda.h"
// helper functions
// helper functions
WITHIN_KERNEL
WITHIN_KERNEL
void k_reductionPhase(float* partialCumOp) {
void k_reductionPhase(float* partialCumOp) {
...
@@ -213,7 +215,8 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -213,7 +215,8 @@ class GpuCumOp(GpuKernelBase, Op):
# k_finalCumOp
# k_finalCumOp
kname
=
"k_finalCumOp"
kname
=
"k_finalCumOp"
k_var
=
"k_finalCumOp_"
+
nodename
k_var
=
"k_finalCumOp_"
+
nodename
code
=
"""
code
=
"""#include "cluda.h"
KERNEL void k_finalCumOp(float* output, ga_size output_offset,
KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset,
float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp,
size_t nbElementsPerCumOp,
...
...
theano/gpuarray/kernel_codegen.py
浏览文件 @
b80a7d12
...
@@ -34,7 +34,9 @@ def nvcc_kernel(name, params, body):
...
@@ -34,7 +34,9 @@ def nvcc_kernel(name, params, body):
else
:
else
:
yield
b
yield
b
bodystr
=
';
\n
'
.
join
(
flatbody
())
bodystr
=
';
\n
'
.
join
(
flatbody
())
return
"""KERNEL void
%(name)
s (
%(paramstr)
s)
return
"""#include "cluda.h"
KERNEL void
%(name)
s (
%(paramstr)
s)
{
{
%(bodystr)
s;
%(bodystr)
s;
}
}
...
...
theano/gpuarray/multinomial.py
浏览文件 @
b80a7d12
...
@@ -66,7 +66,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
...
@@ -66,7 +66,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
work_ctype
=
pygpu
.
gpuarray
.
dtype_to_ctype
(
work_dtype
(
node
.
inputs
[
0
]
.
dtype
))
work_ctype
=
pygpu
.
gpuarray
.
dtype_to_ctype
(
work_dtype
(
node
.
inputs
[
0
]
.
dtype
))
write_out_ctype
=
write_w
(
node
.
outputs
[
0
]
.
dtype
)
write_out_ctype
=
write_w
(
node
.
outputs
[
0
]
.
dtype
)
load_in_ctype
=
load_w
(
node
.
inputs
[
0
]
.
dtype
)
load_in_ctype
=
load_w
(
node
.
inputs
[
0
]
.
dtype
)
code
=
"""
code
=
"""#include "cluda.h"
KERNEL void k_multi_warp_multinomial(
KERNEL void k_multi_warp_multinomial(
const ga_size nb_multi,
const ga_size nb_multi,
const ga_size nb_outcomes,
const ga_size nb_outcomes,
...
@@ -276,7 +277,8 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op):
...
@@ -276,7 +277,8 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op):
def
gpu_kernels
(
self
,
node
,
name
):
def
gpu_kernels
(
self
,
node
,
name
):
replace
=
int
(
self
.
replace
)
replace
=
int
(
self
.
replace
)
code
=
"""
code
=
"""#include "cluda.h"
KERNEL void k_multi_warp_multinomial_wor(
KERNEL void k_multi_warp_multinomial_wor(
const ga_size nb_multi,
const ga_size nb_multi,
const ga_size nb_outcomes,
const ga_size nb_outcomes,
...
...
theano/gpuarray/neighbours.py
浏览文件 @
b80a7d12
...
@@ -61,7 +61,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -61,7 +61,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kernels
=
[]
kernels
=
[]
kname
=
"k_multi_warp_less"
kname
=
"k_multi_warp_less"
k_var
=
"k_multi_warp_less_"
+
nodename
k_var
=
"k_multi_warp_less_"
+
nodename
code
=
"""
code
=
"""#include "cluda.h"
// a version that uses less registers but doesn't work in all cases.
// a version that uses less registers but doesn't work in all cases.
%(mode_constants)
s
%(mode_constants)
s
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
...
@@ -163,7 +164,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -163,7 +164,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname
=
"k_multi_warp"
kname
=
"k_multi_warp"
k_var
=
"k_multi_warp_"
+
nodename
k_var
=
"k_multi_warp_"
+
nodename
code
=
"""
code
=
"""#include "cluda.h"
%(mode_constants)
s
%(mode_constants)
s
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_int mode,
const ga_int mode,
...
...
theano/gpuarray/nnet.py
浏览文件 @
b80a7d12
...
@@ -75,7 +75,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
...
@@ -75,7 +75,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
gpuarray
.
SSIZE
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
gpuarray
.
SSIZE
]
]
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
KERNEL void
%(kname)
s(const ga_size M, const ga_size N,
KERNEL void
%(kname)
s(const ga_size M, const ga_size N,
GLOBAL_MEM const
%(type_x)
s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
GLOBAL_MEM const
%(type_x)
s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
GLOBAL_MEM const
%(type_b)
s* b, const ga_size offset_b, const ga_ssize bs0,
GLOBAL_MEM const
%(type_b)
s* b, const ga_size offset_b, const ga_ssize bs0,
...
@@ -393,7 +394,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
...
@@ -393,7 +394,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
]
]
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""#include "cluda.h"
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_size N, const ga_size K,
const ga_size N, const ga_size K,
GLOBAL_MEM const
%(type_dnll)
s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
GLOBAL_MEM const
%(type_dnll)
s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
...
@@ -557,7 +559,8 @@ class GpuSoftmax(GpuKernelBase, Op):
...
@@ -557,7 +559,8 @@ class GpuSoftmax(GpuKernelBase, Op):
kernels
=
[]
kernels
=
[]
kname
=
"kSoftmax"
kname
=
"kSoftmax"
k_var
=
"kSoftmax_"
+
nodename
k_var
=
"kSoftmax_"
+
nodename
code
=
"""
code
=
"""#include "cluda.h"
KERNEL void
%(kname)
s (const ga_size M, const ga_size N,
KERNEL void
%(kname)
s (const ga_size M, const ga_size N,
GLOBAL_MEM const
%(type_x)
s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const
%(type_x)
s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM
%(type_sm)
s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(
%(type_acc)
s, buf))
GLOBAL_MEM
%(type_sm)
s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(
%(type_acc)
s, buf))
...
@@ -630,7 +633,8 @@ class GpuSoftmax(GpuKernelBase, Op):
...
@@ -630,7 +633,8 @@ class GpuSoftmax(GpuKernelBase, Op):
flags
=
flags
,
objvar
=
k_var
))
flags
=
flags
,
objvar
=
k_var
))
kname
=
"kSoftmax_fixed_shared"
kname
=
"kSoftmax_fixed_shared"
k_var
=
"kSoftmax_fixed_shared"
+
nodename
k_var
=
"kSoftmax_fixed_shared"
+
nodename
code
=
"""
code
=
"""#include "cluda.h"
KERNEL void
%(kname)
s (const ga_size M, const ga_size N,
KERNEL void
%(kname)
s (const ga_size M, const ga_size N,
GLOBAL_MEM const
%(type_x)
s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const
%(type_x)
s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM
%(type_sm)
s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(
%(type_acc)
s, buf))
GLOBAL_MEM
%(type_sm)
s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(
%(type_acc)
s, buf))
...
@@ -854,7 +858,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
...
@@ -854,7 +858,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
kernels
=
[]
kernels
=
[]
kname
=
"kSoftmaxWithBias"
kname
=
"kSoftmaxWithBias"
k_var
=
"kSoftmaxWithBias_"
+
nodename
k_var
=
"kSoftmaxWithBias_"
+
nodename
code
=
"""
code
=
"""#include "cluda.h"
KERNEL void
%(kname)
s (const ga_size M, const ga_size N,
KERNEL void
%(kname)
s (const ga_size M, const ga_size N,
GLOBAL_MEM const
%(type_x)
s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const
%(type_x)
s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const
%(type_b)
s * b, const ga_size offset_b, const ga_ssize sb0,
GLOBAL_MEM const
%(type_b)
s * b, const ga_size offset_b, const ga_ssize sb0,
...
@@ -930,7 +935,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
...
@@ -930,7 +935,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
flags
=
flags
,
objvar
=
k_var
))
flags
=
flags
,
objvar
=
k_var
))
kname
=
"kSoftmaxWithBias_fixed_shared"
kname
=
"kSoftmaxWithBias_fixed_shared"
k_var
=
"kSoftmaxWithBias_fixed_shared"
+
nodename
k_var
=
"kSoftmaxWithBias_fixed_shared"
+
nodename
code
=
"""
code
=
"""#include "cluda.h"
KERNEL void
%(kname)
s (const ga_size M, const ga_size N,
KERNEL void
%(kname)
s (const ga_size M, const ga_size N,
GLOBAL_MEM const
%(type_x)
s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const
%(type_x)
s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const
%(type_b)
s * b, const ga_size offset_b, const ga_ssize sb0,
GLOBAL_MEM const
%(type_b)
s * b, const ga_size offset_b, const ga_ssize sb0,
...
...
theano/gpuarray/rng_mrg.py
浏览文件 @
b80a7d12
...
@@ -80,7 +80,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -80,7 +80,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else
:
else
:
raise
ValueError
(
'Unsupported data type for output'
,
raise
ValueError
(
'Unsupported data type for output'
,
self
.
output_type
.
dtype
)
self
.
output_type
.
dtype
)
code
=
"""
code
=
"""#include "cluda.h"
KERNEL void mrg_uniform(
KERNEL void mrg_uniform(
GLOBAL_MEM
%(otype)
s *sample_data,
GLOBAL_MEM
%(otype)
s *sample_data,
ga_size sample_offset,
ga_size sample_offset,
...
...
theano/gpuarray/subtensor.py
浏览文件 @
b80a7d12
...
@@ -1121,7 +1121,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
...
@@ -1121,7 +1121,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
flags
=
Kernel
.
get_flags
(
dtype_x
,
dtype_y
,
dtype_ind
)
flags
=
Kernel
.
get_flags
(
dtype_x
,
dtype_y
,
dtype_ind
)
kname
=
"k_vector_add_fast"
kname
=
"k_vector_add_fast"
k_var
=
"k_vector_add_fast_"
+
nodename
k_var
=
"k_vector_add_fast_"
+
nodename
code
=
"""#include
<cluda.h>
code
=
"""#include
"cluda.h"
KERNEL void k_vector_add_fast(const ga_size numRowsX,
KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX,
const ga_size numColsX,
const ga_ssize stridesX0,
const ga_ssize stridesX0,
...
@@ -1211,7 +1211,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
...
@@ -1211,7 +1211,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
PyGpuArray_DIMS(py_other)[0],
PyGpuArray_DIMS(py_other)[0],
PyGpuArray_DIMS(py_other)[1],
PyGpuArray_DIMS(py_other)[1],
PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y
PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y
,
py_other->ga.data,
py_other->ga.data,
py_other->ga.offset,
py_other->ga.offset,
PyGpuArray_DIMS(indices_arr)[0],
PyGpuArray_DIMS(indices_arr)[0],
...
...
theano/gpuarray/tests/c_code/tstgpueye.c
浏览文件 @
b80a7d12
#section kernels
#section kernels
#kernel eye : *, size, size, size :
#kernel eye : *, size, size, size :
#include <cluda.h>
/* The eye name will be used to generate supporting objects. The only
/* The eye name will be used to generate supporting objects. The only
you probably need to care about is the kernel object which will be
you probably need to care about is the kernel object which will be
named 'k_' + <the name above> (k_eye in this case). This name also
named 'k_' + <the name above> (k_eye in this case). This name also
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论