Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
5df0cfd8
提交
5df0cfd8
authored
7月 01, 2017
作者:
Frédéric Bastien
提交者:
GitHub
7月 01, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #6091 from notoraptor/params-for-other-ops
Params for other ops
上级
49d99209
61cb8c41
显示空白字符变更
内嵌
并排
正在显示
7 个修改的文件
包含
255 行增加
和
208 行删除
+255
-208
ops.py
theano/compile/ops.py
+16
-3
extra_ops.py
theano/gpuarray/extra_ops.py
+19
-18
neighbours.py
theano/gpuarray/neighbours.py
+74
-63
rng_mrg.py
theano/gpuarray/rng_mrg.py
+35
-24
subtensor.py
theano/gpuarray/subtensor.py
+67
-60
neighbours.py
theano/tensor/nnet/neighbours.py
+31
-26
subtensor.py
theano/tensor/subtensor.py
+13
-14
没有找到文件。
theano/compile/ops.py
浏览文件 @
5df0cfd8
...
@@ -346,6 +346,18 @@ class Shape_i(gof.Op):
...
@@ -346,6 +346,18 @@ class Shape_i(gof.Op):
i
=
int
(
i
)
i
=
int
(
i
)
self
.
i
=
i
self
.
i
=
i
# NB:
# 1) params_type is defined as a property to avoid a circular
# import in Python, caused by importing theano.scalar below,
# which would occur if params_type were defined directly in the class body.
# 2) We wrap the scalar into a ParamsType (instead of directly using the scalar as op param)
# to avoid Theano converting the scalar param to a constant that would later be
# hardcoded as a literal in the C code, making us lose all the advantages of
# using params.
@property
def
params_type
(
self
):
return
gof
.
ParamsType
(
i
=
theano
.
scalar
.
basic
.
int64
)
def
__str__
(
self
):
def
__str__
(
self
):
return
'
%
s{
%
i}'
%
(
self
.
__class__
.
__name__
,
self
.
i
)
return
'
%
s{
%
i}'
%
(
self
.
__class__
.
__name__
,
self
.
i
)
...
@@ -360,7 +372,7 @@ class Shape_i(gof.Op):
...
@@ -360,7 +372,7 @@ class Shape_i(gof.Op):
(
x
,
self
.
i
))
(
x
,
self
.
i
))
return
theano
.
Apply
(
self
,
[
x
],
[
theano
.
tensor
.
lscalar
()])
return
theano
.
Apply
(
self
,
[
x
],
[
theano
.
tensor
.
lscalar
()])
def
perform
(
self
,
node
,
inp
,
out_
):
def
perform
(
self
,
node
,
inp
,
out_
,
params
):
x
,
=
inp
x
,
=
inp
out
,
=
out_
out
,
=
out_
if
out
[
0
]
is
None
:
if
out
[
0
]
is
None
:
...
@@ -383,7 +395,7 @@ class Shape_i(gof.Op):
...
@@ -383,7 +395,7 @@ class Shape_i(gof.Op):
version
.
append
((
str
(
t
),
v
))
version
.
append
((
str
(
t
),
v
))
if
version
:
if
version
:
version
.
append
(
1
)
version
.
append
(
2
)
return
tuple
(
version
)
return
tuple
(
version
)
...
@@ -391,7 +403,8 @@ class Shape_i(gof.Op):
...
@@ -391,7 +403,8 @@ class Shape_i(gof.Op):
iname
,
=
inames
iname
,
=
inames
oname
,
=
onames
oname
,
=
onames
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
i
=
self
.
i
# i is then 'params->i', not just 'params'.
i
=
sub
[
'params'
]
+
'->i'
itype
=
node
.
inputs
[
0
]
.
type
.
__class__
itype
=
node
.
inputs
[
0
]
.
type
.
__class__
if
itype
in
self
.
c_code_and_version
:
if
itype
in
self
.
c_code_and_version
:
...
...
theano/gpuarray/extra_ops.py
浏览文件 @
5df0cfd8
...
@@ -10,6 +10,9 @@ except ImportError:
...
@@ -10,6 +10,9 @@ except ImportError:
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
GpuReshape
,
infer_context_name
)
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
GpuReshape
,
infer_context_name
)
from
.opt
import
register_opt
,
op_lifter
,
register_opt2
from
.opt
import
register_opt
,
op_lifter
,
register_opt2
from
.type
import
gpu_context_type
from
theano.gof
import
ParamsType
import
theano.scalar
as
scalar
class
GpuCumOp
(
GpuKernelBase
,
Op
):
class
GpuCumOp
(
GpuKernelBase
,
Op
):
...
@@ -21,9 +24,12 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -21,9 +24,12 @@ class GpuCumOp(GpuKernelBase, Op):
"""
"""
SUPPORTED_NDIMS
=
3
SUPPORTED_NDIMS
=
3
__props__
=
(
'axis'
,
'mode'
)
__props__
=
(
'axis'
,
'mode'
)
params_type
=
ParamsType
(
axis
=
scalar
.
int32
,
context
=
gpu_context_type
)
def
__init__
(
self
,
axis
,
mode
=
'add'
):
def
__init__
(
self
,
axis
,
mode
=
'add'
):
self
.
axis
=
axis
if
axis
else
0
assert
axis
is
not
None
self
.
axis
=
int
(
axis
)
self
.
mode
=
mode
self
.
mode
=
mode
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
...
@@ -35,7 +41,7 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -35,7 +41,7 @@ class GpuCumOp(GpuKernelBase, Op):
return
hash
(
self
.
axis
)
^
hash
(
self
.
mode
)
return
hash
(
self
.
axis
)
^
hash
(
self
.
mode
)
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
6
,)
return
(
7
,)
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
,
'<gpuarray_helper.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
,
'<gpuarray_helper.h>'
]
...
@@ -43,6 +49,9 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -43,6 +49,9 @@ class GpuCumOp(GpuKernelBase, Op):
def
c_header_dirs
(
self
):
def
c_header_dirs
(
self
):
return
[
os
.
path
.
dirname
(
__file__
)]
return
[
os
.
path
.
dirname
(
__file__
)]
def
get_params
(
self
,
node
):
return
self
.
params_type
.
get_params
(
self
,
context
=
node
.
inputs
[
0
]
.
type
.
context
)
def
make_node
(
self
,
x
):
def
make_node
(
self
,
x
):
assert
x
.
type
.
dtype
==
'float32'
,
"Only float32 supported for GpuCumOp"
assert
x
.
type
.
dtype
==
'float32'
,
"Only float32 supported for GpuCumOp"
...
@@ -244,24 +253,18 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -244,24 +253,18 @@ class GpuCumOp(GpuKernelBase, Op):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
b
'cuda'
:
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
b
'cuda'
:
raise
NotImplementedError
(
"cuda only"
)
raise
NotImplementedError
(
"cuda only"
)
x
,
=
inp
return
"""
z
,
=
out
axis
=
self
.
axis
if
self
.
axis
is
not
None
else
0
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'params'
]
code
=
"""
const size_t* shape = PyGpuArray_DIMS(
%(x)
s);
const size_t* shape = PyGpuArray_DIMS(
%(x)
s);
bool needAllocation = !
%(z)
s || PyGpuArray_NDIM(
%(x)
s) != PyGpuArray_NDIM(
%(z)
s);
bool needAllocation = !
%(z)
s || PyGpuArray_NDIM(
%(x)
s) != PyGpuArray_NDIM(
%(z)
s);
int axis =
%(
axis)
s;
int axis =
%(
params)
s->axi
s;
if (axis < 0) {
if (axis < 0) {
// Convert negative axis to positive axis.
// Convert negative axis to positive axis.
axis += PyGpuArray_NDIM(
%(x)
s);
axis += PyGpuArray_NDIM(
%(x)
s);
}
}
if (theano_prep_output(&
%(z)
s, PyGpuArray_NDIM(
%(x)
s), PyGpuArray_DIMS(
%(x)
s),
%(x)
s->ga.typecode, GA_C_ORDER,
%(ctx)
s) != 0){
if (theano_prep_output(&
%(z)
s, PyGpuArray_NDIM(
%(x)
s), PyGpuArray_DIMS(
%(x)
s),
%(x)
s->ga.typecode, GA_C_ORDER,
%(params)
s->context) != 0) {
%(fail)
s;
%(fail)
s;
}
}
...
@@ -270,17 +273,17 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -270,17 +273,17 @@ class GpuCumOp(GpuKernelBase, Op):
size_t max_grid_size1;
size_t max_grid_size1;
size_t max_grid_size2;
size_t max_grid_size2;
int err;
int err;
err = gpucontext_property(
%(
ctx)
s
->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
err = gpucontext_property(
%(
params)
s->context
->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
if (err != GA_NO_ERROR){
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
%(fail)
s;
%(fail)
s;
}
}
err = gpucontext_property(
%(
ctx)
s
->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
err = gpucontext_property(
%(
params)
s->context
->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
if (err != GA_NO_ERROR){
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
%(fail)
s;
%(fail)
s;
}
}
err = gpucontext_property(
%(
ctx)
s
->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
err = gpucontext_property(
%(
params)
s->context
->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
if (err != GA_NO_ERROR){
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
%(fail)
s;
%(fail)
s;
...
@@ -289,9 +292,7 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -289,9 +292,7 @@ class GpuCumOp(GpuKernelBase, Op):
%(fail)
s;
%(fail)
s;
}
}
}
}
"""
%
locals
()
"""
%
dict
(
x
=
inp
[
0
],
z
=
out
[
0
],
nodename
=
nodename
,
fail
=
sub
[
'fail'
],
params
=
sub
[
'params'
])
return
code
def
c_support_code_struct
(
self
,
node
,
nodename
):
def
c_support_code_struct
(
self
,
node
,
nodename
):
code
=
"""
code
=
"""
...
...
theano/gpuarray/neighbours.py
浏览文件 @
5df0cfd8
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
as
np
from
theano
import
Op
,
Apply
,
config
from
theano
import
Op
,
Apply
,
config
from
theano.gof
import
ParamsType
from
theano.tensor.nnet.neighbours
import
Images2Neibs
from
theano.tensor.nnet.neighbours
import
Images2Neibs
import
theano.tensor
as
T
import
theano.tensor
as
T
try
:
try
:
import
pygpu
from
pygpu
import
gpuarray
from
pygpu
import
gpuarray
except
ImportError
:
except
ImportError
:
pass
pass
...
@@ -14,7 +13,7 @@ except ImportError:
...
@@ -14,7 +13,7 @@ except ImportError:
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
infer_context_name
)
infer_context_name
)
from
.opt
import
register_opt2
,
op_lifter
,
register_opt
from
.opt
import
register_opt2
,
op_lifter
,
register_opt
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
,
gpu_context_type
class
GpuImages2Neibs
(
GpuKernelBase
,
Images2Neibs
,
Op
):
class
GpuImages2Neibs
(
GpuKernelBase
,
Images2Neibs
,
Op
):
...
@@ -22,13 +21,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -22,13 +21,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Images2Neibs for the GPU.
Images2Neibs for the GPU.
"""
"""
def
__init__
(
self
,
mode
=
'valid'
):
params_type
=
ParamsType
(
mode
=
Images2Neibs
.
BORDER_MODE
,
context
=
gpu_context_type
)
if
mode
not
in
[
'valid'
,
'half'
,
'full'
,
'ignore_borders'
,
'wrap_centered'
]:
def
get_params
(
self
,
node
):
raise
NotImplementedError
(
"Only the mode valid, half, full, "
return
self
.
params_type
.
get_params
(
self
,
context
=
node
.
inputs
[
0
]
.
type
.
context
)
"ignore_borders and wrap_centered have "
"been implemented for GpuImages2Neibs"
)
self
.
mode
=
mode
def
make_node
(
self
,
ten4
,
neib_shape
,
neib_step
=
None
):
def
make_node
(
self
,
ten4
,
neib_shape
,
neib_step
=
None
):
ten4
=
as_gpuarray_variable
(
ten4
,
infer_context_name
(
ten4
))
ten4
=
as_gpuarray_variable
(
ten4
,
infer_context_name
(
ten4
))
...
@@ -50,7 +46,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -50,7 +46,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
context_name
=
ten4
.
type
.
context_name
)()])
context_name
=
ten4
.
type
.
context_name
)()])
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
1
2
,)
return
(
1
3
,)
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
...
@@ -61,13 +57,16 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -61,13 +57,16 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
flags
=
Kernel
.
get_flags
(
dtype_ten4
,
dtype_z
)
flags
=
Kernel
.
get_flags
(
dtype_ten4
,
dtype_z
)
type_ten4
=
gpuarray
.
dtype_to_ctype
(
dtype_ten4
)
type_ten4
=
gpuarray
.
dtype_to_ctype
(
dtype_ten4
)
type_z
=
gpuarray
.
dtype_to_ctype
(
dtype_z
)
type_z
=
gpuarray
.
dtype_to_ctype
(
dtype_z
)
mode
=
self
.
mode
# `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here.
mode_constants
=
self
.
BORDER_MODE
.
c_support_code
()
kernels
=
[]
kernels
=
[]
kname
=
"k_multi_warp_less"
kname
=
"k_multi_warp_less"
k_var
=
"k_multi_warp_less_"
+
nodename
k_var
=
"k_multi_warp_less_"
+
nodename
code
=
"""
code
=
"""
// a version that uses less registers but doesn't work in all cases.
// a version that uses less registers but doesn't work in all cases.
%(mode_constants)
s
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_int mode,
const ga_int nb_batch,
const ga_int nb_batch,
const ga_int nb_stack,
const ga_int nb_stack,
const ga_int height,
const ga_int height,
...
@@ -110,29 +109,29 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -110,29 +109,29 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
ga_int i = LID_1; // loop over c
ga_int i = LID_1; // loop over c
{
{
ga_int ten4_2 = i + a * step_x;
ga_int ten4_2 = i + a * step_x;
if(
"
%(mode)
s"=="wrap_centered")
{
if(
mode == MODE_WRAP_CENTERED)
{
ten4_2 -= wrap_centered_half_idx_shift_x;
ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 )
if ( ten4_2 < 0 )
ten4_2 += height;
ten4_2 += height;
else if (ten4_2 >= height)
else if (ten4_2 >= height)
ten4_2 -= height;
ten4_2 -= height;
} else if (
"
%(mode)
s"=="half")
{
} else if (
mode == MODE_HALF)
{
ten4_2 -= wrap_centered_half_idx_shift_x;
ten4_2 -= wrap_centered_half_idx_shift_x;
} else if (
"
%(mode)
s"=="full")
{
} else if (
mode == MODE_FULL)
{
ten4_2 -= c - 1;
ten4_2 -= c - 1;
}
}
ga_int j = LID_0; // loop over d
ga_int j = LID_0; // loop over d
{
{
ga_int ten4_3 = j + b * step_y;
ga_int ten4_3 = j + b * step_y;
if(
"
%(mode)
s"=="wrap_centered"
){
if(
mode == MODE_WRAP_CENTERED
){
ten4_3 -= wrap_centered_half_idx_shift_y;
ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 )
if ( ten4_3 < 0 )
ten4_3 += width;
ten4_3 += width;
else if (ten4_3 >= width)
else if (ten4_3 >= width)
ten4_3 -= width;
ten4_3 -= width;
} else if (
"
%(mode)
s"=="half")
{
} else if (
mode == MODE_HALF)
{
ten4_3 -= wrap_centered_half_idx_shift_y;
ten4_3 -= wrap_centered_half_idx_shift_y;
} else if (
"
%(mode)
s"=="full")
{
} else if (
mode == MODE_FULL)
{
ten4_3 -= d - 1;
ten4_3 -= d - 1;
}
}
...
@@ -150,8 +149,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -150,8 +149,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}
}
}
}
}
}
}"""
%
locals
(
)
}"""
%
dict
(
kname
=
kname
,
type_ten4
=
type_ten4
,
type_z
=
type_z
,
mode_constants
=
mode_constants
)
params
=
[
params
=
[
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'uintp'
,
'uintp'
,
'uintp'
,
'uintp'
,
'uintp'
,
'uintp'
,
'uintp'
,
'uintp'
,
...
@@ -165,7 +165,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -165,7 +165,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname
=
"k_multi_warp"
kname
=
"k_multi_warp"
k_var
=
"k_multi_warp_"
+
nodename
k_var
=
"k_multi_warp_"
+
nodename
code
=
"""
code
=
"""
%(mode_constants)
s
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const ga_int mode,
const ga_int nb_batch,
const ga_int nb_batch,
const ga_int nb_stack,
const ga_int nb_stack,
const ga_int height,
const ga_int height,
...
@@ -209,30 +211,30 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -209,30 +211,30 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
for (ga_int i = LID_1; i < c; i+=LDIM_1)
for (ga_int i = LID_1; i < c; i+=LDIM_1)
{
{
ga_int ten4_2 = i + a * step_x;
ga_int ten4_2 = i + a * step_x;
if(
"
%(mode)
s"=="wrap_centered")
{
if(
mode == MODE_WRAP_CENTERED)
{
ten4_2 -= wrap_centered_half_idx_shift_x;
ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 )
if ( ten4_2 < 0 )
ten4_2 += height;
ten4_2 += height;
else if (ten4_2 >= height)
else if (ten4_2 >= height)
ten4_2 -= height;
ten4_2 -= height;
} else if (
"
%(mode)
s"=="half")
{
} else if (
mode == MODE_HALF)
{
ten4_2 -= wrap_centered_half_idx_shift_x;
ten4_2 -= wrap_centered_half_idx_shift_x;
} else if (
"
%(mode)
s"=="full")
{
} else if (
mode == MODE_FULL)
{
ten4_2 -= c - 1;
ten4_2 -= c - 1;
}
}
// loop over d
// loop over d
for (ga_int j = LID_0; j < d; j+=LDIM_0)
for (ga_int j = LID_0; j < d; j+=LDIM_0)
{
{
ga_int ten4_3 = j + b * step_y;
ga_int ten4_3 = j + b * step_y;
if(
"
%(mode)
s"=="wrap_centered")
{
if(
mode == MODE_WRAP_CENTERED)
{
ten4_3 -= wrap_centered_half_idx_shift_y;
ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 )
if ( ten4_3 < 0 )
ten4_3 += width;
ten4_3 += width;
else if (ten4_3 >= width)
else if (ten4_3 >= width)
ten4_3 -= width;
ten4_3 -= width;
} else if (
"
%(mode)
s"=="half")
{
} else if (
mode == MODE_HALF)
{
ten4_3 -= wrap_centered_half_idx_shift_y;
ten4_3 -= wrap_centered_half_idx_shift_y;
} else if (
"
%(mode)
s"=="full")
{
} else if (
mode == MODE_FULL)
{
ten4_3 -= d - 1;
ten4_3 -= d - 1;
}
}
...
@@ -251,8 +253,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -251,8 +253,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}
}
}
}
}
}
"""
%
locals
(
)
"""
%
dict
(
kname
=
kname
,
type_ten4
=
type_ten4
,
type_z
=
type_z
,
mode_constants
=
mode_constants
)
params
=
[
params
=
[
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'uintp'
,
'uintp'
,
'uintp'
,
'uintp'
,
'uintp'
,
'uintp'
,
'uintp'
,
'uintp'
,
...
@@ -274,18 +277,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -274,18 +277,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
"""
"""
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
dtype_ten4
=
node
.
inputs
[
0
]
.
dtype
dtype_neib_shape
=
node
.
inputs
[
1
]
.
dtype
dtype_neib_step
=
node
.
inputs
[
2
]
.
dtype
dtype_z
=
node
.
outputs
[
0
]
.
dtype
itemsize_ten4
=
np
.
dtype
(
dtype_ten4
)
.
itemsize
itemsize_z
=
np
.
dtype
(
dtype_z
)
.
itemsize
typecode_z
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
ten4
,
neib_shape
,
neib_step
=
inp
z
,
=
out
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'params'
]
mode
=
self
.
mode
err_check
=
"""
err_check
=
"""
if (err != GA_NO_ERROR) {
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
PyErr_Format(PyExc_RuntimeError,
...
@@ -293,16 +284,23 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -293,16 +284,23 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
GpuKernel_error(fptr, err));
GpuKernel_error(fptr, err));
%(fail)
s;
%(fail)
s;
}
}
"""
%
locals
(
)
"""
%
dict
(
fail
=
sub
[
'fail'
]
)
sync
=
""
sync
=
""
if
config
.
gpuarray
.
sync
:
if
config
.
gpuarray
.
sync
:
sync
=
"""
sync
=
"""
err = GpuArray_sync(&
%(z)
s->ga);
err = GpuArray_sync(&
%(z)
s->ga);
%(err_check)
s
%(err_check)
s
"""
%
locals
()
"""
%
dict
(
z
=
out
[
0
],
err_check
=
err_check
)
# NB: To reduce C code variability:
# For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize
# For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node)
# For typecode_z, I use ten4->ga.typecode (for same reason as above)
return
"""
return
"""
int grid_c = -1;
int grid_c = -1;
int grid_d = -1;
int grid_d = -1;
size_t itemsize_ten4 = GpuArray_ITEMSIZE(&
%(ten4)
s->ga);
size_t itemsize_z = itemsize_ten4;
int typecode_z =
%(ten4)
s->ga.typecode;
{
{
if (PyGpuArray_NDIM(
%(ten4)
s) != 4)
if (PyGpuArray_NDIM(
%(ten4)
s) != 4)
...
@@ -351,7 +349,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -351,7 +349,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(fail)
s;
%(fail)
s;
}
}
if (
"
%(mode)
s" == "wrap_centered"
) {
if (
%(params)
s->mode == MODE_WRAP_CENTERED
) {
if (c
%%2
!=1 || d
%%2
!=1){
if (c
%%2
!=1 || d
%%2
!=1){
PyErr_Format(PyExc_TypeError,
PyErr_Format(PyExc_TypeError,
"GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
"GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
...
@@ -375,7 +373,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -375,7 +373,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
(size_t)step_y);
(size_t)step_y);
}
else if ( "
%(mode)
s" == "valid"
) {
}
else if (
%(params)
s->mode == MODE_VALID
) {
if ( ((PyGpuArray_DIMS(
%(ten4)
s))[2] < c) ||
if ( ((PyGpuArray_DIMS(
%(ten4)
s))[2] < c) ||
((((PyGpuArray_DIMS(
%(ten4)
s))[2]-c)
%%
step_x)!=0))
((((PyGpuArray_DIMS(
%(ten4)
s))[2]-c)
%%
step_x)!=0))
{
{
...
@@ -400,12 +398,12 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -400,12 +398,12 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[2]-c)/step_x);
grid_c = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[2]-c)/step_x);
//number of patch in width
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[3]-d)/step_y);
grid_d = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[3]-d)/step_y);
}
else if ( "
%(mode)
s" == "ignore_borders"
) {
}
else if (
%(params)
s->mode == MODE_IGNORE_BORDERS
) {
//number of patch in height
//number of patch in height
grid_c = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[2]-c)/step_x);
grid_c = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[2]-c)/step_x);
//number of patch in width
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[3]-d)/step_y);
grid_d = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[3]-d)/step_y);
}
else if ( "
%(mode)
s" == "half"
) {
}
else if (
%(params)
s->mode == MODE_HALF
) {
if ( ((PyGpuArray_DIMS(
%(ten4)
s))[2] < c) ||
if ( ((PyGpuArray_DIMS(
%(ten4)
s))[2] < c) ||
((((PyGpuArray_DIMS(
%(ten4)
s))[2]-(c
%%2
))
%%
step_x)!=0))
((((PyGpuArray_DIMS(
%(ten4)
s))[2]-(c
%%2
))
%%
step_x)!=0))
{
{
...
@@ -430,7 +428,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -430,7 +428,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[2]-(c
%%2
))/step_x);
grid_c = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[2]-(c
%%2
))/step_x);
//number of patch in width
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[3]-(d
%%2
))/step_y);
grid_d = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[3]-(d
%%2
))/step_y);
}
else if ( "
%(mode)
s" == "full"
) {
}
else if (
%(params)
s->mode == MODE_FULL
) {
if ( ((PyGpuArray_DIMS(
%(ten4)
s))[2] < c) ||
if ( ((PyGpuArray_DIMS(
%(ten4)
s))[2] < c) ||
( (((PyGpuArray_DIMS(
%(ten4)
s))[2]+c-2)
%%
step_x)!=0))
( (((PyGpuArray_DIMS(
%(ten4)
s))[2]+c-2)
%%
step_x)!=0))
{
{
...
@@ -455,9 +453,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -455,9 +453,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
grid_c = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[2]+c-2)/step_x);
grid_c = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[2]+c-2)/step_x);
//number of patch in width
//number of patch in width
grid_d = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[3]+d-2)/step_y);
grid_d = 1+(((PyGpuArray_DIMS(
%(ten4)
s))[3]+d-2)/step_y);
}
else
{
}
else
{
PyErr_Format(PyExc_TypeError,
PyErr_Format(PyExc_TypeError,
"GpuImages2Neibs:: unknown mode
'
%(mode)
s'"
);
"GpuImages2Neibs:: unknown mode
%%
d",
%(params)
s->mode
);
%(fail)
s;
%(fail)
s;
}
}
...
@@ -476,8 +474,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -476,8 +474,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t dims[2];
size_t dims[2];
dims[0] = z_dim0;
dims[0] = z_dim0;
dims[1] = z_dim1;
dims[1] = z_dim1;
%(z)
s = pygpu_empty(2, dims,
%(typecode_z)
s
,
%(z)
s = pygpu_empty(2, dims,
typecode_z
,
GA_C_ORDER,
%(
ctx)
s
, Py_None);
GA_C_ORDER,
%(
params)
s->context
, Py_None);
if (!
%(z)
s)
if (!
%(z)
s)
{
{
PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
...
@@ -490,6 +488,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -490,6 +488,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
{ // NESTED SCOPE
{ // NESTED SCOPE
const int mode =
%(params)
s->mode;
const int nb_batch = PyGpuArray_DIMS(
%(ten4)
s)[0];
const int nb_batch = PyGpuArray_DIMS(
%(ten4)
s)[0];
const int nb_stack = PyGpuArray_DIMS(
%(ten4)
s)[1];
const int nb_stack = PyGpuArray_DIMS(
%(ten4)
s)[1];
const int height = PyGpuArray_DIMS(
%(ten4)
s)[2];
const int height = PyGpuArray_DIMS(
%(ten4)
s)[2];
...
@@ -507,7 +506,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -507,7 +506,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
size_t threads_per_block[3] = {d, c, 1};
size_t threads_per_block[3] = {d, c, 1};
//get the max threads per blocks
//get the max threads per blocks
size_t max_threads_dim;
size_t max_threads_dim;
int err = gpucontext_property(
%(
ctx)
s
->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
int err = gpucontext_property(
%(
params)
s->context
->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
if (err != GA_NO_ERROR){
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
%(fail)
s;
%(fail)
s;
...
@@ -535,14 +534,19 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -535,14 +534,19 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
}else{
}else{
fptr = &k_multi_warp_
%(name)
s;
fptr = &k_multi_warp_
%(name)
s;
}
}
// printf("
%%
zu
%%
zu
%%
zu
%%
zu
%%
zu
%%
zu
%%
zu
\\
n", max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1], n_blocks[2]);
/*
size_t stride_A0 = PyGpuArray_STRIDES(
%(ten4)
s)[0] /
%(itemsize_ten4)
s;
printf("
%%
zu
%%
zu
%%
zu
%%
zu
%%
zu
%%
zu
%%
zu
\\
n",
size_t stride_A1 = PyGpuArray_STRIDES(
%(ten4)
s)[1] /
%(itemsize_ten4)
s;
max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2],
size_t stride_A2 = PyGpuArray_STRIDES(
%(ten4)
s)[2] /
%(itemsize_ten4)
s;
n_blocks[0], n_blocks[1], n_blocks[2]);
size_t stride_A3 = PyGpuArray_STRIDES(
%(ten4)
s)[3] /
%(itemsize_ten4)
s;
*/
size_t stride_Z0 = PyGpuArray_STRIDES(
%(z)
s)[0] /
%(itemsize_z)
s;
size_t stride_A0 = PyGpuArray_STRIDES(
%(ten4)
s)[0] / itemsize_ten4;
size_t stride_Z1 = PyGpuArray_STRIDES(
%(z)
s)[1] /
%(itemsize_z)
s;
size_t stride_A1 = PyGpuArray_STRIDES(
%(ten4)
s)[1] / itemsize_ten4;
void *kernel_params[] = {(void *)&nb_batch,
size_t stride_A2 = PyGpuArray_STRIDES(
%(ten4)
s)[2] / itemsize_ten4;
size_t stride_A3 = PyGpuArray_STRIDES(
%(ten4)
s)[3] / itemsize_ten4;
size_t stride_Z0 = PyGpuArray_STRIDES(
%(z)
s)[0] / itemsize_z;
size_t stride_Z1 = PyGpuArray_STRIDES(
%(z)
s)[1] / itemsize_z;
void *kernel_params[] = {(void *)&mode,
(void *)&nb_batch,
(void *)&nb_stack,
(void *)&nb_stack,
(void *)&height, (void *)&width,
(void *)&height, (void *)&width,
(void *)&c, (void *)&d,
(void *)&c, (void *)&d,
...
@@ -562,11 +566,18 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -562,11 +566,18 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(err_check)
s
%(err_check)
s
%(sync)
s
%(sync)
s
} // END NESTED SCOPE
} // END NESTED SCOPE
"""
%
locals
()
"""
%
dict
(
ten4
=
inp
[
0
],
neib_shape
=
inp
[
1
],
neib_step
=
inp
[
2
],
z
=
out
[
0
],
dtype_neib_shape
=
node
.
inputs
[
1
]
.
dtype
,
def
perform
(
self
,
node
,
inp
,
out
,
ctx
):
dtype_neib_step
=
node
.
inputs
[
2
]
.
dtype
,
err_check
=
err_check
,
sync
=
sync
,
name
=
name
,
params
=
sub
[
'params'
],
fail
=
sub
[
'fail'
])
def
perform
(
self
,
node
,
inp
,
out
,
params
):
# Disable the perform method from the CPU version
# Disable the perform method from the CPU version
Op
.
perform
(
self
,
node
,
inp
,
out
,
ctx
)
Op
.
perform
(
self
,
node
,
inp
,
out
,
params
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
...
...
theano/gpuarray/rng_mrg.py
浏览文件 @
5df0cfd8
...
@@ -7,16 +7,15 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
...
@@ -7,16 +7,15 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
"""
"""
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
as
np
from
theano
import
Apply
,
tensor
from
theano
import
Apply
,
tensor
from
theano.gof
import
local_optimizer
from
theano.gof
import
local_optimizer
from
theano.sandbox.rng_mrg
import
mrg_uniform_base
,
mrg_uniform
from
theano.sandbox.rng_mrg
import
mrg_uniform_base
,
mrg_uniform
from
theano.tensor
import
as_tensor_variable
,
get_vector_length
from
theano.tensor
import
as_tensor_variable
,
get_vector_length
from
theano.scalar
import
int32
as
int_t
from
.basic_ops
import
(
GpuKernelBase
,
Kernel
,
infer_context_name
,
from
.basic_ops
import
(
GpuKernelBase
,
Kernel
,
infer_context_name
,
host_from_gpu
,
as_gpuarray_variable
)
host_from_gpu
,
as_gpuarray_variable
)
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
,
gpu_context_type
from
.fp16_help
import
write_w
from
.fp16_help
import
write_w
from
.opt
import
register_opt
,
register_opt2
from
.opt
import
register_opt
,
register_opt2
...
@@ -24,6 +23,9 @@ from .opt import register_opt, register_opt2
...
@@ -24,6 +23,9 @@ from .opt import register_opt, register_opt2
class
GPUA_mrg_uniform
(
GpuKernelBase
,
mrg_uniform_base
):
class
GPUA_mrg_uniform
(
GpuKernelBase
,
mrg_uniform_base
):
# GpuArray version
# GpuArray version
_f16_ok
=
True
_f16_ok
=
True
params_type
=
mrg_uniform_base
.
params_type
.
extended
(
otypecode
=
int_t
,
context
=
gpu_context_type
)
otypecode
=
property
(
lambda
self
:
self
.
output_type
.
typecode
)
def
make_node
(
self
,
rstate
,
size
):
def
make_node
(
self
,
rstate
,
size
):
# error checking slightly redundant here, since
# error checking slightly redundant here, since
...
@@ -39,6 +41,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -39,6 +41,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
[
rstate
,
size
],
[
rstate
,
size
],
[
rstate
.
type
(),
output_type
])
[
rstate
.
type
(),
output_type
])
def
get_params
(
self
,
node
):
return
self
.
params_type
.
get_params
(
self
,
context
=
node
.
inputs
[
0
]
.
type
.
context
)
@classmethod
@classmethod
def
new
(
cls
,
rstate
,
ndim
,
dtype
,
size
):
def
new
(
cls
,
rstate
,
ndim
,
dtype
,
size
):
v_size
=
as_tensor_variable
(
size
)
v_size
=
as_tensor_variable
(
size
)
...
@@ -168,40 +173,34 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -168,40 +173,34 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
]
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
rstate
,
size
=
inp
o_rstate
,
o_sample
=
out
inplace
=
int
(
self
.
inplace
)
ndim
=
self
.
output_type
.
ndim
o_type_num
=
np
.
asarray
(
0
,
dtype
=
self
.
output_type
.
dtype
)
.
dtype
.
num
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'params'
]
kname
=
self
.
gpu_kernels
(
node
,
nodename
)[
0
]
.
objvar
otypecode
=
str
(
self
.
output_type
.
typecode
)
return
"""
return
"""
npy_int64 M1 = 2147483647; //2^31 - 1
npy_int64 M1 = 2147483647; //2^31 - 1
// The +1 is to avoid odims[0] which fails on windows
size_t odims[
%(ndim)
s+1];
size_t n_elements = 1;
size_t n_elements = 1;
unsigned int n_streams;
unsigned int n_streams;
int must_alloc_sample = ((NULL ==
%(o_sample)
s)
int must_alloc_sample = ((NULL ==
%(o_sample)
s)
|| !pygpu_GpuArray_Check((PyObject*)
%(o_sample)
s)
|| !pygpu_GpuArray_Check((PyObject*)
%(o_sample)
s)
|| !(
%(o_sample)
s->ga.flags & GA_C_CONTIGUOUS)
|| !(
%(o_sample)
s->ga.flags & GA_C_CONTIGUOUS)
|| (PyGpuArray_NDIM(
%(o_sample)
s) !=
%(ndim)
s));
|| (PyGpuArray_NDIM(
%(o_sample)
s) !=
%(params)
s->ndim));
size_t* odims = (size_t*)malloc(
%(params)
s->ndim * sizeof(size_t));
if (odims == NULL) {
PyErr_NoMemory();
%(just_fail)
s
}
if (PyArray_NDIM(
%(size)
s) != 1)
if (PyArray_NDIM(
%(size)
s) != 1)
{
{
PyErr_SetString(PyExc_ValueError, "size must be vector");
PyErr_SetString(PyExc_ValueError, "size must be vector");
%(fail)
s
%(fail)
s
}
}
if (PyArray_DIMS(
%(size)
s)[0] !=
%(
ndim)
s
)
if (PyArray_DIMS(
%(size)
s)[0] !=
%(
params)
s->ndim
)
{
{
PyErr_Format(PyExc_ValueError, "size must have length
%%
i (not
%%
li)",
PyErr_Format(PyExc_ValueError, "size must have length
%%
i (not
%%
li)",
%(
ndim)
s
, PyArray_DIMS(
%(size)
s)[0]);
%(
params)
s->ndim
, PyArray_DIMS(
%(size)
s)[0]);
%(fail)
s
%(fail)
s
}
}
for (int i = 0; i <
%(
ndim)
s
; ++i)
for (int i = 0; i <
%(
params)
s->ndim
; ++i)
{
{
odims[i] = *(dtype_
%(size)
s *)PyArray_GETPTR1(
%(size)
s, i);
odims[i] = *(dtype_
%(size)
s *)PyArray_GETPTR1(
%(size)
s, i);
n_elements *= odims[i];
n_elements *= odims[i];
...
@@ -219,8 +218,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -219,8 +218,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
if (must_alloc_sample)
if (must_alloc_sample)
{
{
Py_XDECREF(
%(o_sample)
s);
Py_XDECREF(
%(o_sample)
s);
%(o_sample)
s = pygpu_empty(
%(
ndim)
s, odims,
%(otypecode)
s
, GA_C_ORDER,
%(o_sample)
s = pygpu_empty(
%(
params)
s->ndim, odims,
%(params)
s->otypecode
, GA_C_ORDER,
%(
ctx)
s
, Py_None);
%(
params)
s->context
, Py_None);
if(!
%(o_sample)
s)
if(!
%(o_sample)
s)
{
{
%(fail)
s;
%(fail)
s;
...
@@ -233,7 +232,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -233,7 +232,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
}
}
Py_XDECREF(
%(o_rstate)
s);
Py_XDECREF(
%(o_rstate)
s);
if (
%(
inplace)
s
)
if (
%(
params)
s->inplace
)
{
{
Py_INCREF(
%(rstate)
s);
Py_INCREF(
%(rstate)
s);
%(o_rstate)
s =
%(rstate)
s;
%(o_rstate)
s =
%(rstate)
s;
...
@@ -285,10 +284,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -285,10 +284,22 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(fail)
s
%(fail)
s
}
}
}
}
"""
%
locals
()
free(odims);
"""
%
dict
(
rstate
=
inp
[
0
],
size
=
inp
[
1
],
o_rstate
=
out
[
0
],
o_sample
=
out
[
1
],
kname
=
self
.
gpu_kernels
(
node
,
nodename
)[
0
]
.
objvar
,
params
=
sub
[
'params'
],
just_fail
=
sub
[
'fail'
],
fail
=
"""
{
free(odims);
%(fail)
s
}
"""
%
dict
(
fail
=
sub
[
'fail'
]))
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
1
4
,)
return
(
1
5
,)
@register_opt2
([
mrg_uniform
],
'fast_compile'
)
@register_opt2
([
mrg_uniform
],
'fast_compile'
)
...
...
theano/gpuarray/subtensor.py
浏览文件 @
5df0cfd8
...
@@ -7,9 +7,11 @@ from six import integer_types
...
@@ -7,9 +7,11 @@ from six import integer_types
from
six.moves
import
StringIO
from
six.moves
import
StringIO
from
theano
import
tensor
,
gof
,
Op
from
theano
import
tensor
,
gof
,
Op
from
theano.gof
import
ParamsType
from
theano.gradient
import
grad_not_implemented
from
theano.gradient
import
grad_not_implemented
import
theano.tensor
as
T
import
theano.tensor
as
T
from
theano.tensor.subtensor
import
IncSubtensor
,
Subtensor
,
get_idx_list
from
theano.tensor.subtensor
import
IncSubtensor
,
Subtensor
,
get_idx_list
from
theano.scalar
import
bool
as
bool_t
,
int32
as
int_t
,
uint32
as
size_t
try
:
try
:
import
pygpu
import
pygpu
...
@@ -594,7 +596,15 @@ class GpuAdvancedIncSubtensor1(Op):
...
@@ -594,7 +596,15 @@ class GpuAdvancedIncSubtensor1(Op):
"""
"""
_f16_ok
=
True
_f16_ok
=
True
__props__
=
(
'inplace'
,
'set_instead_of_inc'
)
__props__
=
(
'inplace'
,
'set_instead_of_inc'
)
params_type
=
gpu_context_type
params_type
=
ParamsType
(
inplace
=
bool_t
,
set_instead_of_inc
=
bool_t
,
context
=
gpu_context_type
,
# following params are used into c_init_code_struct(),
# as inputs are not available in that function.
ndim_input_0
=
size_t
,
ndim_input_1
=
size_t
,
typecode_input_0
=
int_t
,
typecode_input_1
=
int_t
)
def
__init__
(
self
,
inplace
=
False
,
set_instead_of_inc
=
False
):
def
__init__
(
self
,
inplace
=
False
,
set_instead_of_inc
=
False
):
self
.
inplace
=
inplace
self
.
inplace
=
inplace
...
@@ -634,12 +644,17 @@ class GpuAdvancedIncSubtensor1(Op):
...
@@ -634,12 +644,17 @@ class GpuAdvancedIncSubtensor1(Op):
return
gof
.
Apply
(
self
,
[
x_
,
y_
,
ilist_
],
[
x_
.
type
()])
return
gof
.
Apply
(
self
,
[
x_
,
y_
,
ilist_
],
[
x_
.
type
()])
def
get_params
(
self
,
node
):
def
get_params
(
self
,
node
):
return
node
.
outputs
[
0
]
.
type
.
context
return
self
.
params_type
.
get_params
(
self
,
context
=
node
.
outputs
[
0
]
.
type
.
context
,
# following params are used into c_init_code_struct().
ndim_input_0
=
node
.
inputs
[
0
]
.
ndim
,
ndim_input_1
=
node
.
inputs
[
1
]
.
ndim
,
typecode_input_0
=
node
.
inputs
[
0
]
.
type
.
typecode
,
typecode_input_1
=
node
.
inputs
[
1
]
.
type
.
typecode
)
# We can't use the parent version that loops on each index
# We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the
# as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case.
# parent doesn't loop in that case.
def
perform
(
self
,
node
,
inp
,
out_
,
ctx
=
None
):
def
perform
(
self
,
node
,
inp
,
out_
,
params
=
None
):
# TODO opt to make this inplace
# TODO opt to make this inplace
x
,
y
,
idx
=
inp
x
,
y
,
idx
=
inp
out
,
=
out_
out
,
=
out_
...
@@ -700,21 +715,18 @@ class GpuAdvancedIncSubtensor1(Op):
...
@@ -700,21 +715,18 @@ class GpuAdvancedIncSubtensor1(Op):
return
"""
return
"""
gpuelemwise_arg args[2] = {{0}};
gpuelemwise_arg args[2] = {{0}};
args[0].name = "a";
args[0].name = "a";
args[0].typecode =
%(
type1)
s
;
args[0].typecode =
%(
params)
s->typecode_input_0
;
args[0].flags = GE_READ|GE_WRITE;
args[0].flags = GE_READ|GE_WRITE;
args[1].name = "b";
args[1].name = "b";
args[1].typecode =
%(
type2)
s
;
args[1].typecode =
%(
params)
s->typecode_input_1
;
args[1].flags = GE_READ;
args[1].flags = GE_READ;
iadd = GpuElemwise_new(
%(
ctx)
s
->ctx, "", "a += b",
iadd = GpuElemwise_new(
%(
params)
s->context
->ctx, "", "a += b",
2, args,
%(
nd)
s
, GE_CONVERT_F16);
2, args,
%(
params)
s->ndim_input_1
, GE_CONVERT_F16);
if (iadd == NULL) {
if (iadd == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
%(fail)
s
%(fail)
s
}
}
"""
%
dict
(
ctx
=
sub
[
'params'
],
fail
=
sub
[
'fail'
],
"""
%
dict
(
params
=
sub
[
'params'
],
fail
=
sub
[
'fail'
])
type1
=
node
.
inputs
[
0
]
.
type
.
typecode
,
type2
=
node
.
inputs
[
1
]
.
type
.
typecode
,
nd
=
node
.
inputs
[
1
]
.
ndim
)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
if
(
node
.
inputs
[
0
]
.
ndim
!=
node
.
inputs
[
1
]
.
ndim
):
if
(
node
.
inputs
[
0
]
.
ndim
!=
node
.
inputs
[
1
]
.
ndim
):
...
@@ -722,18 +734,26 @@ class GpuAdvancedIncSubtensor1(Op):
...
@@ -722,18 +734,26 @@ class GpuAdvancedIncSubtensor1(Op):
return
"""
return
"""
PyGpuArrayObject *row_x, *row_y;
PyGpuArrayObject *row_x, *row_y;
ssize_t start[
%(nd)
s], step[
%(nd)
s];
size_t nd =
%(params)
s->ndim_input_0;
ssize_t *start = NULL, *step = NULL;
size_t num_indices, j;
size_t num_indices, j;
int ret;
int ret;
int broadcast_y;
int broadcast_y;
for (j = 0; j <
%(nd)
s; j++) {
start = (ssize_t*)malloc(nd * sizeof(ssize_t));
step = (ssize_t*)malloc(nd * sizeof(ssize_t));
if (start == NULL || step == NULL) {
PyErr_NoMemory();
%(fail)
s
}
for (j = 0; j < nd; ++j) {
start[j] = 0;
start[j] = 0;
step[j] = 1;
step[j] = 1;
}
}
step[0] = 0;
step[0] = 0;
num_indices = PyArray_SIZE(
%(ind)
s);
num_indices = PyArray_SIZE(
%(ind)
s);
if (!
%(
inplace)
s
) {
if (!
%(
params)
s->inplace
) {
%(out)
s = theano_try_copy(
%(out)
s,
%(x)
s);
%(out)
s = theano_try_copy(
%(out)
s,
%(x)
s);
if (
%(out)
s == NULL) {
if (
%(out)
s == NULL) {
// Exception already set
// Exception already set
...
@@ -774,7 +794,7 @@ class GpuAdvancedIncSubtensor1(Op):
...
@@ -774,7 +794,7 @@ class GpuAdvancedIncSubtensor1(Op):
%(fail)
s;
%(fail)
s;
}
}
if (
%(
set_instead_of_inc)
s
) {
if (
%(
params)
s->set_instead_of_inc
) {
ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
} else {
} else {
void *args[2];
void *args[2];
...
@@ -788,13 +808,21 @@ class GpuAdvancedIncSubtensor1(Op):
...
@@ -788,13 +808,21 @@ class GpuAdvancedIncSubtensor1(Op):
PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
}
}
}
}
free(start);
free(step);
"""
%
dict
(
x
=
inputs
[
0
],
y
=
inputs
[
1
],
ind
=
inputs
[
2
],
out
=
outputs
[
0
],
"""
%
dict
(
x
=
inputs
[
0
],
y
=
inputs
[
1
],
ind
=
inputs
[
2
],
out
=
outputs
[
0
],
fail
=
sub
[
'fail'
],
inplace
=
int
(
self
.
inplace
),
params
=
sub
[
'params'
],
nd
=
node
.
inputs
[
0
]
.
ndim
,
fail
=
"""
set_instead_of_inc
=
int
(
self
.
set_instead_of_inc
))
{
free(start);
free(step);
%(fail)
s
}
"""
%
dict
(
fail
=
sub
[
'fail'
]))
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
3
,)
return
(
4
,)
class
GpuAdvancedIncSubtensor1_dev20
(
GpuKernelBase
,
HideC
,
class
GpuAdvancedIncSubtensor1_dev20
(
GpuKernelBase
,
HideC
,
...
@@ -805,6 +833,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
...
@@ -805,6 +833,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
"""
"""
_f16_ok
=
True
_f16_ok
=
True
params_type
=
GpuAdvancedIncSubtensor1
.
params_type
get_params
=
GpuAdvancedIncSubtensor1
.
get_params
def
make_node
(
self
,
x
,
y
,
ilist
):
def
make_node
(
self
,
x
,
y
,
ilist
):
"""
"""
...
@@ -837,14 +867,11 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
...
@@ -837,14 +867,11 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return
gof
.
Apply
(
self
,
[
x_
,
y_
,
ilist_
],
[
x_
.
type
()])
return
gof
.
Apply
(
self
,
[
x_
,
y_
,
ilist_
],
[
x_
.
type
()])
def
get_params
(
self
,
node
):
def
perform
(
self
,
node
,
inp
,
out
,
params
):
return
node
.
outputs
[
0
]
.
type
.
context
def
perform
(
self
,
node
,
inp
,
out
,
ctx
):
return
super
(
GpuAdvancedIncSubtensor1_dev20
,
self
)
.
perform
(
node
,
inp
,
out
)
return
super
(
GpuAdvancedIncSubtensor1_dev20
,
self
)
.
perform
(
node
,
inp
,
out
)
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
1
2
,)
return
(
1
3
,)
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
,
'<gpuarray_helper.h>'
,
return
[
'<numpy_compat.h>'
,
'<gpuarray_helper.h>'
,
...
@@ -854,7 +881,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
...
@@ -854,7 +881,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return
[
os
.
path
.
dirname
(
__file__
)]
return
[
os
.
path
.
dirname
(
__file__
)]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
ctx
=
self
.
get_params
(
node
)
ctx
=
self
.
get_params
(
node
)
.
context
if
ctx
.
kind
!=
b
'cuda'
:
if
ctx
.
kind
!=
b
'cuda'
:
raise
NotImplementedError
(
"cuda only"
)
raise
NotImplementedError
(
"cuda only"
)
if
(
node
.
inputs
[
0
]
.
ndim
!=
node
.
inputs
[
1
]
.
ndim
or
if
(
node
.
inputs
[
0
]
.
ndim
!=
node
.
inputs
[
1
]
.
ndim
or
...
@@ -862,16 +889,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
...
@@ -862,16 +889,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
int
(
ctx
.
bin_id
[
-
2
])
<
2
):
int
(
ctx
.
bin_id
[
-
2
])
<
2
):
raise
NotImplementedError
(
"This case does not have C code yet."
)
raise
NotImplementedError
(
"This case does not have C code yet."
)
x
=
inputs
[
0
]
y
=
inputs
[
1
]
ind
=
inputs
[
2
]
out
=
outputs
[
0
]
fail
=
sub
[
'fail'
]
set_instead_of_inc
=
int
(
self
.
set_instead_of_inc
)
inplace
=
int
(
self
.
inplace
)
return
"""
return
"""
int err;
int err;
if (
%(
inplace)
s
) {
if (
%(
params)
s->inplace
) {
Py_XDECREF(
%(out)
s);
Py_XDECREF(
%(out)
s);
%(out)
s =
%(x)
s;
%(out)
s =
%(x)
s;
Py_INCREF(
%(out)
s);
Py_INCREF(
%(out)
s);
...
@@ -882,25 +902,19 @@ if (!%(out)s) {
...
@@ -882,25 +902,19 @@ if (!%(out)s) {
// Exception already set
// Exception already set
%(fail)
s
%(fail)
s
}
}
if (GpuArray_vector_add_fast(
%(out)
s,
%(y)
s,
%(ind)
s,
%(
set_instead_of_inc)
s
)) {
if (GpuArray_vector_add_fast(
%(out)
s,
%(y)
s,
%(ind)
s,
%(
params)
s->set_instead_of_inc
)) {
%(fail)
s
%(fail)
s
}
}
"""
%
locals
(
)
"""
%
dict
(
x
=
inputs
[
0
],
y
=
inputs
[
1
],
ind
=
inputs
[
2
],
out
=
outputs
[
0
],
fail
=
sub
[
'fail'
],
params
=
sub
[
'params'
]
)
def
gpu_kernels
(
self
,
node
,
nodename
):
def
gpu_kernels
(
self
,
node
,
nodename
):
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_y
=
node
.
inputs
[
1
]
.
dtype
dtype_y
=
node
.
inputs
[
1
]
.
dtype
dtype_ind
=
node
.
inputs
[
2
]
.
dtype
dtype_ind
=
node
.
inputs
[
2
]
.
dtype
dtype_out
=
node
.
outputs
[
0
]
.
dtype
itemsize_x
=
np
.
dtype
(
dtype_x
)
.
itemsize
itemsize_y
=
np
.
dtype
(
dtype_y
)
.
itemsize
itemsize_ind
=
np
.
dtype
(
dtype_ind
)
.
itemsize
itemsize_out
=
np
.
dtype
(
dtype_out
)
.
itemsize
flags
=
Kernel
.
get_flags
(
dtype_x
,
dtype_y
,
dtype_ind
)
type_x
=
gpuarray
.
dtype_to_ctype
(
dtype_x
)
type_x
=
gpuarray
.
dtype_to_ctype
(
dtype_x
)
type_y
=
gpuarray
.
dtype_to_ctype
(
dtype_y
)
type_y
=
gpuarray
.
dtype_to_ctype
(
dtype_y
)
type_ind
=
gpuarray
.
dtype_to_ctype
(
dtype_ind
)
type_ind
=
gpuarray
.
dtype_to_ctype
(
dtype_ind
)
type_out
=
gpuarray
.
dtype_to_ctype
(
dtype_out
)
flags
=
Kernel
.
get_flags
(
dtype_x
,
dtype_y
,
dtype_ind
)
kname
=
"k_vector_add_fast"
kname
=
"k_vector_add_fast"
k_var
=
"k_vector_add_fast_"
+
nodename
k_var
=
"k_vector_add_fast_"
+
nodename
code
=
"""
code
=
"""
...
@@ -1010,7 +1024,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
...
@@ -1010,7 +1024,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
}
}
return;
return;
}
}
"""
%
locals
(
)
"""
%
dict
(
type_x
=
type_x
,
type_y
=
type_y
,
type_ind
=
type_ind
)
params
=
[
params
=
[
'uintp'
,
'uintp'
,
'intp'
,
'intp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'uintp'
,
'uintp'
,
'intp'
,
'intp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'uintp'
,
'uintp'
,
'intp'
,
'intp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'uintp'
,
'uintp'
,
'intp'
,
'intp'
,
gpuarray
.
GpuArray
,
'uintp'
,
...
@@ -1020,26 +1034,19 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
...
@@ -1020,26 +1034,19 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
flags
=
flags
,
objvar
=
k_var
)]
flags
=
flags
,
objvar
=
k_var
)]
def
c_support_code_struct
(
self
,
node
,
nodename
):
def
c_support_code_struct
(
self
,
node
,
nodename
):
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_y
=
node
.
inputs
[
1
]
.
dtype
dtype_ind
=
node
.
inputs
[
2
]
.
dtype
dtype_out
=
node
.
outputs
[
0
]
.
dtype
itemsize_x
=
np
.
dtype
(
dtype_x
)
.
itemsize
itemsize_y
=
np
.
dtype
(
dtype_y
)
.
itemsize
itemsize_ind
=
np
.
dtype
(
dtype_ind
)
.
itemsize
itemsize_out
=
np
.
dtype
(
dtype_out
)
.
itemsize
k_var
=
"k_vector_add_fast_"
+
nodename
return
super
(
GpuAdvancedIncSubtensor1_dev20
,
self
)
.
c_support_code_struct
(
node
,
nodename
)
+
"""
return
super
(
GpuAdvancedIncSubtensor1_dev20
,
self
)
.
c_support_code_struct
(
node
,
nodename
)
+
"""
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other,
PyGpuArrayObject* py_other,
PyGpuArrayObject
*
indices_arr,
PyGpuArrayObject
*
indices_arr,
const int set_instead_of_inc)
const int set_instead_of_inc)
{
{
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
gpudata *errbuf;
gpudata *errbuf;
int err, kerr = 0;
int err, kerr = 0;
size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
err = gpudata_property(py_self->ga.data,
err = gpudata_property(py_self->ga.data,
...
@@ -1049,11 +1056,11 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
...
@@ -1049,11 +1056,11 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
return 1;
return 1;
}
}
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] /
%(itemsize_x)
s
;
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] /
itemsize_x
;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] /
%(itemsize_x)
s
;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] /
itemsize_x
;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] /
%(itemsize_y)
s
;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] /
itemsize_y
;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] /
%(itemsize_y)
s
;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] /
itemsize_y
;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] /
%(itemsize_ind)
s
;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] /
itemsize_ind
;
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
(void *)&PyGpuArray_DIMS(py_self)[1],
(void *)&PyGpuArray_DIMS(py_self)[1],
(void *)&stride_X0,
(void *)&stride_X0,
...
@@ -1093,7 +1100,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
...
@@ -1093,7 +1100,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
}
}
return 0;
return 0;
}
}
"""
%
locals
(
)
"""
%
dict
(
k_var
=
"k_vector_add_fast_"
+
nodename
)
class
GpuExtractDiag
(
Op
):
class
GpuExtractDiag
(
Op
):
...
...
theano/tensor/nnet/neighbours.py
浏览文件 @
5df0cfd8
...
@@ -8,6 +8,7 @@ import numpy as np
...
@@ -8,6 +8,7 @@ import numpy as np
import
theano
import
theano
from
theano
import
Op
,
Apply
from
theano
import
Op
,
Apply
from
theano.gof
import
EnumList
import
theano.tensor
as
T
import
theano.tensor
as
T
from
theano.gradient
import
grad_not_implemented
from
theano.gradient
import
grad_not_implemented
from
theano.gradient
import
grad_undefined
from
theano.gradient
import
grad_undefined
...
@@ -39,13 +40,21 @@ class Images2Neibs(Op):
...
@@ -39,13 +40,21 @@ class Images2Neibs(Op):
"""
"""
__props__
=
(
"mode"
,)
__props__
=
(
"mode"
,)
BORDER_MODE
=
EnumList
((
'MODE_VALID'
,
'valid'
),
(
'MODE_HALF'
,
'half'
),
(
'MODE_FULL'
,
'full'
),
(
'MODE_WRAP_CENTERED'
,
'wrap_centered'
),
(
'MODE_IGNORE_BORDERS'
,
'ignore_borders'
))
params_type
=
BORDER_MODE
def
get_params
(
self
,
node
):
return
self
.
mode
def
__init__
(
self
,
mode
=
'valid'
):
def
__init__
(
self
,
mode
=
'valid'
):
if
mode
not
in
[
'valid'
,
'half'
,
'full'
,
implemented_modes
=
self
.
BORDER_MODE
.
get_aliases
()
'wrap_centered'
,
'ignore_borders'
]:
if
mode
not
in
implemented_modes
:
raise
NotImplementedError
(
"Only the mode valid, half, full, "
raise
NotImplementedError
(
"Only modes
%
s have been implemented for
%
s"
"ignore_borders and wrap_centered have "
%
(
', '
.
join
(
implemented_modes
),
type
(
self
)
.
__name__
))
"been implemented for Images2Neibs"
)
self
.
mode
=
mode
self
.
mode
=
mode
def
__str__
(
self
):
def
__str__
(
self
):
...
@@ -159,9 +168,9 @@ class Images2Neibs(Op):
...
@@ -159,9 +168,9 @@ class Images2Neibs(Op):
grad_undefined
(
self
,
2
,
neib_step
)]
grad_undefined
(
self
,
2
,
neib_step
)]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
8
,)
return
(
10
,)
def
perform
(
self
,
node
,
inp
,
out_
):
def
perform
(
self
,
node
,
inp
,
out_
,
params
):
ten4
,
neib_shape
,
neib_step
=
inp
ten4
,
neib_shape
,
neib_step
=
inp
z
,
=
out_
z
,
=
out_
# GpuImages2Neibs should not run this perform in DebugMode
# GpuImages2Neibs should not run this perform in DebugMode
...
@@ -344,11 +353,6 @@ class Images2Neibs(Op):
...
@@ -344,11 +353,6 @@ class Images2Neibs(Op):
return
[(
z_dim0
,
z_dim1
)]
return
[(
z_dim0
,
z_dim1
)]
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
ten4
,
neib_shape
,
neib_step
=
inp
z
,
=
out
fail
=
sub
[
'fail'
]
mode
=
self
.
mode
return
"""
return
"""
#ifndef CEIL_INTDIV
#ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a
%%
b) ? 1: 0))
#define CEIL_INTDIV(a, b) ((a/b) + ((a
%%
b) ? 1: 0))
...
@@ -408,7 +412,7 @@ class Images2Neibs(Op):
...
@@ -408,7 +412,7 @@ class Images2Neibs(Op):
%(fail)
s;
%(fail)
s;
}
}
if (
"
%(mode)
s" == "wrap_centered"
) {
if (
%(mode)
s == MODE_WRAP_CENTERED
) {
if (c
%%2
!=1 || d
%%2
!=1){
if (c
%%2
!=1 || d
%%2
!=1){
PyErr_Format(PyExc_TypeError,
PyErr_Format(PyExc_TypeError,
"Images2Neibs: in mode wrap_centered"
"Images2Neibs: in mode wrap_centered"
...
@@ -430,7 +434,7 @@ class Images2Neibs(Op):
...
@@ -430,7 +434,7 @@ class Images2Neibs(Op):
grid_c = CEIL_INTDIV(((PyArray_DIMS(
%(ten4)
s))[2]),step_x);
grid_c = CEIL_INTDIV(((PyArray_DIMS(
%(ten4)
s))[2]),step_x);
grid_d = CEIL_INTDIV(((PyArray_DIMS(
%(ten4)
s))[3]),step_y);
grid_d = CEIL_INTDIV(((PyArray_DIMS(
%(ten4)
s))[3]),step_y);
}
else if ( "
%(mode)
s" == "valid"
) {
}
else if (
%(mode)
s == MODE_VALID
) {
if ( ((PyArray_DIMS(
%(ten4)
s))[2] < c) ||
if ( ((PyArray_DIMS(
%(ten4)
s))[2] < c) ||
( (((PyArray_DIMS(
%(ten4)
s))[2]-c)
%%
step_x)!=0))
( (((PyArray_DIMS(
%(ten4)
s))[2]-c)
%%
step_x)!=0))
{
{
...
@@ -455,12 +459,12 @@ class Images2Neibs(Op):
...
@@ -455,12 +459,12 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(
%(ten4)
s))[2]-c)/step_x);
grid_c = 1+(((PyArray_DIMS(
%(ten4)
s))[2]-c)/step_x);
//number of patch in width
//number of patch in width
grid_d = 1+(((PyArray_DIMS(
%(ten4)
s))[3]-d)/step_y);
grid_d = 1+(((PyArray_DIMS(
%(ten4)
s))[3]-d)/step_y);
}
else if ( "
%(mode)
s" == "ignore_borders"
) {
}
else if (
%(mode)
s == MODE_IGNORE_BORDERS
) {
//number of patch in height
//number of patch in height
grid_c = 1+(((PyArray_DIMS(
%(ten4)
s))[2]-c)/step_x);
grid_c = 1+(((PyArray_DIMS(
%(ten4)
s))[2]-c)/step_x);
//number of patch in width
//number of patch in width
grid_d = 1+(((PyArray_DIMS(
%(ten4)
s))[3]-d)/step_y);
grid_d = 1+(((PyArray_DIMS(
%(ten4)
s))[3]-d)/step_y);
}
else if ( "
%(mode)
s" == "half"
) {
}
else if (
%(mode)
s == MODE_HALF
) {
if ( ((PyArray_DIMS(
%(ten4)
s))[2] < c) ||
if ( ((PyArray_DIMS(
%(ten4)
s))[2] < c) ||
( (((PyArray_DIMS(
%(ten4)
s))[2]-(c
%%2
))
%%
step_x)!=0))
( (((PyArray_DIMS(
%(ten4)
s))[2]-(c
%%2
))
%%
step_x)!=0))
{
{
...
@@ -485,7 +489,7 @@ class Images2Neibs(Op):
...
@@ -485,7 +489,7 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(
%(ten4)
s))[2]-(c
%%2
))/step_x);
grid_c = 1+(((PyArray_DIMS(
%(ten4)
s))[2]-(c
%%2
))/step_x);
//number of patch in width
//number of patch in width
grid_d = 1+(((PyArray_DIMS(
%(ten4)
s))[3]-(d
%%2
))/step_y);
grid_d = 1+(((PyArray_DIMS(
%(ten4)
s))[3]-(d
%%2
))/step_y);
}
else if ( "
%(mode)
s" == "full"
) {
}
else if (
%(mode)
s == MODE_FULL
) {
if ( ((PyArray_DIMS(
%(ten4)
s))[2] < c) ||
if ( ((PyArray_DIMS(
%(ten4)
s))[2] < c) ||
( (((PyArray_DIMS(
%(ten4)
s))[2]+c-2)
%%
step_x)!=0))
( (((PyArray_DIMS(
%(ten4)
s))[2]+c-2)
%%
step_x)!=0))
{
{
...
@@ -510,9 +514,9 @@ class Images2Neibs(Op):
...
@@ -510,9 +514,9 @@ class Images2Neibs(Op):
grid_c = 1+(((PyArray_DIMS(
%(ten4)
s))[2]+c-2)/step_x);
grid_c = 1+(((PyArray_DIMS(
%(ten4)
s))[2]+c-2)/step_x);
//number of patch in width
//number of patch in width
grid_d = 1+(((PyArray_DIMS(
%(ten4)
s))[3]+d-2)/step_y);
grid_d = 1+(((PyArray_DIMS(
%(ten4)
s))[3]+d-2)/step_y);
}else {
}
else {
PyErr_Format(PyExc_TypeError,
PyErr_Format(PyExc_TypeError,
"Images2Neibs: unknow mode
'
%(mode)
s'"
);
"Images2Neibs: unknow mode
%%
d",
%(mode)
s
);
%(fail)
s;
%(fail)
s;
}
}
...
@@ -572,13 +576,13 @@ class Images2Neibs(Op):
...
@@ -572,13 +576,13 @@ class Images2Neibs(Op):
for (int i = 0; i < c; i++) // loop over c
for (int i = 0; i < c; i++) // loop over c
{
{
int ten4_2 = i + a * step_x;
int ten4_2 = i + a * step_x;
if (
"
%(mode)
s" == "wrap_centered" )
{
if (
%(mode)
s == MODE_WRAP_CENTERED)
{
ten4_2 -= wrap_centered_half_idx_shift_x;
ten4_2 -= wrap_centered_half_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height;
if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height;
else if (ten4_2 >= height) ten4_2 -= height;
} else if (
"
%(mode)
s" == "half" )
{
} else if (
%(mode)
s == MODE_HALF)
{
ten4_2 -= wrap_centered_half_idx_shift_x;
ten4_2 -= wrap_centered_half_idx_shift_x;
} else if (
"
%(mode)
s" == "full" )
{
} else if (
%(mode)
s == MODE_FULL)
{
ten4_2 -= c - 1;
ten4_2 -= c - 1;
}
}
if (ten4_2 < 0 | ten4_2 >= height) {
if (ten4_2 < 0 | ten4_2 >= height) {
...
@@ -588,13 +592,13 @@ class Images2Neibs(Op):
...
@@ -588,13 +592,13 @@ class Images2Neibs(Op):
for (int j = 0; j < d; j++) // loop over d
for (int j = 0; j < d; j++) // loop over d
{
{
int ten4_3 = j + b * step_y;
int ten4_3 = j + b * step_y;
if (
"
%(mode)
s" == "wrap_centered" )
{
if (
%(mode)
s == MODE_WRAP_CENTERED)
{
ten4_3 -= wrap_centered_half_idx_shift_y;
ten4_3 -= wrap_centered_half_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width;
if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
else if (ten4_3 >= width) ten4_3 -= width;
} else if (
"
%(mode)
s" == "half" )
{
} else if (
%(mode)
s == MODE_HALF)
{
ten4_3 -= wrap_centered_half_idx_shift_y;
ten4_3 -= wrap_centered_half_idx_shift_y;
} else if (
"
%(mode)
s" == "full" )
{
} else if (
%(mode)
s == MODE_FULL)
{
ten4_3 -= d - 1;
ten4_3 -= d - 1;
}
}
int z_col = j + d * i;
int z_col = j + d * i;
...
@@ -609,7 +613,8 @@ class Images2Neibs(Op):
...
@@ -609,7 +613,8 @@ class Images2Neibs(Op):
}
}
}
}
} // END NESTED SCOPE
} // END NESTED SCOPE
"""
%
locals
()
"""
%
dict
(
ten4
=
inp
[
0
],
neib_shape
=
inp
[
1
],
neib_step
=
inp
[
2
],
z
=
out
[
0
],
fail
=
sub
[
'fail'
],
mode
=
sub
[
'params'
])
def
images2neibs
(
ten4
,
neib_shape
,
neib_step
=
None
,
mode
=
'valid'
):
def
images2neibs
(
ten4
,
neib_shape
,
neib_step
=
None
,
mode
=
'valid'
):
...
...
theano/tensor/subtensor.py
浏览文件 @
5df0cfd8
...
@@ -12,7 +12,7 @@ import theano
...
@@ -12,7 +12,7 @@ import theano
from
theano.compat
import
izip
from
theano.compat
import
izip
from
theano.gradient
import
DisconnectedType
from
theano.gradient
import
DisconnectedType
from
theano
import
gof
from
theano
import
gof
from
theano.gof
import
Apply
,
hashtype
,
Op
,
Type
,
MethodNotDefined
from
theano.gof
import
Apply
,
hashtype
,
Op
,
Type
,
MethodNotDefined
,
ParamsType
from
theano.printing
import
pprint
from
theano.printing
import
pprint
from
theano
import
scalar
as
scal
from
theano
import
scalar
as
scal
from
theano.tensor.basic
import
alloc
from
theano.tensor.basic
import
alloc
...
@@ -1685,6 +1685,7 @@ class AdvancedSubtensor1(Op):
...
@@ -1685,6 +1685,7 @@ class AdvancedSubtensor1(Op):
# of the grad() method.
# of the grad() method.
__props__
=
()
__props__
=
()
_f16_ok
=
True
_f16_ok
=
True
check_input
=
False
def
__init__
(
self
,
sparse_grad
=
False
):
def
__init__
(
self
,
sparse_grad
=
False
):
self
.
sparse_grad
=
sparse_grad
self
.
sparse_grad
=
sparse_grad
...
@@ -1872,10 +1873,13 @@ class AdvancedIncSubtensor1(Op):
...
@@ -1872,10 +1873,13 @@ class AdvancedIncSubtensor1(Op):
"""
"""
__props__
=
(
'inplace'
,
'set_instead_of_inc'
)
__props__
=
(
'inplace'
,
'set_instead_of_inc'
)
check_input
=
False
params_type
=
ParamsType
(
inplace
=
scal
.
bool
,
set_instead_of_inc
=
scal
.
bool
)
def
__init__
(
self
,
inplace
=
False
,
set_instead_of_inc
=
False
):
def
__init__
(
self
,
inplace
=
False
,
set_instead_of_inc
=
False
):
self
.
inplace
=
inplace
self
.
inplace
=
bool
(
inplace
)
self
.
set_instead_of_inc
=
set_instead_of_inc
self
.
set_instead_of_inc
=
bool
(
set_instead_of_inc
)
if
inplace
:
if
inplace
:
self
.
destroy_map
=
{
0
:
[
0
]}
self
.
destroy_map
=
{
0
:
[
0
]}
...
@@ -1955,17 +1959,11 @@ class AdvancedIncSubtensor1(Op):
...
@@ -1955,17 +1959,11 @@ class AdvancedIncSubtensor1(Op):
raise
NotImplementedError
raise
NotImplementedError
x
,
y
,
idx
=
input_names
x
,
y
,
idx
=
input_names
out
=
output_names
[
0
]
out
=
output_names
[
0
]
fail
=
sub
[
'fail'
]
inc_or_set
=
1
-
self
.
set_instead_of_inc
if
self
.
inplace
:
# convert bool to int
inplace
=
1
else
:
inplace
=
0
copy_of_x
=
self
.
copy_of_x
(
x
)
copy_of_x
=
self
.
copy_of_x
(
x
)
return
"""
return
"""
PyObject* rval = NULL;
PyObject* rval = NULL;
if (
%(
inplace)
s
)
if (
%(
params)
s->inplace
)
{
{
if (
%(x)
s !=
%(out)
s)
if (
%(x)
s !=
%(out)
s)
{
{
...
@@ -1983,16 +1981,17 @@ class AdvancedIncSubtensor1(Op):
...
@@ -1983,16 +1981,17 @@ class AdvancedIncSubtensor1(Op):
%(fail)
s
%(fail)
s
}
}
}
}
if (inplace_increment(
%(out)
s, (PyObject *)
%(idx)
s,
%(y)
s,
%(inc_or_set)
d
)) {
if (inplace_increment(
%(out)
s, (PyObject *)
%(idx)
s,
%(y)
s,
(1 -
%(params)
s->set_instead_of_inc)
)) {
%(fail)
s;
%(fail)
s;
}
}
Py_XDECREF(rval);
Py_XDECREF(rval);
"""
%
locals
()
"""
%
dict
(
x
=
x
,
y
=
y
,
idx
=
idx
,
out
=
out
,
copy_of_x
=
copy_of_x
,
params
=
sub
[
'params'
],
fail
=
sub
[
'fail'
])
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
6
,)
return
(
8
,)
def
perform
(
self
,
node
,
inp
,
out_
):
def
perform
(
self
,
node
,
inp
,
out_
,
params
):
# TODO opt to make this inplace
# TODO opt to make this inplace
x
,
y
,
idx
=
inp
x
,
y
,
idx
=
inp
out
,
=
out_
out
,
=
out_
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论