Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
276b9e51
提交
276b9e51
authored
12月 22, 2016
作者:
Frédéric Bastien
提交者:
GitHub
12月 22, 2016
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5320 from tfjgeorge/cusolver-ccw
Gpu op for solve using cusolver (CCW #5191)
上级
986c6dc6
d825302f
隐藏空白字符变更
内嵌
并排
正在显示
5 个修改的文件
包含
266 行增加
和
34 行删除
+266
-34
linalg.py
theano/gpuarray/linalg.py
+172
-0
opt.py
theano/gpuarray/opt.py
+12
-0
test_linalg.py
theano/gpuarray/tests/test_linalg.py
+65
-0
test_opt.py
theano/gpuarray/tests/test_opt.py
+17
-0
opt.py
theano/sandbox/cuda/opt.py
+0
-34
没有找到文件。
theano/gpuarray/linalg.py
0 → 100644
浏览文件 @
276b9e51
from
__future__
import
absolute_import
,
division
,
print_function
import
pkg_resources
import
theano
from
theano
import
Op
from
theano.gpuarray
import
basic_ops
,
GpuArrayType
try
:
from
pygpu
import
gpuarray
except
ImportError
:
pass
cusolver_available
=
False
try
:
from
skcuda
import
cusolver
cusolver_available
=
True
except
(
ImportError
,
OSError
,
RuntimeError
,
pkg_resources
.
DistributionNotFound
):
pass
cusolver_handle
=
None
class GpuCusolverSolve(Op):
    """
    CUSOLVER GPU solver OP.

    Solves the linear system ``A x = b`` (or ``A^T x = b``) on the GPU
    using cuSOLVER's LU factorization (``Sgetrf`` + ``Sgetrs``),
    float32 only.

    Parameters
    ----------
    trans
        Whether to take the transpose of the input matrix or not.
        'N' solves ``A x = b``; 'T' or 'C' solve ``A^T x = b``.
    inplace
        If True, the factorization overwrites the input matrix A
        (declared via ``destroy_map``).
    """

    __props__ = ('trans',)

    def __init__(self, trans='N', inplace=False):
        self.trans = trans
        self.inplace = inplace
        if self.inplace:
            # The op destroys both inputs when run in place.
            self.destroy_map = {0: [0, 1]}
        super(GpuCusolverSolve, self).__init__()

    def make_node(self, inp1, inp2):
        # NOTE(review): storing the context on `self` mutates op state at
        # graph-build time even though it is not part of __props__ —
        # kept as-is for interface compatibility, but worth revisiting.
        self.context = basic_ops.infer_context_name(inp1, inp2)
        inp1 = basic_ops.as_gpuarray_variable(inp1, self.context)
        inp2 = basic_ops.as_gpuarray_variable(inp2, self.context)
        inp1 = basic_ops.gpu_contiguous(inp1)
        inp2 = basic_ops.gpu_contiguous(inp2)
        # this op can only operate on float32 matrices
        assert inp1.ndim == 2
        assert inp2.ndim == 2
        assert inp1.dtype == 'float32'
        assert inp2.dtype == 'float32'
        return theano.Apply(
            self, [inp1, inp2],
            [GpuArrayType('float32',
                          broadcastable=inp1.broadcastable,
                          context_name=self.context)()])

    def make_thunk(self, node, storage_map, _, no_recycling=[], impl=None):
        """Build the runtime thunk; fails early if cuSOLVER is missing."""
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCusolverSolve Op can not be constructed.')

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        # A single cuSOLVER handle is shared process-wide.
        global cusolver_handle
        if cusolver_handle is None:
            cusolver_handle = cusolver.cusolverDnCreate()

        def thunk():
            context = inputs[0][0].context

            # Output storage cell.
            z = outputs[0]

            # Matrix.
            A = inputs[0][0]
            # Solution vectors.
            b = inputs[1][0]

            assert(len(A.shape) == 2)
            assert(len(b.shape) == 2)

            if self.trans in ['T', 'C']:
                trans = 1
                l, n = A.shape
                k, m = b.shape
            elif self.trans == 'N':
                trans = 0
                n, l = A.shape
                k, m = b.shape
            else:
                raise ValueError('Invalid value for trans')
            if l != n:
                raise ValueError('A must be a square matrix')
            if n != k:
                raise ValueError('A and b must be aligned.')

            lda = max(1, n)
            ldb = max(1, k, m)

            # We copy A and b as cusolver operates inplace
            b = gpuarray.array(b, copy=True, order='F')
            if not self.inplace:
                A = gpuarray.array(A, copy=True)
            A_ptr = A.gpudata
            b_ptr = b.gpudata

            # cusolver expects a F ordered matrix, but A is not explicitly
            # converted between C and F order, instead we switch the
            # "transpose" flag.
            if A.flags['C_CONTIGUOUS']:
                trans = 1 - trans

            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                cusolver_handle, n, n, A_ptr, lda)

            # Scratch buffers are cached on the thunk and only
            # reallocated when the problem size changes.
            if (thunk.workspace is None or
                    thunk.workspace.size != workspace_size):
                thunk.workspace = gpuarray.zeros((workspace_size,),
                                                 dtype='float32',
                                                 context=context)

            # cusolverDnSgetrf/getrs take integer pivot and info arrays
            # (devIpiv / devInfo are int*), so allocate int32 buffers.
            if thunk.pivots is None or thunk.pivots.size != n:
                thunk.pivots = gpuarray.zeros((n,), dtype='int32',
                                              context=context)

            if thunk.dev_info is None:
                thunk.dev_info = gpuarray.zeros((1,), dtype='int32',
                                                context=context)

            workspace_ptr = thunk.workspace.gpudata
            pivots_ptr = thunk.pivots.gpudata
            dev_info_ptr = thunk.dev_info.gpudata

            # LU factorization of A (in place on the device copy).
            cusolver.cusolverDnSgetrf(
                cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                pivots_ptr, dev_info_ptr)

            # Back-substitution: solves for all m right-hand sides in b.
            cusolver.cusolverDnSgetrs(
                cusolver_handle, trans, n, m, A_ptr, lda,
                pivots_ptr, b_ptr, ldb, dev_info_ptr)

            z[0] = b

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        # Lazily-allocated device scratch space, reused across calls.
        thunk.workspace = None
        thunk.pivots = None
        thunk.dev_info = None

        return thunk
def gpu_solve(A, b, trans='N'):
    """Solve ``A x = b`` on the GPU via cuSOLVER.

    Convenience wrapper that instantiates a :class:`GpuCusolverSolve`
    op with the given ``trans`` flag and applies it to ``A`` and ``b``.
    """
    op = GpuCusolverSolve(trans)
    return op(A, b)
theano/gpuarray/opt.py
浏览文件 @
276b9e51
...
@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
...
@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv3d_gradWeights
,
AbstractConv3d_gradWeights
,
AbstractConv3d_gradInputs
)
AbstractConv3d_gradInputs
)
import
theano.tensor.signal.pool
as
pool
import
theano.tensor.signal.pool
as
pool
import
theano.tensor.slinalg
as
slinalg
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests.breakpoint
import
PdbBreakpoint
...
@@ -69,6 +70,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
...
@@ -69,6 +70,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1_dev20
)
GpuAdvancedIncSubtensor1_dev20
)
from
.opt_util
import
alpha_merge
,
output_merge
,
pad_dims
,
unpad_dims
from
.opt_util
import
alpha_merge
,
output_merge
,
pad_dims
,
unpad_dims
from
.reduction
import
GpuMaxAndArgmax
from
.reduction
import
GpuMaxAndArgmax
from
.linalg
import
(
GpuCusolverSolve
,
cusolver_available
)
_logger
=
logging
.
getLogger
(
"theano.gpuarray.opt"
)
_logger
=
logging
.
getLogger
(
"theano.gpuarray.opt"
)
...
@@ -1911,6 +1913,16 @@ def _scan_type_infer(node):
...
@@ -1911,6 +1913,16 @@ def _scan_type_infer(node):
def
local_gpu_maxandargmax
(
op
,
context_name
,
inputs
,
outputs
):
def
local_gpu_maxandargmax
(
op
,
context_name
,
inputs
,
outputs
):
return
GpuMaxAndArgmax
(
op
.
get_params
(
None
))
return
GpuMaxAndArgmax
(
op
.
get_params
(
None
))
# solve
@register_opt('fast_compile')
@op_lifter([slinalg.Solve])
@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
def local_gpu_solve(op, context_name, inputs, outputs):
    """Lift a CPU ``slinalg.Solve`` node to ``GpuCusolverSolve``.

    Returns nothing (no substitution) when the optional cuSOLVER
    bindings could not be imported.
    """
    if cusolver_available:
        return GpuCusolverSolve()
    return
# Do not register in fast_run or fast_compile.
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
# It will be added to fast_run if the GPU is enabled.
optdb
.
register
(
'gpua_scanOp_make_inplace'
,
optdb
.
register
(
'gpua_scanOp_make_inplace'
,
...
...
theano/gpuarray/tests/test_linalg.py
0 → 100644
浏览文件 @
276b9e51
from
__future__
import
absolute_import
,
division
,
print_function
import
unittest
import
numpy
import
theano
from
theano.tests
import
unittest_tools
as
utt
from
.config
import
mode_with_gpu
# Skip tests if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
theano.gpuarray.linalg
import
(
cusolver_available
,
gpu_solve
)
if
not
cusolver_available
:
raise
SkipTest
(
'Optional package scikits.cuda.cusolver not available'
)
class TestCusolver(unittest.TestCase):
    """GPU solve tests: compare cuSOLVER results against known solutions."""

    def run_gpu_solve(self, A_val, x_val):
        # Build right-hand sides from a known solution so the solver's
        # output can be checked against x_val directly.
        b_val = numpy.dot(A_val, x_val)
        b_val_trans = numpy.dot(A_val.T, x_val)

        A = theano.tensor.matrix("A", dtype="float32")
        b = theano.tensor.matrix("b", dtype="float32")
        b_trans = theano.tensor.matrix("b", dtype="float32")

        # One graph solving A x = b, one solving A^T x = b_trans.
        solver = gpu_solve(A, b)
        solver_trans = gpu_solve(A, b_trans, trans='T')

        fn = theano.function([A, b, b_trans], [solver, solver_trans],
                             mode=mode_with_gpu)
        res = fn(A_val, b_val, b_val_trans)

        x_res = numpy.array(res[0])
        x_res_trans = numpy.array(res[1])
        utt.assert_allclose(x_res, x_val)
        utt.assert_allclose(x_res_trans, x_val)

    def test_diag_solve(self):
        numpy.random.seed(1)
        # Diagonal (trivially invertible) system.
        A_val = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
                              dtype="float32")
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_sym_solve(self):
        numpy.random.seed(1)
        # Symmetrize a random matrix.
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_sym = (A_val + A_val.T) / 2.0
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_sym, x_val)

    def test_orth_solve(self):
        numpy.random.seed(1)
        # Orthogonal matrix taken from an SVD factor.
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_orth = numpy.linalg.svd(A_val)[0]
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_orth, x_val)

    def test_uni_rand_solve(self):
        numpy.random.seed(1)
        # Random square system with several right-hand sides.
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)
theano/gpuarray/tests/test_opt.py
浏览文件 @
276b9e51
...
@@ -4,6 +4,7 @@ from nose.tools import assert_raises
...
@@ -4,6 +4,7 @@ from nose.tools import assert_raises
import
theano
import
theano
from
theano
import
tensor
from
theano
import
tensor
import
theano.tensor.slinalg
as
slinalg
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests
import
unittest_tools
as
utt
,
test_ifelse
from
theano.tests
import
unittest_tools
as
utt
,
test_ifelse
from
theano.tensor.tests
import
test_basic
from
theano.tensor.tests
import
test_basic
...
@@ -16,6 +17,7 @@ from ..basic_ops import (
...
@@ -16,6 +17,7 @@ from ..basic_ops import (
from
..blas
import
GpuGemm
from
..blas
import
GpuGemm
from
..elemwise
import
GpuCAReduceCuda
,
GpuCAReduceCPY
,
GpuElemwise
from
..elemwise
import
GpuCAReduceCuda
,
GpuCAReduceCPY
,
GpuElemwise
from
..subtensor
import
GpuSubtensor
from
..subtensor
import
GpuSubtensor
from
..linalg
import
GpuCusolverSolve
from
.config
import
mode_with_gpu
,
test_ctx_name
from
.config
import
mode_with_gpu
,
test_ctx_name
...
@@ -496,3 +498,18 @@ def test_no_complex():
...
@@ -496,3 +498,18 @@ def test_no_complex():
stft_out
=
tensor
.
exp
(
width_var
*
freq_var
)
*
signal_var
stft_out
=
tensor
.
exp
(
width_var
*
freq_var
)
*
signal_var
theano
.
function
([
width_var
,
freq_var
,
signal_var
],
stft_out
,
theano
.
function
([
width_var
,
freq_var
,
signal_var
],
stft_out
,
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
def test_local_lift_solve():
    """Check that the GPU mode replaces slinalg.Solve with GpuCusolverSolve
    and that both compiled functions agree numerically."""
    A = tensor.fmatrix()
    b = tensor.fmatrix()
    o = slinalg.solve(A, b)
    f_cpu = theano.function([A, b], o)
    f_gpu = theano.function([A, b], o, mode=mode_with_gpu)

    # The lifted graph must contain no CPU Solve node and at least
    # one GpuCusolverSolve node.
    gpu_nodes = f_gpu.maker.fgraph.apply_nodes
    assert not any(isinstance(node.op, slinalg.Solve)
                   for node in gpu_nodes)
    assert any(isinstance(node.op, GpuCusolverSolve)
               for node in gpu_nodes)

    A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = numpy.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
theano/sandbox/cuda/opt.py
浏览文件 @
276b9e51
...
@@ -49,7 +49,6 @@ from theano.sandbox.cuda.blas import (
...
@@ -49,7 +49,6 @@ from theano.sandbox.cuda.blas import (
GpuCorr3dMM
,
GpuCorr3dMM_gradInputs
,
GpuCorr3dMM_gradWeights
)
GpuCorr3dMM
,
GpuCorr3dMM_gradInputs
,
GpuCorr3dMM_gradWeights
)
from
theano.sandbox.cuda.blas
import
gpu_gemv_inplace
from
theano.sandbox.cuda.blas
import
gpu_gemv_inplace
from
theano.sandbox.cuda.cula
import
gpu_solve
,
cula_available
from
theano.sandbox.cuda.blas
import
gpu_gemv_no_inplace
from
theano.sandbox.cuda.blas
import
gpu_gemv_no_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_inplace
...
@@ -83,7 +82,6 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
...
@@ -83,7 +82,6 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from
theano.tensor.blas
import
_is_real_vector
,
_is_real_matrix
from
theano.tensor.blas
import
_is_real_vector
,
_is_real_matrix
from
theano.tensor
import
nlinalg
from
theano.tensor
import
nlinalg
from
theano.tensor
import
slinalg
from
theano.tensor.nnet.Conv3D
import
Conv3D
from
theano.tensor.nnet.Conv3D
import
Conv3D
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests.breakpoint
import
PdbBreakpoint
...
@@ -700,38 +698,6 @@ def local_gpu_dot22scalar(node):
...
@@ -700,38 +698,6 @@ def local_gpu_dot22scalar(node):
return
False
return
False
@register_opt()
@local_optimizer([gpu_from_host, slinalg.Solve])
def local_gpu_solve(node):
    """
    gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host)
    CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
    """
    # Only applicable when the optional CULA bindings are present and
    # the result is float32 (the only dtype the GPU solver supports).
    if not cula_available:
        return
    if node.outputs[0].dtype != 'float32':
        return

    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        owner = host_input.owner
        if owner and isinstance(owner.op, slinalg.Solve):
            x, y = owner.inputs
            return [gpu_solve(as_cuda_ndarray_variable(x),
                              as_cuda_ndarray_variable(y))]

    if isinstance(node.op, slinalg.Solve):
        on_gpu = any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs])
        if on_gpu:
            x, y = node.inputs
            return [host_from_gpu(gpu_solve(as_cuda_ndarray_variable(x),
                                            as_cuda_ndarray_variable(y)))]

    return False
@register_opt
()
@register_opt
()
@local_optimizer
([
gpu_from_host
,
tensor
.
blas_c
.
CGemv
,
tensor
.
blas
.
Gemv
])
@local_optimizer
([
gpu_from_host
,
tensor
.
blas_c
.
CGemv
,
tensor
.
blas
.
Gemv
])
def
local_gpu_gemv
(
node
):
def
local_gpu_gemv
(
node
):
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论