Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
f87853c5
提交
f87853c5
authored
1月 28, 2016
作者:
carriepl
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #3508 from cooijmanstim/batched_gemm
WIP: BatchedDotOp
上级
4800a51c
264d8bf2
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
6 个修改的文件
包含
147 行增加
和
40 行删除
+147
-40
blas.py
theano/sandbox/cuda/blas.py
+4
-0
opt.py
theano/sandbox/cuda/opt.py
+41
-2
test_blas.py
theano/sandbox/cuda/tests/test_blas.py
+13
-2
basic.py
theano/tensor/basic.py
+33
-33
blas.py
theano/tensor/blas.py
+0
-0
test_basic.py
theano/tensor/tests/test_basic.py
+56
-3
没有找到文件。
theano/sandbox/cuda/blas.py
浏览文件 @
f87853c5
...
@@ -212,6 +212,10 @@ class BatchedDotOp(GpuOp):
...
@@ -212,6 +212,10 @@ class BatchedDotOp(GpuOp):
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
1
,)
return
(
1
,)
def
infer_shape
(
self
,
node
,
shapes
):
xshp
,
yshp
=
shapes
return
[
xshp
[:
-
1
]
+
yshp
[
2
:]]
batched_dot
=
BatchedDotOp
()
batched_dot
=
BatchedDotOp
()
"""
"""
Call cublasSgemmBatched. Takes two 3D tensors as input.
Call cublasSgemmBatched. Takes two 3D tensors as input.
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
f87853c5
...
@@ -33,7 +33,7 @@ from theano.sandbox.cuda.basic_ops import (
...
@@ -33,7 +33,7 @@ from theano.sandbox.cuda.basic_ops import (
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.blas
import
(
from
theano.sandbox.cuda.blas
import
(
gpu_dot22
,
gpu_dot22scalar
,
gpu_gemm_inplace
,
gpu_gemm_no_inplace
,
GpuConv
,
gpu_dot22
,
gpu_dot22scalar
,
gpu_gemm_inplace
,
gpu_gemm_no_inplace
,
GpuConv
,
GpuCorrMM
,
GpuCorrMM_gradInputs
,
GpuCorrMM_gradWeights
,
BatchedDotOp
,
GpuCorrMM
,
GpuCorrMM_gradInputs
,
GpuCorrMM_gradWeights
,
GpuCorr3dMM
,
GpuCorr3dMM_gradInputs
,
GpuCorr3dMM_gradWeights
)
GpuCorr3dMM
,
GpuCorr3dMM_gradInputs
,
GpuCorr3dMM_gradWeights
)
from
theano.sandbox.cuda.blas
import
gpu_gemv_inplace
from
theano.sandbox.cuda.blas
import
gpu_gemv_inplace
...
@@ -156,7 +156,7 @@ cpu_ops_moved_to_gpu = [
...
@@ -156,7 +156,7 @@ cpu_ops_moved_to_gpu = [
tensor
.
Reshape
,
tensor
.
flatten
,
tensor
.
Subtensor
,
tensor
.
Reshape
,
tensor
.
flatten
,
tensor
.
Subtensor
,
tensor
.
AdvancedSubtensor1
,
tensor
.
AdvancedIncSubtensor1
,
tensor
.
AdvancedSubtensor1
,
tensor
.
AdvancedIncSubtensor1
,
tensor
.
IncSubtensor
,
tensor
.
Shape
,
tensor
.
Join
,
tensor
.
IncSubtensor
,
tensor
.
Shape
,
tensor
.
Join
,
tensor
.
Alloc
,
tensor
.
Eye
]
tensor
.
Alloc
,
tensor
.
Eye
,
tensor
.
BatchedDot
]
class
InputToGpuOptimizer
(
Optimizer
):
class
InputToGpuOptimizer
(
Optimizer
):
...
@@ -613,6 +613,45 @@ def local_gpu_dot22(node):
...
@@ -613,6 +613,45 @@ def local_gpu_dot22(node):
return
False
return
False
@register_opt
()
@local_optimizer
([
gpu_from_host
,
tensor
.
BatchedDot
])
def
local_gpu_batched_dot
(
node
):
"""
gpu_from_host(batched_dot) -> gpu_batched_dot(gpu_from_host)
batched_dot(host_from_gpu) -> host_from_gpu(gpu_batched_dot)
"""
def
gpu_batched_dot
(
x
,
y
):
# pad x and y shapes to be third-order tensors
x_
,
y_
=
x
,
y
if
x
.
ndim
==
2
:
x_
=
x_
.
dimshuffle
(
0
,
"x"
,
1
)
if
y
.
ndim
==
2
:
y_
=
y_
.
dimshuffle
(
0
,
1
,
"x"
)
z
=
BatchedDotOp
()(
as_cuda_ndarray_variable
(
x_
),
as_cuda_ndarray_variable
(
y_
))
# unpad z shape
if
x
.
ndim
==
2
:
z
=
z
.
dimshuffle
(
0
,
*
range
(
2
,
z
.
ndim
))
if
y
.
ndim
==
2
:
z
=
z
.
dimshuffle
(
*
range
(
z
.
ndim
-
1
))
return
as_cuda_ndarray_variable
(
z
)
if
isinstance
(
node
.
op
,
GpuFromHost
):
host_input
=
node
.
inputs
[
0
]
if
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
tensor
.
BatchedDot
):
x
,
y
=
host_input
.
owner
.
inputs
return
[
gpu_batched_dot
(
x
,
y
)]
if
isinstance
(
node
.
op
,
tensor
.
BatchedDot
):
if
any
([(
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
))
for
i
in
node
.
inputs
]):
x
,
y
=
node
.
inputs
return
[
host_from_gpu
(
gpu_batched_dot
(
x
,
y
))]
return
False
@register_opt
()
@register_opt
()
@local_optimizer
([
gpu_from_host
,
tensor
.
blas
.
Dot22Scalar
])
@local_optimizer
([
gpu_from_host
,
tensor
.
blas
.
Dot22Scalar
])
def
local_gpu_dot22scalar
(
node
):
def
local_gpu_dot22scalar
(
node
):
...
...
theano/sandbox/cuda/tests/test_blas.py
浏览文件 @
f87853c5
...
@@ -23,7 +23,7 @@ import theano.compile.mode
...
@@ -23,7 +23,7 @@ import theano.compile.mode
from
theano.tensor.tests.test_blas
import
BaseGemv
,
TestBlasStrides
,
TestGer
from
theano.tensor.tests.test_blas
import
BaseGemv
,
TestBlasStrides
,
TestGer
from
theano.sandbox.cuda.blas
import
gpu_gemv_no_inplace
,
gpu_gemv_inplace
from
theano.sandbox.cuda.blas
import
gpu_gemv_no_inplace
,
gpu_gemv_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_inplace
,
gpu_ger_no_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_inplace
,
gpu_ger_no_inplace
from
theano.sandbox.cuda.blas
import
batched_dot
from
theano.sandbox.cuda.blas
import
batched_dot
,
BatchedDotOp
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
@@ -44,7 +44,7 @@ def my_rand(*shape):
...
@@ -44,7 +44,7 @@ def my_rand(*shape):
return
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
return
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
class
TestBatchedDot
(
TestCase
):
class
TestBatchedDot
(
unittest_tools
.
InferShapeTester
):
def
test_batched_dot_correctness
(
self
):
def
test_batched_dot_correctness
(
self
):
...
@@ -114,6 +114,17 @@ class TestBatchedDot(TestCase):
...
@@ -114,6 +114,17 @@ class TestBatchedDot(TestCase):
numpy
.
random
.
randn
(
5
,
2
,
6
)
.
astype
(
numpy
.
float32
)],
numpy
.
random
.
randn
(
5
,
2
,
6
)
.
astype
(
numpy
.
float32
)],
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
def
test_infer_shape
(
self
):
# only matrix/matrix is supported
admat
=
tensor
.
ftensor3
()
bdmat
=
tensor
.
ftensor3
()
admat_val
=
my_rand
(
7
,
4
,
5
)
bdmat_val
=
my_rand
(
7
,
5
,
3
)
self
.
_compile_and_check
([
admat
,
bdmat
],
[
BatchedDotOp
()(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
BatchedDotOp
)
def
test_dot22
():
def
test_dot22
():
def
cmp
(
a_shp
,
b_shp
):
def
cmp
(
a_shp
,
b_shp
):
...
...
theano/tensor/basic.py
浏览文件 @
f87853c5
...
@@ -3373,52 +3373,52 @@ def transpose(x, axes=None):
...
@@ -3373,52 +3373,52 @@ def transpose(x, axes=None):
return
ret
return
ret
def
batched_dot
(
x
,
y
):
def
batched_dot
(
a
,
b
):
"""
"""
This function computes the dot product between the two tensors, by
Compute the batched dot product of two variables:
iterating over the first dimension using scan.
Parameters
batched_dot(a, b)[i] = dot(a[i], b[i])
----------
x : tensor
A Tensor with sizes e.g.: for 3D (dim1, dim3, dim2).
y : tensor
A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4).
Returns
Note that this batched_dot function does one of three things, in the
-------
following sequence:
tensor
A tensor of size e.g. if it is 3D: (dim1, dim3, dim4).
Notes
1. If either a or b is a vector, it returns the batched elementwise
-----
product without calling the Theano BatchedDot op.
This is a subset of numpy.einsum, but we do not provide it for now.
But numpy einsum is slower than dot or tensordot:
http://mail.scipy.org/pipermail/numpy-discussion/2012-October/064259.html
Examples
2. If both a and b have either 2 or 3 dimensions, it calls Theano's
--------
BatchedDot op on a and b.
>>> first = tensor.tensor3('first')
>>> second = tensor.tensor3('second')
>>> result = batched_dot(first, second)
3. If either a or b has more than 3 dimensions, it calls Theano's
batched_tensordot function with appropriate axes. The
batched_tensordot function expresses high-dimensional batched
dot products in terms of batched matrix-matrix dot products, so
it may be possible to further optimize for performance.
"""
"""
result
,
updates
=
theano
.
scan
(
a
,
b
=
as_tensor_variable
(
a
),
as_tensor_variable
(
b
)
fn
=
lambda
x_mat
,
y_mat
:
theano
.
tensor
.
dot
(
x_mat
,
y_mat
),
if
a
.
ndim
==
0
:
outputs_info
=
None
,
raise
TypeError
(
"a must have at least one (batch) axis"
)
sequences
=
[
x
,
y
],
elif
b
.
ndim
==
0
:
non_sequences
=
None
)
raise
TypeError
(
"b must have at least one (batch) axis"
)
return
result
elif
a
.
ndim
==
1
:
return
a
.
dimshuffle
(
*
([
0
]
+
[
"x"
]
*
(
b
.
ndim
-
1
)))
*
b
elif
b
.
ndim
==
1
:
return
a
*
b
.
dimshuffle
(
*
([
0
]
+
[
"x"
]
*
(
a
.
ndim
-
1
)))
elif
a
.
ndim
>
3
or
b
.
ndim
>
3
:
return
batched_tensordot
(
a
,
b
,
[[
a
.
ndim
-
1
],
[
numpy
.
maximum
(
1
,
b
.
ndim
-
2
)]])
else
:
# avoid circular import
return
theano
.
tensor
.
blas
.
BatchedDot
()(
a
,
b
)
def
batched_tensordot
(
x
,
y
,
axes
=
2
):
def
batched_tensordot
(
x
,
y
,
axes
=
2
):
"""
"""
Compute
the
tensordot product.
Compute
a batched
tensordot product.
A hybrid of batch_dot and tensordot, this function computes the
A hybrid of batch
ed
_dot and tensordot, this function computes the
tensordot product between the two tensors, by iterating over the
tensordot product between the two tensors, by iterating over the
first dimension
using scan
to perform a sequence of tensordots.
first dimension to perform a sequence of tensordots.
Parameters
Parameters
----------
----------
...
...
theano/tensor/blas.py
浏览文件 @
f87853c5
差异被折叠。
点击展开。
theano/tensor/tests/test_basic.py
浏览文件 @
f87853c5
...
@@ -31,9 +31,9 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
...
@@ -31,9 +31,9 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
horizontal_stack
,
vertical_stack
,
argmax
,
get_vector_length
,
horizontal_stack
,
vertical_stack
,
argmax
,
get_vector_length
,
fscalar
,
zeros_like
,
sum
,
tensor3
,
vector
,
add
,
addbroadcast
,
fscalar
,
zeros_like
,
sum
,
tensor3
,
vector
,
add
,
addbroadcast
,
alloc
,
as_tensor_variable
,
tensor_from_scalar
,
ARange
,
autocast_float
,
alloc
,
as_tensor_variable
,
tensor_from_scalar
,
ARange
,
autocast_float
,
clip
,
constant
,
default
,
dot
,
clip
,
constant
,
default
,
dot
,
batched_dot
,
dmatrix
,
dscalar
,
dvector
,
eq
,
eye
,
fill
,
flatten
,
inverse_permutation
,
Flatten
,
dmatrix
,
dscalar
,
dvector
,
eq
,
eye
,
fill
,
flatten
,
inverse_permutation
,
tensor4
,
permute_row_elements
,
fmatrix
,
fscalars
,
grad
,
tensor4
,
permute_row_elements
,
Flatten
,
fmatrix
,
fscalars
,
grad
,
inplace
,
iscalar
,
matrix
,
minimum
,
matrices
,
maximum
,
mul
,
neq
,
inplace
,
iscalar
,
matrix
,
minimum
,
matrices
,
maximum
,
mul
,
neq
,
Reshape
,
row
,
scalar
,
scalars
,
second
,
smallest
,
stack
,
sub
,
Tensor
,
Reshape
,
row
,
scalar
,
scalars
,
second
,
smallest
,
stack
,
sub
,
Tensor
,
tensor_copy
,
tensordot
,
TensorType
,
Tri
,
tri
,
tril
,
triu
,
unbroadcast
,
tensor_copy
,
tensordot
,
TensorType
,
Tri
,
tri
,
tril
,
triu
,
unbroadcast
,
...
@@ -1938,6 +1938,59 @@ DotTester = makeTester(name='DotTester',
...
@@ -1938,6 +1938,59 @@ DotTester = makeTester(name='DotTester',
bad_runtime
=
dict
(
bad1
=
(
rand
(
5
,
7
),
rand
(
5
,
7
)),
bad_runtime
=
dict
(
bad1
=
(
rand
(
5
,
7
),
rand
(
5
,
7
)),
bad2
=
(
rand
(
5
,
7
),
rand
(
8
,
3
))))
bad2
=
(
rand
(
5
,
7
),
rand
(
8
,
3
))))
BatchedDotTester
=
makeTester
(
name
=
'BatchedDotTester'
,
op
=
batched_dot
,
expected
=
(
lambda
xs
,
ys
:
numpy
.
asarray
(
list
(
x
*
y
if
x
.
ndim
==
0
or
y
.
ndim
==
0
else
numpy
.
dot
(
x
,
y
)
for
x
,
y
in
zip
(
xs
,
ys
)),
dtype
=
theano
.
scalar
.
upcast
(
xs
.
dtype
,
ys
.
dtype
))),
checks
=
{},
grad
=
dict
(
correct1
=
(
rand
(
3
,
5
,
7
),
rand
(
3
,
7
,
5
)),
correct2
=
(
rand
(
3
,
5
,
7
),
rand
(
3
,
7
,
9
)),
correct3
=
(
rand
(
3
,
5
,
7
),
rand
(
3
,
7
)),
correct4
=
(
rand
(
3
,
5
),
rand
(
3
,
5
,
7
)),
correct5
=
(
rand
(
3
),
rand
(
3
,
5
,
7
)),
correct6
=
(
rand
(
3
,
5
),
rand
(
3
)),
correct7
=
(
rand
(
3
,
5
),
rand
(
3
,
5
)),
correct8
=
(
rand
(
3
),
rand
(
3
)),
correct9
=
(
rand
(
3
,
5
,
7
,
11
),
rand
(
3
)),
correct10
=
(
rand
(
3
,
7
,
11
,
5
),
rand
(
3
,
5
)),
correct11
=
(
rand
(
3
,
7
,
11
,
5
),
rand
(
3
,
5
,
13
)),
correct12
=
(
rand
(
3
,
7
,
11
,
5
),
rand
(
3
,
13
,
5
,
17
)),
mixed1
=
(
rand
(
3
,
5
)
.
astype
(
'float32'
),
rand
(
3
,
5
,
7
)),
mixed2
=
(
rand
(
3
,
5
)
.
astype
(
'float64'
),
rand
(
3
,
5
,
7
))),
good
=
dict
(
correct1
=
(
rand
(
3
,
5
,
7
),
rand
(
3
,
7
,
5
)),
correct2
=
(
rand
(
3
,
5
,
7
),
rand
(
3
,
7
,
9
)),
correct3
=
(
rand
(
3
,
5
,
7
),
rand
(
3
,
7
)),
correct4
=
(
rand
(
3
,
5
),
rand
(
3
,
5
,
7
)),
correct5
=
(
rand
(
3
),
rand
(
3
,
5
,
7
)),
correct6
=
(
rand
(
3
,
5
),
rand
(
3
)),
correct7
=
(
rand
(
3
,
5
),
rand
(
3
,
5
)),
correct8
=
(
rand
(
3
),
rand
(
3
)),
correct9
=
(
rand
(
3
,
5
,
7
,
11
),
rand
(
3
)),
correct10
=
(
rand
(
3
,
7
,
11
,
5
),
rand
(
3
,
5
)),
correct11
=
(
rand
(
3
,
7
,
11
,
5
),
rand
(
3
,
5
,
13
)),
correct12
=
(
rand
(
3
,
7
,
11
,
5
),
rand
(
3
,
13
,
5
,
17
)),
mixed1
=
(
rand
(
3
,
5
)
.
astype
(
'float32'
),
rand
(
3
,
5
,
7
)),
mixed2
=
(
rand
(
3
,
5
)
.
astype
(
'float64'
),
rand
(
3
,
5
,
7
))),
bad_build
=
dict
(
no_batch_axis2
=
(
rand
(),
rand
(
3
,
5
)),
no_batch_axis3
=
(
rand
(
3
,
5
),
rand
())),
bad_runtime
=
dict
(
batch_dim_mismatch1
=
(
rand
(
2
,
5
,
7
),
rand
(
3
,
7
,
9
)),
batch_dim_mismatch2
=
(
rand
(
3
,
5
,
7
),
rand
(
2
,
7
,
9
)),
batch_dim_mismatch3
=
(
rand
(
3
),
rand
(
5
)),
bad_dim1
=
(
rand
(
3
,
5
,
7
),
rand
(
3
,
5
,
7
)),
bad_dim2
=
(
rand
(
3
,
5
,
7
),
rand
(
3
,
8
,
3
)),
bad_dim3
=
(
rand
(
3
,
5
),
rand
(
3
,
7
)),
bad_dim4
=
(
rand
(
3
,
5
,
7
,
11
),
rand
(
3
,
5
)),
bad_dim5
=
(
rand
(
3
,
5
,
7
,
11
),
rand
(
3
,
5
,
13
)),
bad_dim6
=
(
rand
(
3
,
5
,
7
,
11
),
rand
(
3
,
13
,
5
,
17
))))
def
_numpy_second
(
x
,
y
):
def
_numpy_second
(
x
,
y
):
return
numpy
.
broadcast_arrays
(
x
,
y
)[
1
]
return
numpy
.
broadcast_arrays
(
x
,
y
)[
1
]
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论