Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
b69ad54d
提交
b69ad54d
authored
5月 05, 2016
作者:
Xavier Bouthillier
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #4244 from ChihebTrabelsi/ccw2.0
flake8 sandbox/cuda/*.py
上级
200babca
58267dc2
显示空白字符变更
内嵌
并排
正在显示
33 个修改的文件
包含
1043 行增加
和
927 行删除
+1043
-927
GpuConvGrad3D.py
theano/sandbox/cuda/GpuConvGrad3D.py
+6
-7
GpuConvTransp3D.py
theano/sandbox/cuda/GpuConvTransp3D.py
+40
-17
basic_ops.py
theano/sandbox/cuda/basic_ops.py
+43
-43
blas.py
theano/sandbox/cuda/blas.py
+45
-34
elemwise.py
theano/sandbox/cuda/elemwise.py
+167
-91
fftconv.py
theano/sandbox/cuda/fftconv.py
+13
-7
kernel_codegen.py
theano/sandbox/cuda/kernel_codegen.py
+6
-10
neighbours.py
theano/sandbox/cuda/neighbours.py
+1
-1
nnet.py
theano/sandbox/cuda/nnet.py
+32
-35
nvcc_compiler.py
theano/sandbox/cuda/nvcc_compiler.py
+5
-3
opt.py
theano/sandbox/cuda/opt.py
+38
-58
rng_curand.py
theano/sandbox/cuda/rng_curand.py
+13
-16
test_basic_ops.py
theano/sandbox/cuda/tests/test_basic_ops.py
+113
-101
test_bench_loopfusion.py
theano/sandbox/cuda/tests/test_bench_loopfusion.py
+78
-32
test_blas.py
theano/sandbox/cuda/tests/test_blas.py
+78
-68
test_conv_cuda_ndarray.py
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+110
-114
test_cuda_ndarray.py
theano/sandbox/cuda/tests/test_cuda_ndarray.py
+46
-74
test_driver.py
theano/sandbox/cuda/tests/test_driver.py
+4
-3
test_extra_ops.py
theano/sandbox/cuda/tests/test_extra_ops.py
+24
-23
test_gemmcorr3d.py
theano/sandbox/cuda/tests/test_gemmcorr3d.py
+6
-5
test_gradient.py
theano/sandbox/cuda/tests/test_gradient.py
+1
-1
test_memory.py
theano/sandbox/cuda/tests/test_memory.py
+14
-8
test_mlp.py
theano/sandbox/cuda/tests/test_mlp.py
+30
-30
test_neighbours.py
theano/sandbox/cuda/tests/test_neighbours.py
+4
-4
test_opt.py
theano/sandbox/cuda/tests/test_opt.py
+59
-52
test_rng_curand.py
theano/sandbox/cuda/tests/test_rng_curand.py
+1
-1
test_tensor_op.py
theano/sandbox/cuda/tests/test_tensor_op.py
+6
-8
test_var.py
theano/sandbox/cuda/tests/test_var.py
+10
-10
test_viewop.py
theano/sandbox/cuda/tests/test_viewop.py
+1
-2
walltime.py
theano/sandbox/cuda/tests/walltime.py
+34
-21
type.py
theano/sandbox/cuda/type.py
+11
-12
var.py
theano/sandbox/cuda/var.py
+4
-2
test_flake8.py
theano/tests/test_flake8.py
+0
-34
没有找到文件。
theano/sandbox/cuda/GpuConvGrad3D.py
浏览文件 @
b69ad54d
...
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
...
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
d_
=
T
.
as_tensor_variable
(
d
)
d_
=
T
.
as_tensor_variable
(
d
)
WShape_
=
T
.
as_tensor_variable
(
WShape
)
WShape_
=
T
.
as_tensor_variable
(
WShape
)
dCdH_
=
as_cuda_ndarray_variable
(
dCdH
)
dCdH_
=
as_cuda_ndarray_variable
(
dCdH
)
broad
=
(
False
,)
*
5
broad
=
(
False
,)
*
5
return
theano
.
Apply
(
self
,
inputs
=
[
V_
,
d_
,
WShape_
,
dCdH_
],
return
theano
.
Apply
(
self
,
inputs
=
[
V_
,
d_
,
WShape_
,
dCdH_
],
outputs
=
[
CudaNdarrayType
(
dtype
=
V_
.
dtype
,
outputs
=
[
CudaNdarrayType
(
dtype
=
V_
.
dtype
,
broadcastable
=
broad
)()])
broadcastable
=
broad
)()])
...
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
...
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize
=
dCdH
.
shape
[
0
]
batchSize
=
dCdH
.
shape
[
0
]
outputFilters
=
dCdH
.
shape
[
1
]
outputHeight
=
dCdH
.
shape
[
2
]
outputHeight
=
dCdH
.
shape
[
2
]
outputWidth
=
dCdH
.
shape
[
3
]
outputWidth
=
dCdH
.
shape
[
3
]
outputDur
=
dCdH
.
shape
[
4
]
outputDur
=
dCdH
.
shape
[
4
]
assert
V
.
shape
[
0
]
==
batchSize
assert
V
.
shape
[
0
]
==
batchSize
inputFilters
=
V
.
shape
[
1
]
inputHeight
=
V
.
shape
[
2
]
inputWidth
=
V
.
shape
[
3
]
inputDur
=
V
.
shape
[
4
]
dr
,
dc
,
dt
=
d
dr
,
dc
,
dt
=
d
dCdW
=
numpy
.
zeros
(
WShape
,
dtype
=
V
.
dtype
)
dCdW
=
numpy
.
zeros
(
WShape
,
dtype
=
V
.
dtype
)
...
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
...
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
for
p
in
xrange
(
0
,
outputHeight
):
for
p
in
xrange
(
0
,
outputHeight
):
for
q
in
xrange
(
0
,
outputWidth
):
for
q
in
xrange
(
0
,
outputWidth
):
for
r
in
xrange
(
0
,
outputDur
):
for
r
in
xrange
(
0
,
outputDur
):
dCdW
[
j
,
z
,
k
,
l
,
m
]
+=
dCdH
[
i
,
j
,
p
,
q
,
r
]
*
V
[
i
,
z
,
dr
*
p
+
k
,
dc
*
q
+
l
,
dt
*
r
+
m
]
dCdW
[
j
,
z
,
k
,
l
,
m
]
+=
dCdH
[
i
,
j
,
p
,
q
,
r
]
*
\
V
[
i
,
z
,
dr
*
p
+
k
,
dc
*
q
+
l
,
dt
*
r
+
m
]
output_storage
[
0
][
0
]
=
dCdW
output_storage
[
0
][
0
]
=
dCdW
...
...
theano/sandbox/cuda/GpuConvTransp3D.py
浏览文件 @
b69ad54d
...
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
...
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
else
:
else
:
RShape_
=
T
.
as_tensor_variable
([
-
1
,
-
1
,
-
1
])
RShape_
=
T
.
as_tensor_variable
([
-
1
,
-
1
,
-
1
])
return
theano
.
Apply
(
self
,
inputs
=
[
W_
,
b_
,
d_
,
H_
,
RShape_
],
return
theano
.
Apply
(
outputs
=
[
CudaNdarrayType
(
dtype
=
H_
.
dtype
,
self
,
inputs
=
[
W_
,
b_
,
d_
,
H_
,
RShape_
],
broadcastable
=
(
False
,)
*
5
)()])
outputs
=
[
CudaNdarrayType
(
dtype
=
H_
.
dtype
,
broadcastable
=
(
False
,)
*
5
)()])
def
infer_shape
(
self
,
node
,
input_shapes
):
def
infer_shape
(
self
,
node
,
input_shapes
):
W
,
b
,
d
,
H
,
RShape
=
node
.
inputs
W
,
b
,
d
,
H
,
RShape
=
node
.
inputs
...
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
...
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
assert
dc
>
0
assert
dc
>
0
assert
dt
>
0
assert
dt
>
0
videoHeight
=
(
outputHeight
-
1
)
*
dr
+
filterHeight
videoHeight
=
(
outputHeight
-
1
)
*
dr
+
filterHeight
videoWidth
=
(
outputWidth
-
1
)
*
dc
+
filterWidth
videoWidth
=
(
outputWidth
-
1
)
*
dc
+
filterWidth
videoDur
=
(
outputDur
-
1
)
*
dt
+
filterDur
videoDur
=
(
outputDur
-
1
)
*
dt
+
filterDur
if
Rshape
is
not
None
and
Rshape
[
0
]
!=
-
1
:
if
Rshape
is
not
None
and
Rshape
[
0
]
!=
-
1
:
if
Rshape
[
0
]
<
videoHeight
:
if
Rshape
[
0
]
<
videoHeight
:
...
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
...
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
# else:
# else:
# print "No Rshape passed in"
# print "No Rshape passed in"
# print "video size: "
+
str((videoHeight, videoWidth, videoDur))
# print "video size: "
+
str((videoHeight, videoWidth, videoDur))
R
=
numpy
.
zeros
(
(
batchSize
,
inputChannels
,
videoHeight
,
R
=
numpy
.
zeros
((
batchSize
,
inputChannels
,
videoHeight
,
videoWidth
,
videoDur
)
,
dtype
=
H
.
dtype
)
videoWidth
,
videoDur
),
dtype
=
H
.
dtype
)
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
# sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for
i
in
xrange
(
0
,
batchSize
):
for
i
in
xrange
(
0
,
batchSize
):
# print '\texample '+str(i+1)+'/'+str(batchSize)
# print '\texample '+str(i+1)+'/'+str(batchSize)
for
j
in
xrange
(
0
,
inputChannels
):
for
j
in
xrange
(
0
,
inputChannels
):
# print '\t\tfeature map '
+str(j+1)+'/'+
str(inputChannels)
# print '\t\tfeature map '
+ str(j+1) + '/' +
str(inputChannels)
for
r
in
xrange
(
0
,
videoHeight
):
for
r
in
xrange
(
0
,
videoHeight
):
# print '\t\t\trow '
+str(r+1)+
'/'+str(videoHeight)
# print '\t\t\trow '
+ str(r+1) +
'/'+str(videoHeight)
for
c
in
xrange
(
0
,
videoWidth
):
for
c
in
xrange
(
0
,
videoWidth
):
for
t
in
xrange
(
0
,
videoDur
):
for
t
in
xrange
(
0
,
videoDur
):
R
[
i
,
j
,
r
,
c
,
t
]
=
b
[
j
]
R
[
i
,
j
,
r
,
c
,
t
]
=
b
[
j
]
ftc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
t
-
filterDur
+
1
)
/
float
(
dt
)))
])
ftc
=
max
(
fcc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
c
-
filterWidth
+
1
)
/
float
(
dc
)))
])
[
0
,
int
(
numpy
.
ceil
(
rc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
r
-
filterHeight
+
1
)
/
float
(
dr
)))
])
float
(
t
-
filterDur
+
1
)
/
float
(
dt
)
))
]
)
fcc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
c
-
filterWidth
+
1
)
/
float
(
dc
)
))
]
)
rc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
r
-
filterHeight
+
1
)
/
float
(
dr
)
))
]
)
while
rc
<
outputHeight
:
while
rc
<
outputHeight
:
rk
=
r
-
rc
*
dr
rk
=
r
-
rc
*
dr
if
rk
<
0
:
if
rk
<
0
:
...
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
...
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
if
tk
<
0
:
if
tk
<
0
:
break
break
R
[
i
,
j
,
r
,
c
,
t
]
+=
numpy
.
dot
(
W
[:,
j
,
rk
,
ck
,
tk
],
H
[
i
,
:,
rc
,
cc
,
tc
]
)
R
[
i
,
j
,
r
,
c
,
t
]
+=
numpy
.
dot
(
W
[:,
j
,
rk
,
ck
,
tk
],
H
[
i
,
:,
rc
,
cc
,
tc
])
tc
+=
1
tc
+=
1
""
# close loop over tc
""
# close loop over tc
...
...
theano/sandbox/cuda/basic_ops.py
浏览文件 @
b69ad54d
...
@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division
...
@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division
import
copy
import
copy
import
logging
import
logging
import
sys
import
sys
import
warnings
import
numpy
import
numpy
from
six
import
iteritems
from
six
import
iteritems
from
six.moves
import
StringIO
,
xrange
from
six.moves
import
StringIO
,
xrange
...
@@ -12,6 +12,9 @@ from theano import gof, Type, Apply
...
@@ -12,6 +12,9 @@ from theano import gof, Type, Apply
from
theano
import
tensor
,
scalar
,
config
from
theano
import
tensor
,
scalar
,
config
from
theano.gradient
import
grad_undefined
from
theano.gradient
import
grad_undefined
from
theano.scalar
import
Scalar
from
theano.scalar
import
Scalar
from
theano.sandbox.cuda
import
GpuOp
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.elemwise
import
NaiveAlgo
scal
=
scalar
# somewhere scalar gets reassigned to be a function
scal
=
scalar
# somewhere scalar gets reassigned to be a function
...
@@ -24,10 +27,6 @@ try:
...
@@ -24,10 +27,6 @@ try:
except
ImportError
:
except
ImportError
:
pass
pass
from
theano.sandbox.cuda
import
GpuOp
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.elemwise
import
NaiveAlgo
_logger_name
=
'theano.sandbox.cuda.basic_ops'
_logger_name
=
'theano.sandbox.cuda.basic_ops'
_logger
=
logging
.
getLogger
(
_logger_name
)
_logger
=
logging
.
getLogger
(
_logger_name
)
...
@@ -596,10 +595,8 @@ class GpuCAReduce(GpuOp):
...
@@ -596,10 +595,8 @@ class GpuCAReduce(GpuOp):
if
self
.
pre_scalar_op
:
if
self
.
pre_scalar_op
:
pre
=
"pre=
%
s,red="
%
str
(
self
.
pre_scalar_op
)
pre
=
"pre=
%
s,red="
%
str
(
self
.
pre_scalar_op
)
return
"GpuCAReduce{
%
s
%
s}{
%
s}"
%
(
return
"GpuCAReduce{
%
s
%
s}{
%
s}"
%
(
pre
,
pre
,
str
(
self
.
scalar_op
),
str
(
self
.
scalar_op
),
','
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
))
','
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)
)
def
__setstate__
(
self
,
d
):
def
__setstate__
(
self
,
d
):
self
.
__dict__
.
update
(
d
)
self
.
__dict__
.
update
(
d
)
...
@@ -775,15 +772,18 @@ class GpuCAReduce(GpuOp):
...
@@ -775,15 +772,18 @@ class GpuCAReduce(GpuOp):
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: if only some dims are ccontiguous, call version with less dims.
# TODO: if only some dims are ccontiguous, call version with less dims.
print
(
'if(CudaNdarray_is_c_contiguous(
%(x)
s)){'
%
locals
(),
file
=
sio
)
print
(
'if(CudaNdarray_is_c_contiguous(
%(x)
s)){'
%
locals
(),
file
=
sio
)
self
.
c_code_reduce_ccontig
(
sio
,
node
,
name
,
x
,
z
,
fail
)
self
.
c_code_reduce_ccontig
(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
(
"}else{"
,
file
=
sio
)
print
(
"}else{"
,
file
=
sio
)
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
(
"}"
,
file
=
sio
)
print
(
"}"
,
file
=
sio
)
else
:
else
:
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
# \end bracket the reduction ...
# \end bracket the reduction ...
print
(
"""
print
(
"""
...
@@ -976,7 +976,7 @@ class GpuCAReduce(GpuOp):
...
@@ -976,7 +976,7 @@ class GpuCAReduce(GpuOp):
assert
isinstance
(
self
.
scalar_op
,
(
scal
.
Maximum
,
assert
isinstance
(
self
.
scalar_op
,
(
scal
.
Maximum
,
scal
.
Minimum
))
scal
.
Minimum
))
if
self
.
pre_scalar_op
:
if
self
.
pre_scalar_op
:
#dtype = node.inputs[0].dtype
#
dtype = node.inputs[0].dtype
dtype
=
'float32'
dtype
=
'float32'
dummy_var
=
scal
.
Scalar
(
dtype
=
dtype
)()
dummy_var
=
scal
.
Scalar
(
dtype
=
dtype
)()
...
@@ -1834,12 +1834,15 @@ class GpuCAReduce(GpuOp):
...
@@ -1834,12 +1834,15 @@ class GpuCAReduce(GpuOp):
version
=
[
15
]
# the version corresponding to the c code in this Op
version
=
[
15
]
# the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
# now we insert versions for the ops on which we depend...
scalar_node
=
Apply
(
self
.
scalar_op
,
Apply
(
self
.
scalar_op
,
[
Scalar
(
dtype
=
input
.
type
.
dtype
)()
for
input
in
node
.
inputs
],
[
Scalar
(
[
Scalar
(
dtype
=
output
.
type
.
dtype
)()
for
output
in
node
.
outputs
])
dtype
=
input
.
type
.
dtype
)()
for
input
in
node
.
inputs
],
[
Scalar
(
dtype
=
output
.
type
.
dtype
)()
for
output
in
node
.
outputs
])
version
.
extend
(
self
.
scalar_op
.
c_code_cache_version
())
version
.
extend
(
self
.
scalar_op
.
c_code_cache_version
())
for
i
in
node
.
inputs
+
node
.
outputs
:
for
i
in
node
.
inputs
+
node
.
outputs
:
version
.
extend
(
Scalar
(
dtype
=
i
.
type
.
dtype
)
.
c_code_cache_version
())
version
.
extend
(
Scalar
(
dtype
=
i
.
type
.
dtype
)
.
c_code_cache_version
())
if
all
(
version
):
if
all
(
version
):
return
tuple
(
version
)
return
tuple
(
version
)
else
:
else
:
...
@@ -1946,10 +1949,11 @@ class GpuCAReduce(GpuOp):
...
@@ -1946,10 +1949,11 @@ class GpuCAReduce(GpuOp):
%(reducebuf)
s
%(reducebuf)
s
}
}
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
#01, 011, 0111
#
01, 011, 0111
if
(
0
==
self
.
reduce_mask
[
0
]
and
if
(
0
==
self
.
reduce_mask
[
0
]
and
all
(
self
.
reduce_mask
[
1
:])
and
all
(
self
.
reduce_mask
[
1
:])
and
nd_in
in
[
2
,
3
,
4
]):
nd_in
in
[
2
,
3
,
4
]):
# this kernel uses one block for each row.
# this kernel uses one block for each row.
# threads per block for each element per row.
# threads per block for each element per row.
...
@@ -2117,10 +2121,10 @@ class GpuCAReduce(GpuOp):
...
@@ -2117,10 +2121,10 @@ class GpuCAReduce(GpuOp):
# this kernel uses one block for multiple column(up to 32TODO),
# this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column.
# threads per block for each element per column.
# thread.x = dim 2 contiguous
# thread.x = dim 2 contiguous
# thread.y = dim 1
# thread.y = dim 1
# block.x = dim 0
# block.x = dim 0
# block.y = dim 1 rest
# block.y = dim 1 rest
init
=
self
.
_k_init
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
decl
=
self
.
_k_decl
(
node
,
nodename
,
pattern
=
"010_inner"
)
decl
=
self
.
_k_decl
(
node
,
nodename
,
pattern
=
"010_inner"
)
reducebuf
=
self
.
_k_reduce_buf_multiple
(
'Z[i0 * sZ0 + i2*sZ1]'
,
reducebuf
=
self
.
_k_reduce_buf_multiple
(
'Z[i0 * sZ0 + i2*sZ1]'
,
...
@@ -2470,7 +2474,7 @@ class GpuReshape(tensor.Reshape, GpuOp):
...
@@ -2470,7 +2474,7 @@ class GpuReshape(tensor.Reshape, GpuOp):
if
(
x
.
size
%
ss
)
!=
0
:
if
(
x
.
size
%
ss
)
!=
0
:
raise
ValueError
(
"When using -1 in new shape, the computed new shape must be an multiple of the original shape."
)
raise
ValueError
(
"When using -1 in new shape, the computed new shape must be an multiple of the original shape."
)
shp_new
=
numpy
.
copy
(
shp
)
shp_new
=
numpy
.
copy
(
shp
)
shp_new
[
m1_idx
]
=
x
.
size
/
ss
shp_new
[
m1_idx
]
=
x
.
size
/
ss
shp
=
shp_new
shp
=
shp_new
else
:
else
:
...
@@ -2721,7 +2725,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
...
@@ -2721,7 +2725,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
def
perform
(
self
,
node
,
inp
,
out_
):
def
perform
(
self
,
node
,
inp
,
out_
):
# This don't work as CudaNdarray_Subscript() don't support it.
# This don't work as CudaNdarray_Subscript() don't support it.
#super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
#
super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
x
,
idx
=
inp
x
,
idx
=
inp
out
,
=
out_
out
,
=
out_
x_orig
=
x
x_orig
=
x
...
@@ -2733,7 +2737,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
...
@@ -2733,7 +2737,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
if
x
.
ndim
<=
3
:
if
x
.
ndim
<=
3
:
# CudaNdarray.take only supports ndim <= 3
# CudaNdarray.take only supports ndim <= 3
if
self
.
perform_using_take
is
not
None
:
if
self
.
perform_using_take
is
not
None
:
assert
self
.
perform_using_take
==
True
,
(
assert
self
.
perform_using_take
is
True
,
(
"GpuAdvancedSubtensor1 used the fast version"
)
"GpuAdvancedSubtensor1 used the fast version"
)
if
idx
.
dtype
!=
numpy
.
int64
:
if
idx
.
dtype
!=
numpy
.
int64
:
if
idx
.
dtype
in
[
numpy
.
int8
,
numpy
.
int16
,
numpy
.
int32
,
if
idx
.
dtype
in
[
numpy
.
int8
,
numpy
.
int16
,
numpy
.
int32
,
...
@@ -2762,7 +2766,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
...
@@ -2762,7 +2766,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
out
[
0
]
=
o
out
[
0
]
=
o
else
:
else
:
if
self
.
perform_using_take
is
not
None
:
if
self
.
perform_using_take
is
not
None
:
assert
self
.
perform_using_take
==
False
,
(
assert
self
.
perform_using_take
is
False
,
(
"GpuAdvancedSubtensor1 didn't use the fast version"
)
"GpuAdvancedSubtensor1 didn't use the fast version"
)
if
out_
[
0
][
0
]
is
None
or
out_
[
0
][
0
]
.
shape
!=
out_shape
:
if
out_
[
0
][
0
]
is
None
or
out_
[
0
][
0
]
.
shape
!=
out_shape
:
o
=
cuda_ndarray
.
cuda_ndarray
.
CudaNdarray
.
zeros
(
out_shape
)
o
=
cuda_ndarray
.
cuda_ndarray
.
CudaNdarray
.
zeros
(
out_shape
)
...
@@ -3006,8 +3010,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
...
@@ -3006,8 +3010,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
convert_map
=
{
8
:
tensor
.
basic
.
_convert_to_int8
,
convert_map
=
{
8
:
tensor
.
basic
.
_convert_to_int8
,
16
:
tensor
.
basic
.
_convert_to_int16
,
16
:
tensor
.
basic
.
_convert_to_int16
,
32
:
tensor
.
basic
.
_convert_to_int32
,
32
:
tensor
.
basic
.
_convert_to_int32
,
64
:
tensor
.
basic
.
_convert_to_int64
64
:
tensor
.
basic
.
_convert_to_int64
}
}
intwidth
=
theano
.
configdefaults
.
python_int_bitwidth
()
intwidth
=
theano
.
configdefaults
.
python_int_bitwidth
()
ilist_
=
convert_map
[
intwidth
](
ilist_
)
ilist_
=
convert_map
[
intwidth
](
ilist_
)
...
@@ -3354,7 +3357,6 @@ class GpuFlatten(gof.HideC, tensor.Flatten, GpuOp):
...
@@ -3354,7 +3357,6 @@ class GpuFlatten(gof.HideC, tensor.Flatten, GpuOp):
return
Apply
(
self
,
[
x
],
[
out_type
()])
return
Apply
(
self
,
[
x
],
[
out_type
()])
def
gpu_flatten
(
x
,
outdim
=
1
):
def
gpu_flatten
(
x
,
outdim
=
1
):
"""
"""
Implement flatten on the gpu.
Implement flatten on the gpu.
...
@@ -3378,9 +3380,9 @@ def gpu_flatten(x, outdim=1):
...
@@ -3378,9 +3380,9 @@ def gpu_flatten(x, outdim=1):
"""
"""
x
=
as_cuda_ndarray_variable
(
x
)
x
=
as_cuda_ndarray_variable
(
x
)
if
outdim
>
1
:
if
outdim
>
1
:
dims
=
tuple
(
x
.
shape
[:
outdim
-
1
])
+
(
-
1
,
)
dims
=
tuple
(
x
.
shape
[:
outdim
-
1
])
+
(
-
1
,
)
else
:
else
:
dims
=
(
-
1
,)
dims
=
(
-
1
,
)
return
GpuReshape
(
outdim
)(
x
,
dims
)
return
GpuReshape
(
outdim
)(
x
,
dims
)
...
@@ -3408,12 +3410,11 @@ class GpuJoin(tensor.Join, GpuOp):
...
@@ -3408,12 +3410,11 @@ class GpuJoin(tensor.Join, GpuOp):
as_tensor_variable_args
=
[
as_cuda_ndarray_variable
(
x
)
as_tensor_variable_args
=
[
as_cuda_ndarray_variable
(
x
)
for
x
in
tensors
]
for
x
in
tensors
]
output_maker
=
\
def
output_maker
(
bcast
):
lambda
bcast
:
CudaNdarrayType
(
broadcastable
=
bcast
)(
)
return
(
CudaNdarrayType
(
broadcastable
=
bcast
)()
)
return
tensor
.
Join
.
_make_node_internal
(
self
,
return
tensor
.
Join
.
_make_node_internal
(
axis
,
tensors
,
self
,
axis
,
tensors
,
as_tensor_variable_args
,
output_maker
)
as_tensor_variable_args
,
output_maker
)
def
perform
(
self
,
node
,
axis_and_tensors
,
out_
):
def
perform
(
self
,
node
,
axis_and_tensors
,
out_
):
out
,
=
out_
out
,
=
out_
...
@@ -3464,7 +3465,7 @@ class GpuJoin(tensor.Join, GpuOp):
...
@@ -3464,7 +3465,7 @@ class GpuJoin(tensor.Join, GpuOp):
# except for 'axis'
# except for 'axis'
def
construct_slices
(
curlen
):
def
construct_slices
(
curlen
):
slices
=
[
slice
(
None
,
None
,
None
)
for
i
in
\
slices
=
[
slice
(
None
,
None
,
None
)
for
i
in
xrange
(
len
(
template_shape
))]
xrange
(
len
(
template_shape
))]
slices
[
axis
]
=
slice
(
curpos
,
curpos
+
curlen
,
None
)
slices
[
axis
]
=
slice
(
curpos
,
curpos
+
curlen
,
None
)
return
tuple
(
slices
)
return
tuple
(
slices
)
...
@@ -3829,8 +3830,8 @@ class GpuAlloc(GpuAllocEmpty):
...
@@ -3829,8 +3830,8 @@ class GpuAlloc(GpuAllocEmpty):
# If the output is a constant, it will have to be deepcopied
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
# each time the function is called. So we do not fold.
return
False
return
False
elif
(
# T
he following ops work inplace of their input id 0.
# Else if t
he following ops work inplace of their input id 0.
client
[
1
]
==
0
and
elif
(
client
[
1
]
==
0
and
isinstance
(
client
[
0
]
.
op
,
(
isinstance
(
client
[
0
]
.
op
,
(
# Ops that will work inplace on the Alloc. So if they
# Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the
# get constant_folded, they would copy the
...
@@ -3844,8 +3845,7 @@ class GpuAlloc(GpuAllocEmpty):
...
@@ -3844,8 +3845,7 @@ class GpuAlloc(GpuAllocEmpty):
GpuAdvancedIncSubtensor1
,
GpuAdvancedIncSubtensor1
,
theano
.
sandbox
.
cuda
.
blas
.
GpuGemm
,
theano
.
sandbox
.
cuda
.
blas
.
GpuGemm
,
theano
.
sandbox
.
cuda
.
blas
.
GpuGemv
,
theano
.
sandbox
.
cuda
.
blas
.
GpuGemv
,
theano
.
sandbox
.
cuda
.
blas
.
GpuGer
,
theano
.
sandbox
.
cuda
.
blas
.
GpuGer
,))):
))):
return
False
return
False
# If the clients is a transfer, we don't want to fold. We
# If the clients is a transfer, we don't want to fold. We
# let the moving opt finish before deciding what to do.
# let the moving opt finish before deciding what to do.
...
...
theano/sandbox/cuda/blas.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
copy
import
os
import
os
import
logging
import
logging
_logger
=
logging
.
getLogger
(
__name__
)
from
six
import
integer_types
from
six
import
integer_types
from
six.moves
import
StringIO
,
reduce
from
six.moves
import
StringIO
,
reduce
import
theano
import
theano
from
theano
import
Apply
from
theano
import
Apply
from
theano
import
tensor
from
theano
import
tensor
...
@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp
...
@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp
from
theano.sandbox.cuda.basic_ops
import
(
as_cuda_ndarray_variable
,
from
theano.sandbox.cuda.basic_ops
import
(
as_cuda_ndarray_variable
,
gpu_contiguous
)
gpu_contiguous
)
from
theano.tensor
import
as_tensor_variable
from
theano.tensor
import
as_tensor_variable
_logger
=
logging
.
getLogger
(
__name__
)
class
GpuBatchedDot
(
GpuOp
):
class
GpuBatchedDot
(
GpuOp
):
...
@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp):
...
@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp):
}
}
} else {
} else {
// copy inputs if not contiguous
// copy inputs if not contiguous
"""
+
"""
+
(
"
\n
"
.
join
(
"""
(
"
\n
"
.
join
(
"""
if (( CudaNdarray_HOST_DIMS(
%(var)
s)[0] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[0] != 1
if (( CudaNdarray_HOST_DIMS(
%(var)
s)[0] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[0] != 1
&& CudaNdarray_HOST_DIMS(
%(var)
s)[1] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[1] != 1
&& CudaNdarray_HOST_DIMS(
%(var)
s)[1] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[1] != 1
&& CudaNdarray_HOST_DIMS(
%(var)
s)[2] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[2] != 1)
&& CudaNdarray_HOST_DIMS(
%(var)
s)[2] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[2] != 1)
...
@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp):
...
@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp):
Py_XDECREF(
%(var)
s);
Py_XDECREF(
%(var)
s);
%(var)
s = _copy;
%(var)
s = _copy;
}
}
"""
%
dict
(
var
=
var
,
fail
=
fail
)
for
var
in
(
bx
,
by
)))
"""
%
dict
(
var
=
var
,
fail
=
fail
)
for
var
in
(
bx
,
by
)))
+
"""
+
"""
// fail if the output is not contiguous; we can't copy it because we
// fail if the output is not contiguous; we can't copy it because we
// need to write to the original memory
// need to write to the original memory
...
@@ -537,8 +532,8 @@ class GpuGemm(GpuOp):
...
@@ -537,8 +532,8 @@ class GpuGemm(GpuOp):
return
'GpuGemm{no_inplace}'
return
'GpuGemm{no_inplace}'
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
\
return
(
type
(
self
)
==
type
(
other
)
and
and
self
.
inplace
==
other
.
inplace
)
self
.
inplace
==
other
.
inplace
)
def
__hash__
(
self
):
def
__hash__
(
self
):
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
...
@@ -562,7 +557,7 @@ class GpuGemm(GpuOp):
...
@@ -562,7 +557,7 @@ class GpuGemm(GpuOp):
return
(
4
,)
return
(
4
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
#
z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in
# inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out.
# not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
,
b
=
inputs
z_in
,
a
,
x
,
y
,
b
=
inputs
...
@@ -657,8 +652,8 @@ class GpuGemv(GpuOp):
...
@@ -657,8 +652,8 @@ class GpuGemv(GpuOp):
return
'GpuGemv{no_inplace}'
return
'GpuGemv{no_inplace}'
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
\
return
(
type
(
self
)
==
type
(
other
)
and
and
self
.
inplace
==
other
.
inplace
)
self
.
inplace
==
other
.
inplace
)
def
__hash__
(
self
):
def
__hash__
(
self
):
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
...
@@ -682,7 +677,7 @@ class GpuGemv(GpuOp):
...
@@ -682,7 +677,7 @@ class GpuGemv(GpuOp):
return
(
3
,)
return
(
3
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
#
z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in
# inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out.
# not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
,
b
=
inputs
z_in
,
a
,
x
,
y
,
b
=
inputs
...
@@ -757,8 +752,8 @@ class GpuGer(GpuOp):
...
@@ -757,8 +752,8 @@ class GpuGer(GpuOp):
return
'GpuGer{no_inplace}'
return
'GpuGer{no_inplace}'
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
\
return
(
type
(
self
)
==
type
(
other
)
and
and
self
.
inplace
==
other
.
inplace
)
self
.
inplace
==
other
.
inplace
)
def
__hash__
(
self
):
def
__hash__
(
self
):
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
...
@@ -782,7 +777,7 @@ class GpuGer(GpuOp):
...
@@ -782,7 +777,7 @@ class GpuGer(GpuOp):
return
(
2
,)
return
(
2
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
#
z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in
# inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out.
# not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
=
inputs
z_in
,
a
,
x
,
y
=
inputs
...
@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
...
@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
bottom
,
top
=
inp
[:
2
]
bottom
,
top
=
inp
[:
2
]
weights
,
=
grads
weights
,
=
grads
weights
=
gpu_contiguous
(
weights
)
weights
=
gpu_contiguous
(
weights
)
d_bottom
=
GpuCorrMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
)(
d_bottom
=
GpuCorrMM_gradInputs
(
weights
,
top
,
bottom
.
shape
[
-
2
:])
self
.
border_mode
,
self
.
subsample
)(
weights
,
d_top
=
GpuCorrMM
(
self
.
border_mode
,
self
.
subsample
)(
top
,
bottom
,
weights
)
bottom
.
shape
[
-
2
:])
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),)
*
2
if
len
(
inp
)
==
4
else
()
d_top
=
GpuCorrMM
(
self
.
border_mode
,
self
.
subsample
)(
bottom
,
weights
)
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),
)
*
2
if
len
(
inp
)
==
4
else
()
return
(
d_bottom
,
d_top
)
+
d_height_width
return
(
d_bottom
,
d_top
)
+
d_height_width
def
connection_pattern
(
self
,
node
):
def
connection_pattern
(
self
,
node
):
...
@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
...
@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
weights
,
top
=
inp
[:
2
]
weights
,
top
=
inp
[:
2
]
bottom
,
=
grads
bottom
,
=
grads
bottom
=
gpu_contiguous
(
bottom
)
bottom
=
gpu_contiguous
(
bottom
)
d_weights
=
GpuCorrMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
)(
d_weights
=
GpuCorrMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
)(
bottom
,
top
,
weights
.
shape
[
-
2
:])
bottom
,
top
,
weights
.
shape
[
-
2
:])
d_top
=
GpuCorrMM
(
self
.
border_mode
,
self
.
subsample
)(
d_top
=
GpuCorrMM
(
bottom
,
weights
)
self
.
border_mode
,
self
.
subsample
)(
bottom
,
weights
)
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),)
*
2
if
len
(
inp
)
==
4
else
()
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),
)
*
2
if
len
(
inp
)
==
4
else
()
return
(
d_weights
,
d_top
)
+
d_height_width
return
(
d_weights
,
d_top
)
+
d_height_width
def
connection_pattern
(
self
,
node
):
def
connection_pattern
(
self
,
node
):
...
@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
...
@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
bottom
,
weights
=
inp
bottom
,
weights
=
inp
top
,
=
grads
top
,
=
grads
top
=
gpu_contiguous
(
top
)
top
=
gpu_contiguous
(
top
)
d_bottom
=
GpuCorr3dMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
d_bottom
=
GpuCorr3dMM_gradInputs
(
self
.
border_mode
,
weights
,
top
,
bottom
.
shape
[
-
3
:])
self
.
subsample
,
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
self
.
pad
)(
weights
,
bottom
,
top
,
weights
.
shape
[
-
3
:])
top
,
bottom
.
shape
[
-
3
:])
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
return
d_bottom
,
d_weights
return
d_bottom
,
d_weights
...
@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
...
@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
weights
,
top
=
inp
[:
2
]
weights
,
top
=
inp
[:
2
]
bottom
,
=
grads
bottom
,
=
grads
bottom
=
gpu_contiguous
(
bottom
)
bottom
=
gpu_contiguous
(
bottom
)
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
bottom
,
top
,
weights
.
shape
[
-
3
:])
d_top
=
GpuCorr3dMM
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
d_top
=
GpuCorr3dMM
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
weights
)
bottom
,
weights
)
d_height_width_depth
=
(
theano
.
gradient
.
DisconnectedType
()(),)
*
3
if
len
(
inp
)
==
5
else
()
d_height_width_depth
=
(
theano
.
gradient
.
DisconnectedType
()(),)
\
*
3
if
len
(
inp
)
==
5
else
()
return
(
d_weights
,
d_top
)
+
d_height_width_depth
return
(
d_weights
,
d_top
)
+
d_height_width_depth
def
connection_pattern
(
self
,
node
):
def
connection_pattern
(
self
,
node
):
...
@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp):
...
@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp):
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
# def perform(self, node, input_storage, output_storage):
# def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented')
#
raise NotImplementedError('only C is implemented')
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
6
)
return
(
6
)
...
...
theano/sandbox/cuda/elemwise.py
浏览文件 @
b69ad54d
...
@@ -108,8 +108,8 @@ class NaiveAlgo(object):
...
@@ -108,8 +108,8 @@ class NaiveAlgo(object):
s
=
", "
.
join
([
"float * o
%
i_data"
%
ipos
]
+
s
=
", "
.
join
([
"float * o
%
i_data"
%
ipos
]
+
[
"int o
%
i_str_
%
i"
%
(
ipos
,
d
)
for
d
in
xrange
(
nd
)])
[
"int o
%
i_str_
%
i"
%
(
ipos
,
d
)
for
d
in
xrange
(
nd
)])
print
(
"
\t
,"
,
s
,
file
=
sio
)
print
(
"
\t
,"
,
s
,
file
=
sio
)
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#
print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos
#
print >> sio, "\t,", "float * o%i_data" % ipos
print
(
"
\t
)
\n
{"
,
file
=
sio
)
print
(
"
\t
)
\n
{"
,
file
=
sio
)
print
(
" const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
,
file
=
sio
)
print
(
" const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
,
file
=
sio
)
print
(
" const int numThreads = blockDim.x * gridDim.x;"
,
file
=
sio
)
print
(
" const int numThreads = blockDim.x * gridDim.x;"
,
file
=
sio
)
...
@@ -129,7 +129,7 @@ class NaiveAlgo(object):
...
@@ -129,7 +129,7 @@ class NaiveAlgo(object):
print
(
" const float * ii_i
%
i_data = i
%
i_data;"
%
(
ipos
,
ipos
),
file
=
sio
)
print
(
" const float * ii_i
%
i_data = i
%
i_data;"
%
(
ipos
,
ipos
),
file
=
sio
)
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
print
(
" float * ii_o
%
i_data = o
%
i_data;"
%
(
ipos
,
ipos
),
file
=
sio
)
print
(
" float * ii_o
%
i_data = o
%
i_data;"
%
(
ipos
,
ipos
),
file
=
sio
)
for
d
in
xrange
(
nd
-
1
,
-
1
,
-
1
):
for
d
in
xrange
(
nd
-
1
,
-
1
,
-
1
):
if
d
>
0
:
if
d
>
0
:
print
(
" int pos
%
i = ii
%%
dim
%
i;"
%
(
d
,
d
),
file
=
sio
)
print
(
" int pos
%
i = ii
%%
dim
%
i;"
%
(
d
,
d
),
file
=
sio
)
print
(
" ii = ii / dim
%
i;"
%
d
,
file
=
sio
)
print
(
" ii = ii / dim
%
i;"
%
d
,
file
=
sio
)
...
@@ -161,9 +161,9 @@ class NaiveAlgo(object):
...
@@ -161,9 +161,9 @@ class NaiveAlgo(object):
print
(
"ii_o
%
i_data[0] = o
%
i_i;"
%
(
ipos
,
ipos
),
file
=
sio
)
print
(
"ii_o
%
i_data[0] = o
%
i_i;"
%
(
ipos
,
ipos
),
file
=
sio
)
print
(
" }"
,
file
=
sio
)
print
(
" }"
,
file
=
sio
)
#indent = " "*(4*d+7)
#
indent = " "*(4*d+7)
# for ipos, i in enumerate(node.inputs):
# for ipos, i in enumerate(node.inputs):
#
print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
#
print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print
(
"}"
,
file
=
sio
)
print
(
"}"
,
file
=
sio
)
# print sio.getvalue()
# print sio.getvalue()
...
@@ -211,10 +211,11 @@ class NaiveAlgo(object):
...
@@ -211,10 +211,11 @@ class NaiveAlgo(object):
print
(
"// Input "
,
ipos
,
str
(
i
.
type
),
file
=
sio
)
print
(
"// Input "
,
ipos
,
str
(
i
.
type
),
file
=
sio
)
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
print
(
"// Output "
,
ipos
,
str
(
i
.
type
),
file
=
sio
)
print
(
"// Output "
,
ipos
,
str
(
i
.
type
),
file
=
sio
)
print
(
"static __global__ void kernel_
%
s_
%
s_
%
s(unsigned int numEls"
%
(
print
(
self
.
scalar_op
.
__class__
.
__name__
,
"static __global__ void kernel_
%
s_
%
s_
%
s(unsigned int numEls"
%
(
self
.
scalar_op
.
__class__
.
__name__
,
nodename
,
nodename
,
'tiling
%
i'
%
nd
),
file
=
sio
)
'tiling
%
i'
%
nd
),
file
=
sio
)
if
(
nd
):
if
(
nd
):
print
(
"
\t
,"
,
", "
.
join
(
"const int dim
%
i"
%
i
for
i
in
xrange
(
nd
)),
file
=
sio
)
print
(
"
\t
,"
,
", "
.
join
(
"const int dim
%
i"
%
i
for
i
in
xrange
(
nd
)),
file
=
sio
)
# declare inputs
# declare inputs
...
@@ -225,15 +226,15 @@ class NaiveAlgo(object):
...
@@ -225,15 +226,15 @@ class NaiveAlgo(object):
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
s
=
", "
.
join
([
"float * o
%
i_data"
%
ipos
]
+
list
(
"int o
%
i_str_
%
i"
%
(
ipos
,
d
)
for
d
in
xrange
(
nd
)))
s
=
", "
.
join
([
"float * o
%
i_data"
%
ipos
]
+
list
(
"int o
%
i_str_
%
i"
%
(
ipos
,
d
)
for
d
in
xrange
(
nd
)))
print
(
"
\t
,"
,
s
,
file
=
sio
)
print
(
"
\t
,"
,
s
,
file
=
sio
)
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#
print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos
#
print >> sio, "\t,", "float * o%i_data" % ipos
print
(
"
\t
)
\n
{"
,
file
=
sio
)
print
(
"
\t
)
\n
{"
,
file
=
sio
)
# For each input that is a scalar which has been broadcasted to a tensor,
# For each input that is a scalar which has been broadcasted to a tensor,
# load it into a local variable
# load it into a local variable
print
(
" __shared__ float value0[
%
i];"
%
len
(
node
.
inputs
),
file
=
sio
)
print
(
" __shared__ float value0[
%
i];"
%
len
(
node
.
inputs
),
file
=
sio
)
print
(
" __shared__ int shared_dims[
%(nd)
s];"
%
locals
(),
file
=
sio
)
print
(
" __shared__ int shared_dims[
%(nd)
s];"
%
locals
(),
file
=
sio
)
#print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
#
print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
print
(
" if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
,
file
=
sio
)
print
(
" if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
,
file
=
sio
)
for
ipos
,
i
in
enumerate
(
node
.
inputs
):
for
ipos
,
i
in
enumerate
(
node
.
inputs
):
if
_logical_scalar
(
i
):
if
_logical_scalar
(
i
):
...
@@ -274,15 +275,18 @@ class NaiveAlgo(object):
...
@@ -274,15 +275,18 @@ class NaiveAlgo(object):
# perform the scalar operation on the input and output references
# perform the scalar operation on the input and output references
# TODO: What if the scalar_op needs support_code??
# TODO: What if the scalar_op needs support_code??
task_code
=
self
.
scalar_op
.
c_code
(
task_code
=
self
.
scalar_op
.
c_code
(
Apply
(
self
.
scalar_op
,
Apply
(
[
scalar
.
Scalar
(
dtype
=
input
.
type
.
dtype
)
.
make_variable
()
self
.
scalar_op
,
[
scalar
.
Scalar
(
dtype
=
input
.
type
.
dtype
)
.
make_variable
()
for
input
in
node
.
inputs
],
for
input
in
node
.
inputs
],
[
scalar
.
Scalar
(
dtype
=
output
.
type
.
dtype
)
.
make_variable
()
[
scalar
.
Scalar
(
for
output
in
node
.
outputs
])
dtype
=
output
.
type
.
dtype
)
.
make_variable
()
,
nodename
+
'_scalar_'
for
output
in
node
.
outputs
]),
,
get_str_list_logical_scalar
(
node
,
value_str
=
'value0[
%
i]'
)
nodename
+
'_scalar_'
,
,
[
'ii_o
%
i_data[0]'
%
ipos
for
ipos
,
i
in
enumerate
(
node
.
outputs
)]
get_str_list_logical_scalar
(
node
,
value_str
=
'value0[
%
i]'
),
,
sub
=
dict
(
fail
=
'return;'
))
# TODO: set a failure code somehow!!!
[
'ii_o
%
i_data[0]'
%
ipos
for
ipos
,
i
in
enumerate
(
node
.
outputs
)],
sub
=
dict
(
fail
=
'return;'
))
# TODO: set a failure code somehow!!!
print
(
" "
,
task_code
,
file
=
sio
)
print
(
" "
,
task_code
,
file
=
sio
)
print
(
" }"
*
nd
,
file
=
sio
)
print
(
" }"
*
nd
,
file
=
sio
)
...
@@ -290,9 +294,9 @@ class NaiveAlgo(object):
...
@@ -290,9 +294,9 @@ class NaiveAlgo(object):
# TODO: insert runtime stride checks that select the best loop order either here, or in
# TODO: insert runtime stride checks that select the best loop order either here, or in
# the host code that launched the kernel (host code probably better spot)
# the host code that launched the kernel (host code probably better spot)
#indent = " "*(4*d+7)
#
indent = " "*(4*d+7)
# for ipos, i in enumerate(node.inputs):
# for ipos, i in enumerate(node.inputs):
#
print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
#
print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print
(
"}"
,
file
=
sio
)
print
(
"}"
,
file
=
sio
)
print
(
sio
.
getvalue
())
print
(
sio
.
getvalue
())
...
@@ -319,10 +323,11 @@ class NaiveAlgo(object):
...
@@ -319,10 +323,11 @@ class NaiveAlgo(object):
print
(
"// Input "
,
ipos
,
str
(
i
.
type
),
file
=
sio
)
print
(
"// Input "
,
ipos
,
str
(
i
.
type
),
file
=
sio
)
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
print
(
"// Output "
,
ipos
,
str
(
i
.
type
),
file
=
sio
)
print
(
"// Output "
,
ipos
,
str
(
i
.
type
),
file
=
sio
)
print
(
"static __global__ void kernel_
%
s_
%
s_
%
s(unsigned int numEls"
%
(
print
(
self
.
scalar_op
.
__class__
.
__name__
,
"static __global__ void kernel_
%
s_
%
s_
%
s(unsigned int numEls"
%
(
self
.
scalar_op
.
__class__
.
__name__
,
nodename
,
nodename
,
'tiling
%
i_less_registers'
%
nd
),
file
=
sio
)
'tiling
%
i_less_registers'
%
nd
),
file
=
sio
)
if
(
nd
):
if
(
nd
):
print
(
"
\t
,"
,
", "
.
join
(
"const int dim
%
i"
%
i
for
i
in
xrange
(
nd
)),
file
=
sio
)
print
(
"
\t
,"
,
", "
.
join
(
"const int dim
%
i"
%
i
for
i
in
xrange
(
nd
)),
file
=
sio
)
# declare inputs
# declare inputs
...
@@ -333,8 +338,8 @@ class NaiveAlgo(object):
...
@@ -333,8 +338,8 @@ class NaiveAlgo(object):
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
s
=
", "
.
join
([
"float * o
%
i_data_0"
%
ipos
]
+
list
(
"int o
%
i_str_
%
i"
%
(
ipos
,
d
)
for
d
in
xrange
(
nd
)))
s
=
", "
.
join
([
"float * o
%
i_data_0"
%
ipos
]
+
list
(
"int o
%
i_str_
%
i"
%
(
ipos
,
d
)
for
d
in
xrange
(
nd
)))
print
(
"
\t
,"
,
s
,
file
=
sio
)
print
(
"
\t
,"
,
s
,
file
=
sio
)
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#
print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos
#
print >> sio, "\t,", "float * o%i_data" % ipos
print
(
"
\t
)
\n
{"
,
file
=
sio
)
print
(
"
\t
)
\n
{"
,
file
=
sio
)
# TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet.
# TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet.
...
@@ -350,6 +355,7 @@ class NaiveAlgo(object):
...
@@ -350,6 +355,7 @@ class NaiveAlgo(object):
return
"s
%
s_str[
%
i][
%
i]"
%
(
io
,
p
,
d
)
return
"s
%
s_str[
%
i][
%
i]"
%
(
io
,
p
,
d
)
else
:
else
:
return
"
%
s
%
i_str_
%
i"
%
(
io
,
p
,
d
)
return
"
%
s
%
i_str_
%
i"
%
(
io
,
p
,
d
)
def
limits
(
d
):
def
limits
(
d
):
if
use_shared_limits
:
if
use_shared_limits
:
return
"limits[
%
i]"
%
d
return
"limits[
%
i]"
%
d
...
@@ -417,15 +423,19 @@ class NaiveAlgo(object):
...
@@ -417,15 +423,19 @@ class NaiveAlgo(object):
def
task_code
(
d
):
def
task_code
(
d
):
print
(
self
.
scalar_op
.
c_code
(
print
(
self
.
scalar_op
.
c_code
(
Apply
(
self
.
scalar_op
,
Apply
(
self
.
scalar_op
,
[
scalar
.
Scalar
(
dtype
=
input
.
type
.
dtype
)
.
make_variable
()
[
scalar
.
Scalar
(
dtype
=
input
.
type
.
dtype
)
.
make_variable
()
for
input
in
node
.
inputs
],
for
input
in
node
.
inputs
],
[
scalar
.
Scalar
(
dtype
=
output
.
type
.
dtype
)
.
make_variable
()
[
scalar
.
Scalar
(
dtype
=
output
.
type
.
dtype
)
.
make_variable
()
for
output
in
node
.
outputs
])
for
output
in
node
.
outputs
]),
,
nodename
+
'_scalar_'
nodename
+
'_scalar_'
,
,
[
'i
%
i_data_
%
i[0]'
%
(
ipos
,
d
)
for
ipos
,
i
in
enumerate
(
node
.
inputs
)]
[
'i
%
i_data_
%
i[0]'
%
(
ipos
,
d
)
for
ipos
,
,
[
'o
%
i_data_
%
i[0]'
%
(
ipos
,
d
)
for
ipos
,
i
in
enumerate
(
node
.
outputs
)]
i
in
enumerate
(
node
.
inputs
)],
,
sub
=
dict
(
fail
=
'return;'
)),
file
=
sio
)
# TODO: set a failure code somehow!!!
[
'o
%
i_data_
%
i[0]'
%
(
ipos
,
d
)
for
ipos
,
i
in
enumerate
(
node
.
outputs
)],
sub
=
dict
(
fail
=
'return;'
)),
file
=
sio
)
# TODO: set a failure code somehow!!!
if
nd
==
4
:
if
nd
==
4
:
decl_shared_stride
(
n_in
,
n_out
,
nd
)
decl_shared_stride
(
n_in
,
n_out
,
nd
)
...
@@ -495,16 +505,19 @@ class NaiveAlgo(object):
...
@@ -495,16 +505,19 @@ class NaiveAlgo(object):
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
for
ipos
,
i
in
enumerate
(
node
.
outputs
):
print
(
"npy_
%
s o
%
d_i;"
%
(
i
.
dtype
,
ipos
),
file
=
sio
)
print
(
"npy_
%
s o
%
d_i;"
%
(
i
.
dtype
,
ipos
),
file
=
sio
)
task_code
=
self
.
scalar_op
.
c_code
(
task_code
=
self
.
scalar_op
.
c_code
(
Apply
(
self
.
scalar_op
,
Apply
(
self
.
scalar_op
,
[
scalar
.
Scalar
(
dtype
=
input
.
type
.
dtype
)
.
make_variable
()
[
scalar
.
Scalar
(
dtype
=
input
.
type
.
dtype
)
.
make_variable
()
for
input
in
node
.
inputs
],
for
input
in
node
.
inputs
],
[
scalar
.
Scalar
(
dtype
=
output
.
type
.
dtype
)
.
make_variable
()
[
scalar
.
Scalar
(
dtype
=
output
.
type
.
dtype
)
.
make_variable
()
for
output
in
node
.
outputs
])
for
output
in
node
.
outputs
]),
,
nodename
+
'_scalar_'
nodename
+
'_scalar_'
,
#, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)]
# , ['i%i_data[i]'%ipos for ipos,
,
get_str_list_logical_scalar
(
node
,
data_str
=
'i
%
i_data[i]'
)
# i in enumerate(node.inputs)]
,
[
'o
%
i_i'
%
ipos
for
ipos
,
i
in
enumerate
(
node
.
outputs
)]
get_str_list_logical_scalar
(
node
,
data_str
=
'i
%
i_data[i]'
),
,
sub
=
dict
(
fail
=
'return;'
))
# TODO: set a failure code somehow!!!
[
'o
%
i_i'
%
ipos
for
ipos
,
i
in
enumerate
(
node
.
outputs
)],
sub
=
dict
(
fail
=
'return;'
))
# TODO: set a failure code somehow!!!
print
(
" "
,
task_code
,
file
=
sio
)
print
(
" "
,
task_code
,
file
=
sio
)
for
ipos
,
_
in
enumerate
(
node
.
outputs
):
for
ipos
,
_
in
enumerate
(
node
.
outputs
):
print
(
"o
%
i_data[i] = o
%
i_i;"
%
(
ipos
,
ipos
),
file
=
sio
)
print
(
"o
%
i_data[i] = o
%
i_i;"
%
(
ipos
,
ipos
),
file
=
sio
)
...
@@ -539,18 +552,21 @@ class NaiveAlgo(object):
...
@@ -539,18 +552,21 @@ class NaiveAlgo(object):
nb_outputs
=
len
(
node
.
outputs
)
nb_outputs
=
len
(
node
.
outputs
)
d
=
dict
()
d
=
dict
()
# input_params and output_params go into the function declaration/definition
# input_params and output_params go into the function declaration/definition
input_params
=
", "
.
join
(
"const float * i
%
i_data, const int * i
%
i_str"
%
(
ipos
,
ipos
)
input_params
=
", "
.
join
(
"const float * i
%
i_data, const int * i
%
i_str"
%
(
ipos
,
ipos
)
for
ipos
in
xrange
(
len
(
node
.
inputs
)))
for
ipos
in
xrange
(
len
(
node
.
inputs
)))
output_params
=
", "
.
join
(
"float * o
%
i_data, const int * o
%
i_str"
%
(
ipos
,
ipos
)
output_params
=
", "
.
join
(
"float * o
%
i_data, const int * o
%
i_str"
%
(
ipos
,
ipos
)
for
ipos
in
xrange
(
len
(
node
.
outputs
)))
for
ipos
in
xrange
(
len
(
node
.
outputs
)))
# input_args and output_args go into the recursive call.
# input_args and output_args go into the recursive call.
input_args
=
", "
.
join
(
"i
%
i_data, i
%
i_str"
%
(
ipos
,
ipos
)
input_args
=
", "
.
join
(
"i
%
i_data, i
%
i_str"
%
(
ipos
,
ipos
)
for
ipos
in
xrange
(
len
(
node
.
inputs
)))
for
ipos
in
xrange
(
len
(
node
.
inputs
)))
output_args
=
", "
.
join
(
"o
%
i_data, o
%
i_str"
%
(
ipos
,
ipos
)
output_args
=
", "
.
join
(
"o
%
i_data, o
%
i_str"
%
(
ipos
,
ipos
)
for
ipos
in
xrange
(
len
(
node
.
outputs
)))
for
ipos
in
xrange
(
len
(
node
.
outputs
)))
prod_dims
=
'*'
.
join
([
"dims[
%
i]"
%
di
for
di
in
xrange
(
nd
)]
+
[
'1'
])
prod_dims
=
'*'
.
join
(
[
"dims[
%
i]"
%
di
for
di
in
xrange
(
nd
)]
+
[
'1'
])
scalar_op
=
self
.
scalar_op
.
__class__
.
__name__
scalar_op
=
self
.
scalar_op
.
__class__
.
__name__
...
@@ -578,20 +594,30 @@ class NaiveAlgo(object):
...
@@ -578,20 +594,30 @@ class NaiveAlgo(object):
print
(
"""
print
(
"""
std::cerr << "calling kernel_
%(scalar_op)
s_
%(nodename)
s w numEls" << numEls << " dims"<< d << "
\\
n";
std::cerr << "calling kernel_
%(scalar_op)
s_
%(nodename)
s w numEls" << numEls << " dims"<< d << "
\\
n";
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
print
(
'std::cerr << '
+
" << ' ' << "
.
join
([
'" "'
]
+
list
(
"dims[
%
i]"
%
di
print
(
for
di
in
xrange
(
nd
))
+
[
"'
\\
n';"
]),
file
=
sio
)
'std::cerr << '
+
" << ' ' << "
.
join
(
[
'" "'
]
+
list
(
"dims[
%
i]"
%
di
for
di
in
xrange
(
nd
))
+
[
"'
\\
n';"
]),
file
=
sio
)
if
self
.
verbose
>
1
:
if
self
.
verbose
>
1
:
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
istrings
=
[
"i
%
s_str[
%
i]"
%
(
ipos
,
di
)
for
di
in
xrange
(
nd
)]
ipositions
=
" << ' ' << "
.
join
(
[
"i
%
s_data"
%
ipos
]
+
istrings
)
print
(
"""
print
(
"""
std::cerr << "
%(ipos)
s data strides" <<
std::cerr << "
%(ipos)
s data strides" <<
%(ipositions)
s << "
\\
n";
"""
%
locals
()
+
" << ' ' << "
.
join
([
"i
%
s_data"
%
ipos
]
"""
%
dict
(
ipos
=
ipos
,
ipositions
=
ipositions
),
file
=
sio
)
+
list
(
"i
%
s_str[
%
i]"
%
(
ipos
,
di
)
for
di
in
xrange
(
nd
)))
+
''' << "
\\
n"; '''
,
file
=
sio
)
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
print
(
"""
print
(
"""
std::cerr << "
%(ipos)
s data strides" <<
std::cerr << "
%(ipos)
s data strides" <<
"""
%
locals
()
+
" << ' ' << "
.
join
([
"o
%
s_data"
%
ipos
]
"""
%
locals
()
+
" << ' ' << "
.
join
(
+
list
(
"o
%
s_str[
%
i]"
%
(
ipos
,
di
)
for
di
in
xrange
(
nd
)))
+
''' << "
\\
n"; '''
,
file
=
sio
)
[
"o
%
s_data"
%
ipos
]
+
list
(
"o
%
s_str[
%
i]"
%
(
ipos
,
di
)
for
di
in
xrange
(
nd
)
))
+
''' << "
\\
n"; '''
,
file
=
sio
)
# collapse dimension that are broadcast in all inputs.
# collapse dimension that are broadcast in all inputs.
# need to be done before contiguous collapse as it will break it.
# need to be done before contiguous collapse as it will break it.
# do the dimensions and the strides
# do the dimensions and the strides
...
@@ -636,11 +662,19 @@ class NaiveAlgo(object):
...
@@ -636,11 +662,19 @@ class NaiveAlgo(object):
print
(
'std::cerr << "
\\
n";'
,
file
=
sio
)
print
(
'std::cerr << "
\\
n";'
,
file
=
sio
)
if
nd
>
0
:
if
nd
>
0
:
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
print
(
'std::cerr << " local_str inputs
%(ipos)
s: " <<'
%
locals
()
+
\
print
(
' << " " << '
.
join
([
"local_str[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
'std::cerr << " local_str inputs
%(ipos)
s: " <<'
%
locals
()
+
' << " " << '
.
join
([
"local_str[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
print
(
'std::cerr << " local_ostr inputs
%(ipos)
s: " <<'
%
locals
()
+
\
print
(
' << " " << '
.
join
([
"local_ostr[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
'std::cerr << " local_ostr inputs
%(ipos)
s: " <<'
%
locals
()
+
' << " " << '
.
join
(
[
"local_ostr[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
print
(
"""
print
(
"""
for(int id=0;id<nd_collapse;id++){
for(int id=0;id<nd_collapse;id++){
...
@@ -668,35 +702,51 @@ class NaiveAlgo(object):
...
@@ -668,35 +702,51 @@ class NaiveAlgo(object):
nd_collapse--; id--;
nd_collapse--; id--;
}
}
}
}
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
if
self
.
verbose
>
2
:
if
self
.
verbose
>
2
:
print
(
'std::cerr <<"after broadcast collapse
\\
n";'
,
file
=
sio
)
print
(
'std::cerr <<"after broadcast collapse
\\
n";'
,
file
=
sio
)
print
(
'std::cerr<< "nd_collapse "<< nd_collapse << "
\\
n"; '
,
file
=
sio
)
print
(
'std::cerr<< "nd_collapse "<< nd_collapse << "
\\
n"; '
,
file
=
sio
)
print
(
'std::cerr << "local_dims";'
,
file
=
sio
)
print
(
'std::cerr << "local_dims";'
,
file
=
sio
)
for
d
in
xrange
(
nd
):
for
d
in
xrange
(
nd
):
print
(
'std::cerr << " " << local_dims[
%(d)
s]; '
%
locals
(),
file
=
sio
)
print
(
'std::cerr << " " << local_dims[
%(d)
s]; '
%
locals
(),
file
=
sio
)
print
(
'std::cerr << "
\\
n";'
,
file
=
sio
)
print
(
'std::cerr << "
\\
n";'
,
file
=
sio
)
if
nd
>
0
:
if
nd
>
0
:
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
print
(
'std::cerr << " local_str
%(ipos)
s: " <<'
%
locals
()
+
' << " " << '
.
join
([
"local_str[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
print
(
'std::cerr << " local_str
%(ipos)
s: " <<'
%
locals
()
+
' << " " << '
.
join
(
[
"local_str[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
print
(
'std::cerr << " local_ostr
%(ipos)
s: " <<'
%
locals
()
+
' << " " << '
.
join
([
"local_ostr[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
print
(
'std::cerr << " local_ostr
%(ipos)
s: " <<'
%
locals
()
+
' << " " << '
.
join
(
[
"local_ostr[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
# collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
# collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
# this is a good idea because we make less index calculation in the gpu.
# this is a good idea because we make less index calculation in the gpu.
if
nd
>
0
:
if
nd
>
0
:
print
(
"int nd_collapse_[
%(nd)
s] = {"
%
locals
()
+
','
.
join
([
'1'
for
x
in
xrange
(
nd
)])
+
"};"
,
file
=
sio
)
print
(
"int nd_collapse_[
%(nd)
s] = {"
%
locals
()
+
','
.
join
(
[
'1'
for
x
in
xrange
(
nd
)])
+
"};"
,
file
=
sio
)
else
:
else
:
print
(
"int *nd_collapse_ = NULL;"
,
file
=
sio
)
print
(
"int *nd_collapse_ = NULL;"
,
file
=
sio
)
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
if
not
_logical_scalar
(
node
.
inputs
[
ipos
]):
if
not
_logical_scalar
(
node
.
inputs
[
ipos
]):
if
nd
>
0
:
if
nd
>
0
:
print
(
"""
print
(
"""
int nd_collapse_
%(ipos)
s[
%(nd)
s] = {"""
%
locals
()
+
','
.
join
([
'1'
for
x
in
xrange
(
nd
)])
+
"};"
,
file
=
sio
)
int nd_collapse_
%(ipos)
s[
%(nd)
s] = {"""
%
locals
()
+
','
.
join
([
'1'
for
x
in
xrange
(
nd
)])
+
"};"
,
file
=
sio
)
else
:
else
:
print
(
"""
print
(
"""
int *nd_collapse_
%(ipos)
s = NULL;"""
%
locals
(),
file
=
sio
)
int * nd_collapse_
%(ipos)
s = NULL;"""
%
locals
(),
file
=
sio
)
print
(
"""
print
(
"""
can_collapse_
%(nodename)
s(nd_collapse, local_dims, local_str[
%(ipos)
s], nd_collapse_
%(ipos)
s);
can_collapse_
%(nodename)
s(nd_collapse, local_dims, local_str[
%(ipos)
s], nd_collapse_
%(ipos)
s);
for(int i=0;i<nd_collapse;i++){
for(int i=0;i<nd_collapse;i++){
...
@@ -707,8 +757,10 @@ nd_collapse_[i]=0;
...
@@ -707,8 +757,10 @@ nd_collapse_[i]=0;
if
self
.
verbose
>
1
:
if
self
.
verbose
>
1
:
print
(
"""
print
(
"""
std::cerr<< "nd_collapse_
%(ipos)
s "<<
std::cerr<< "nd_collapse_
%(ipos)
s "<<
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
print
(
' << " " << '
.
join
([
"nd_collapse_
%
s["
%
ipos
+
str
(
i
)
+
"]"
for
i
in
xrange
(
nd
)]),
file
=
sio
)
print
(
' << " " << '
.
join
([
"nd_collapse_
%
s["
%
ipos
+
str
(
i
)
+
"]"
for
i
in
xrange
(
nd
)]),
file
=
sio
)
print
(
'<< "
\\
n";'
,
file
=
sio
)
print
(
'<< "
\\
n";'
,
file
=
sio
)
# update the local stride.
# update the local stride.
...
@@ -721,7 +773,7 @@ nd_collapse_[i]=0;
...
@@ -721,7 +773,7 @@ nd_collapse_[i]=0;
local_str[
%(ipos)
s][j-1]=local_str[
%(ipos)
s][j];
local_str[
%(ipos)
s][j-1]=local_str[
%(ipos)
s][j];
}
}
}
}
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
print
(
"""
print
(
"""
...
@@ -732,7 +784,7 @@ nd_collapse_[i]=0;
...
@@ -732,7 +784,7 @@ nd_collapse_[i]=0;
local_ostr[
%(ipos)
s][j-1]=local_ostr[
%(ipos)
s][j];
local_ostr[
%(ipos)
s][j-1]=local_ostr[
%(ipos)
s][j];
}
}
}
}
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
# update the local dims.
# update the local dims.
print
(
"""
print
(
"""
...
@@ -743,16 +795,20 @@ nd_collapse_[i]=0;
...
@@ -743,16 +795,20 @@ nd_collapse_[i]=0;
local_dims[j-1]=local_dims[j];
local_dims[j-1]=local_dims[j];
}
}
}
}
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
# update the new number of dim
# update the new number of dim
print
(
"""
print
(
"""
for(int i=1, end=nd_collapse;i<end;i++){
for(int i=1, end=nd_collapse;i<end;i++){
if(nd_collapse_[i]==1)nd_collapse--;
if(nd_collapse_[i]==1)nd_collapse--;
}
}
if(nd_collapse == 1 """
%
locals
(),
file
=
sio
)
if(nd_collapse == 1 """
%
locals
(),
file
=
sio
)
l
=
[
"local_str[
%
s][nd_collapse-1]==1 "
%
ipos
for
ipos
in
xrange
(
len
(
node
.
inputs
))
if
not
_logical_scalar
(
node
.
inputs
[
ipos
])]
l
=
[
"local_str[
%
s][nd_collapse-1]==1 "
%
l
+=
[
"local_ostr[
%
s][nd_collapse-1]==1 "
%
ipos
for
ipos
in
xrange
(
len
(
node
.
outputs
))
if
not
_logical_scalar
(
node
.
outputs
[
ipos
])]
ipos
for
ipos
in
xrange
(
len
(
node
.
inputs
))
if
not
_logical_scalar
(
node
.
inputs
[
ipos
])]
l
+=
[
"local_ostr[
%
s][nd_collapse-1]==1 "
%
ipos
for
ipos
in
xrange
(
len
(
node
.
outputs
))
if
not
_logical_scalar
(
node
.
outputs
[
ipos
])]
if
len
(
l
)
>
0
:
if
len
(
l
)
>
0
:
print
(
" && "
,
" && "
.
join
(
l
),
file
=
sio
)
print
(
" && "
,
" && "
.
join
(
l
),
file
=
sio
)
print
(
"""){nd_collapse=0;} """
,
file
=
sio
)
print
(
"""){nd_collapse=0;} """
,
file
=
sio
)
...
@@ -762,20 +818,31 @@ nd_collapse_[i]=0;
...
@@ -762,20 +818,31 @@ nd_collapse_[i]=0;
print
(
"""std::cerr << "nd_collapse " << nd_collapse << "
\\
n"; """
%
locals
(),
file
=
sio
)
print
(
"""std::cerr << "nd_collapse " << nd_collapse << "
\\
n"; """
%
locals
(),
file
=
sio
)
if
self
.
verbose
>
1
:
if
self
.
verbose
>
1
:
for
d
in
xrange
(
nd
):
for
d
in
xrange
(
nd
):
print
(
'std::cerr << " " << local_dims[
%(d)
s]; '
%
locals
(),
file
=
sio
)
print
(
'std::cerr << " " << local_dims[
%(d)
s]; '
%
locals
(),
file
=
sio
)
print
(
'std::cerr << "
\\
n";'
,
file
=
sio
)
print
(
'std::cerr << "
\\
n";'
,
file
=
sio
)
if
nd
>
0
:
if
nd
>
0
:
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
print
(
'std::cerr << " local_str
%(ipos)
s: " <<'
%
locals
()
+
' << " " << '
.
join
([
"local_str[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
print
(
'std::cerr << " local_str
%
(ipos)s: " <<'
%
locals
()
+
' << " " << '
.
join
(
[
"local_str[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
print
(
'std::cerr << " local_ostr
%(ipos)
s: " <<'
%
locals
()
+
' << " " << '
.
join
([
"local_ostr[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
print
(
'std::cerr << " local_ostr
%
(ipos)s: " <<'
%
locals
()
+
' << " " << '
.
join
(
[
"local_ostr[
%
s][
%
s]"
%
(
ipos
,
x
)
for
x
in
xrange
(
nd
)])
+
'<<"
\\
n";'
,
file
=
sio
)
def
launch_Ccontiguous
(
nodename
,
scalar_op
,
sync
=
True
):
def
launch_Ccontiguous
(
nodename
,
scalar_op
,
sync
=
True
):
kernel_call_args
=
[
"numEls"
]
kernel_call_args
=
[
"numEls"
]
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
kernel_call_args
.
append
(
"i
%
i_data"
%
ipos
)
kernel_call_args
.
append
(
"i
%
i_data"
%
ipos
)
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
kernel_call_args
.
append
(
"o
%
i_data"
%
ipos
)
kernel_call_args
.
append
(
"o
%
i_data"
%
ipos
)
kernel_call_args
=
", "
.
join
(
kernel_call_args
)
kernel_call_args
=
", "
.
join
(
kernel_call_args
)
verb
=
""
verb
=
""
if
self
.
verbose
:
if
self
.
verbose
:
...
@@ -817,20 +884,27 @@ nd_collapse_[i]=0;
...
@@ -817,20 +884,27 @@ nd_collapse_[i]=0;
# kernel_call_args are used to invoke the cuda kernel
# kernel_call_args are used to invoke the cuda kernel
local
=
"local_"
local
=
"local_"
kernel_call_args
=
[
"numEls"
]
kernel_call_args
=
[
"numEls"
]
kernel_call_args
.
extend
(
local
+
"dims[
%
i]"
%
di
for
di
in
xrange
(
force_nd
))
kernel_call_args
.
extend
(
local
+
"dims[
%
i]"
%
di
for
di
in
xrange
(
force_nd
))
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
for
ipos
in
xrange
(
len
(
node
.
inputs
)):
kernel_call_args
+=
[
"i
%
i_data"
%
ipos
]
+
list
(
local
+
"str[
%
i][
%
i]"
%
(
ipos
,
di
)
for
di
in
xrange
(
force_nd
))
kernel_call_args
+=
[
"i
%
i_data"
%
ipos
]
+
list
(
#strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
local
+
"str[
%
i][
%
i]"
%
#kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
(
ipos
,
di
)
for
di
in
xrange
(
force_nd
))
# strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
# kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
for
ipos
in
xrange
(
len
(
node
.
outputs
)):
kernel_call_args
+=
[
"o
%
i_data"
%
ipos
]
+
list
(
local
+
"ostr[
%
i][
%
i]"
%
(
ipos
,
di
)
for
di
in
xrange
(
force_nd
))
kernel_call_args
+=
[
"o
%
i_data"
%
ipos
]
+
list
(
#strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
local
+
"ostr[
%
i][
%
i]"
%
#kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
(
ipos
,
di
)
for
di
in
xrange
(
force_nd
))
# strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
# kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
if
self
.
verbose
:
if
self
.
verbose
:
print
(
"""
print
(
"""
std::cerr << " Running general version with
%(force_nd)
s dims
\\
n";
std::cerr << " Running general version with
%(force_nd)
s dims
\\
n";
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
print
(
"std::cerr << "
+
' << " " << '
.
join
(
kernel_call_args
)
+
' << "
\\
n";'
,
file
=
sio
)
print
(
"std::cerr << "
+
' << " " << '
.
join
(
kernel_call_args
)
+
' << "
\\
n";'
,
file
=
sio
)
# std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;
# std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;
kernel_call_args
=
", "
.
join
(
kernel_call_args
)
kernel_call_args
=
", "
.
join
(
kernel_call_args
)
...
@@ -866,12 +940,13 @@ nd_collapse_[i]=0;
...
@@ -866,12 +940,13 @@ nd_collapse_[i]=0;
else
:
else
:
print
(
" return 0; "
%
locals
(),
file
=
sio
)
print
(
" return 0; "
%
locals
(),
file
=
sio
)
print
(
"if(numEls==0) return 0;"
,
file
=
sio
)
print
(
"if(numEls==0) return 0;"
,
file
=
sio
)
print
(
"switch (nd_collapse==0?0:min(
%(nd)
s,nd_collapse)) {"
%
locals
(),
file
=
sio
)
print
(
"switch (nd_collapse==0?0:min(
%(nd)
s,nd_collapse)) {"
%
locals
(),
file
=
sio
)
print
(
"case 0: {"
,
file
=
sio
)
print
(
"case 0: {"
,
file
=
sio
)
launch_Ccontiguous
(
nodename
,
scalar_op
,
self
.
sync
)
launch_Ccontiguous
(
nodename
,
scalar_op
,
self
.
sync
)
print
(
" } break;"
,
file
=
sio
)
print
(
" } break;"
,
file
=
sio
)
for
i
in
xrange
(
1
,
nd
+
1
):
for
i
in
xrange
(
1
,
nd
+
1
):
print
(
"case "
+
str
(
i
)
+
": {"
,
file
=
sio
)
print
(
"case "
+
str
(
i
)
+
": {"
,
file
=
sio
)
launch_General
(
nodename
,
scalar_op
,
i
,
self
.
sync
)
launch_General
(
nodename
,
scalar_op
,
i
,
self
.
sync
)
print
(
" } break;"
,
file
=
sio
)
print
(
" } break;"
,
file
=
sio
)
...
@@ -889,9 +964,10 @@ nd_collapse_[i]=0;
...
@@ -889,9 +964,10 @@ nd_collapse_[i]=0;
#define INTMOD_POW2(a, b) (a & ((1<<b)-1))
#define INTMOD_POW2(a, b) (a & ((1<<b)-1))
"""
"""
kernels
=
""
.
join
(
kernels
=
""
.
join
(
[
self
.
c_src_kernel
(
node
,
nodename
,
x
)
for
x
in
xrange
(
1
,
nd
+
1
)]
[
self
.
c_src_kernel
(
node
,
nodename
,
x
)
+
[
self
.
c_src_kernel_Ccontiguous
(
node
,
nodename
)]
for
x
in
xrange
(
1
,
nd
+
1
)]
+
+
[
self
.
c_src_callkernel
(
node
,
nodename
)])
[
self
.
c_src_kernel_Ccontiguous
(
node
,
nodename
)]
+
[
self
.
c_src_callkernel
(
node
,
nodename
)])
return
defines
+
kernels
return
defines
+
kernels
def
c_support_code
(
self
):
def
c_support_code
(
self
):
...
...
theano/sandbox/cuda/fftconv.py
浏览文件 @
b69ad54d
...
@@ -5,9 +5,9 @@ import numpy as np
...
@@ -5,9 +5,9 @@ import numpy as np
import
theano
import
theano
import
theano.tensor
as
T
import
theano.tensor
as
T
from
theano.misc.pycuda_init
import
pycuda_available
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.ifelse
import
ifelse
from
theano.ifelse
import
ifelse
from
theano.misc.pycuda_init
import
pycuda_available
if
cuda_available
:
if
cuda_available
:
from
theano.sandbox.cuda
import
(
basic_ops
,
CudaNdarrayType
,
from
theano.sandbox.cuda
import
(
basic_ops
,
CudaNdarrayType
,
...
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
...
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# special way because we specify explicitly here
# how much values are expected.
# how much values are expected.
if
border_mode
==
'valid'
:
if
border_mode
==
'valid'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
)]
elif
border_mode
==
'full'
:
elif
border_mode
==
'full'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
)]
else
:
else
:
raise
ValueError
(
'invalid mode'
)
raise
ValueError
(
'invalid mode'
)
...
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
...
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
output_fft_s
=
mult_and_reduce
(
input_fft_v
,
filters_fft_v
,
output_fft_s
=
mult_and_reduce
(
input_fft_v
,
filters_fft_v
,
input_shape
=
input_fft_v_shape
,
input_shape
=
input_fft_v_shape
,
filter_shape
=
filters_fft_v_shape
)
filter_shape
=
filters_fft_v_shape
)
#output_fft_s = input_fft_v
#
output_fft_s = input_fft_v
# reshape for IFFT
# reshape for IFFT
output_fft_flat
=
output_fft_s
.
reshape
((
b
*
oc
,
o0
,
o1
,
o2
//
2
+
1
,
2
))
output_fft_flat
=
output_fft_s
.
reshape
((
b
*
oc
,
o0
,
o1
,
o2
//
2
+
1
,
2
))
...
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
...
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# special way because we specify explicitly here
# how much values are expected.
# how much values are expected.
if
border_mode
==
'valid'
:
if
border_mode
==
'valid'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
),
(
f2
-
1
):(
f2
-
1
+
i2
-
f2
+
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
),
(
f2
-
1
):(
f2
-
1
+
i2
-
f2
+
1
)]
elif
border_mode
==
'full'
:
elif
border_mode
==
'full'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
),
(
f2
-
1
):(
f2
-
1
+
i2
+
f2
-
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
),
(
f2
-
1
):(
f2
-
1
+
i2
+
f2
-
1
)]
else
:
else
:
raise
ValueError
(
'invalid mode'
)
raise
ValueError
(
'invalid mode'
)
#output = output_circ[:, :, :, :, :]
#
output = output_circ[:, :, :, :, :]
# Rescale manually. This is just a factor that comes in during the
# Rescale manually. This is just a factor that comes in during the
# trip through FFT and inverse FFT.
# trip through FFT and inverse FFT.
...
...
theano/sandbox/cuda/kernel_codegen.py
浏览文件 @
b69ad54d
...
@@ -167,17 +167,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
...
@@ -167,17 +167,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
We use __i as an int variable in a loop.
We use __i as an int variable in a loop.
"""
"""
return
[
return
[
# get max of buf (trashing all but buf[0])
# get max of buf (trashing all but buf[0])
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'__syncthreads()'
,
'float row_max = '
+
buf
+
'[0]'
,
'float row_max = '
+
buf
+
'[0]'
,
'__syncthreads()'
,
'__syncthreads()'
,
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
'; __i+='
+
threadCount
+
'){'
,
threadCount
+
'){'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
'}'
,
'}'
,
'__syncthreads()'
,
'__syncthreads()'
,
inline_reduce_sum
(
N
,
buf
,
threadPos
,
threadCount
),
inline_reduce_sum
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'__syncthreads()'
,
...
@@ -186,8 +184,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
...
@@ -186,8 +184,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
# divide each exp() result by the sum to complete the job.
# divide each exp() result by the sum to complete the job.
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
'}'
,
'}'
,
'__syncthreads()'
,
'__syncthreads()'
,
]
]
...
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
...
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
init
=
manner_init
(
"
%(x)
s[
%(pos)
s *
%(stride_x)
s]"
%
locals
())
init
=
manner_init
(
"
%(x)
s[
%(pos)
s *
%(stride_x)
s]"
%
locals
())
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(x)
s[i *
%(stride_x)
s]"
%
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(x)
s[i *
%(stride_x)
s]"
%
locals
()))
locals
()))
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
"
%
s[i]"
%
buf
)
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
...
...
theano/sandbox/cuda/neighbours.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
# This is work in progress
# This is work in progress
from
theano
import
Op
,
Apply
,
tensor
from
theano
import
Apply
,
tensor
from
theano.gof
import
local_optimizer
from
theano.gof
import
local_optimizer
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
...
...
theano/sandbox/cuda/nnet.py
浏览文件 @
b69ad54d
...
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
...
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
"""
%
locals
()
"""
%
locals
()
def
c_support_code_apply
(
self
,
node
,
nodename
):
def
c_support_code_apply
(
self
,
node
,
nodename
):
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * x'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
'const int sx0'
,
body
=
[
'const int sx1'
,
"extern __shared__ float buf[]"
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf2[tx] = buf[tx]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
inline_softmax
(
'N'
,
"__syncthreads()"
,
'buf'
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
'threadIdx.x'
,
'blockDim.x'
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
# This set all value correctly
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
"__syncthreads()"
,
ret2
=
nvcc_kernel
(
"}"
,
"kSoftmax_fixed_shared
%
s"
%
nodename
,
])
ret2
=
nvcc_kernel
(
"kSoftmax_fixed_shared
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
body
=
[
"extern __shared__ float buf[]"
,
"extern __shared__ float buf[]"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
" blockIDX += gridDim.x){"
,
"const float *x_ptr = &x[blockIDX * sx0]"
,
"const float *x_ptr = &x[blockIDX * sx0]"
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
'sm_ptr'
,
'sm_s1'
,
'sm_ptr'
,
'sm_s1'
,
'threadIdx.x'
,
'blockDim.x'
),
'threadIdx.x'
,
"__syncthreads()"
,
'blockDim.x'
),
"}"
,
"__syncthreads()"
,
"}"
,
])
])
return
ret1
+
"
\n
"
+
ret2
return
ret1
+
"
\n
"
+
ret2
gpu_softmax
=
GpuSoftmax
()
gpu_softmax
=
GpuSoftmax
()
...
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
...
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * b'
,
'const int sb0'
,
'const float * b'
,
'const int sb0'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
body
=
[
"extern __shared__ float buf[]"
,
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] += b[tx * sb0]"
,
"buf[tx] += b[tx * sb0]"
,
"buf2[tx] = buf[tx]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
"__syncthreads()"
,
'threadIdx.x'
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'blockDim.x'
),
'threadIdx.x'
,
'blockDim.x'
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
"__syncthreads()"
,
"}"
,
])
ret2
=
nvcc_kernel
(
"kSoftmaxWithBias_fixed_shared
%
s"
%
nodename
,
ret2
=
nvcc_kernel
(
"kSoftmaxWithBias_fixed_shared
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const float * x'
,
...
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
...
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
'x_ptr'
,
'sx1'
,
'sm_ptr'
,
'sm_s1'
,
'sm_ptr'
,
'sm_s1'
,
'threadIdx.x'
,
'threadIdx.x'
,
'blockDim.x'
,
'blockDim.x'
,
'b'
,
'sb0'
),
'b'
,
'sb0'
),
...
...
theano/sandbox/cuda/nvcc_compiler.py
浏览文件 @
b69ad54d
...
@@ -4,7 +4,6 @@ import logging
...
@@ -4,7 +4,6 @@ import logging
import
os
import
os
import
subprocess
import
subprocess
import
sys
import
sys
import
warnings
from
locale
import
getpreferredencoding
from
locale
import
getpreferredencoding
import
numpy
import
numpy
...
@@ -249,7 +248,8 @@ class NVCC_compiler(Compiler):
...
@@ -249,7 +248,8 @@ class NVCC_compiler(Compiler):
_logger
.
debug
(
'Writing module C++ code to
%
s'
,
cppfilename
)
_logger
.
debug
(
'Writing module C++ code to
%
s'
,
cppfilename
)
cppfile
.
write
(
src_code
)
cppfile
.
write
(
src_code
)
lib_filename
=
os
.
path
.
join
(
location
,
'
%
s.
%
s'
%
lib_filename
=
os
.
path
.
join
(
location
,
'
%
s.
%
s'
%
(
module_name
,
get_lib_extension
()))
(
module_name
,
get_lib_extension
()))
_logger
.
debug
(
'Generating shared lib
%
s'
,
lib_filename
)
_logger
.
debug
(
'Generating shared lib
%
s'
,
lib_filename
)
...
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
...
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
indexof
=
cmd
.
index
(
'-u'
)
indexof
=
cmd
.
index
(
'-u'
)
cmd
.
pop
(
indexof
)
# Remove -u
cmd
.
pop
(
indexof
)
# Remove -u
cmd
.
pop
(
indexof
)
# Remove argument to -u
cmd
.
pop
(
indexof
)
# Remove argument to -u
except
ValueError
as
e
:
except
ValueError
:
done
=
True
done
=
True
# CUDA Toolkit v4.1 Known Issues:
# CUDA Toolkit v4.1 Known Issues:
...
@@ -364,6 +364,8 @@ class NVCC_compiler(Compiler):
...
@@ -364,6 +364,8 @@ class NVCC_compiler(Compiler):
console_encoding
=
getpreferredencoding
()
console_encoding
=
getpreferredencoding
()
nvcc_stdout
=
decode_with
(
nvcc_stdout_raw
,
console_encoding
)
nvcc_stdout
=
decode_with
(
nvcc_stdout_raw
,
console_encoding
)
nvcc_stderr
=
decode_with
(
nvcc_stderr_raw
,
console_encoding
)
nvcc_stderr
=
decode_with
(
nvcc_stderr_raw
,
console_encoding
)
p
=
subprocess
.
Popen
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
finally
:
finally
:
os
.
chdir
(
orig_dir
)
os
.
chdir
(
orig_dir
)
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
b69ad54d
...
@@ -10,22 +10,32 @@ import warnings
...
@@ -10,22 +10,32 @@ import warnings
import
numpy
import
numpy
from
six.moves
import
reduce
,
xrange
from
six.moves
import
reduce
,
xrange
from
.
import
dnn
import
theano
import
theano
from
theano
import
scalar
as
scal
from
theano
import
scalar
as
scal
from
theano
import
config
,
tensor
,
gof
from
theano
import
config
,
tensor
,
gof
import
theano.ifelse
import
theano.ifelse
import
theano.tensor.signal.pool
import
theano.tensor.nnet
import
theano.tensor.nnet.neighbours
# Convolution
from
theano.tensor.nnet
import
conv
from
theano.tensor.nnet.ConvGrad3D
import
ConvGrad3D
from
theano.tensor.nnet.ConvTransp3D
import
ConvTransp3D
# Pooling
import
theano.tensor.signal.pool
as
pool
from
theano.compile
import
optdb
from
theano.compile
import
optdb
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
ProxyDB
,
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
ProxyDB
,
Optimizer
,
TopoOptimizer
,
toolbox
)
Optimizer
,
TopoOptimizer
,
toolbox
)
from
theano.gof.opt
import
LocalMetaOptimizer
from
theano.gof.opt
import
LocalMetaOptimizer
from
theano.sandbox.cuda.basic_ops
import
gpu_join
,
GpuJoin
from
theano.sandbox.cuda
import
as_cuda_ndarray_variable
from
theano.sandbox.cuda
import
as_cuda_ndarray_variable
from
theano.sandbox.cuda.basic_ops
import
(
from
theano.sandbox.cuda.basic_ops
import
(
gpu_eye
,
gpu_contiguous
,
gpu_eye
,
gpu_contiguous
,
gpu_from_host
,
host_from_gpu
,
GpuFromHost
,
HostFromGpu
,
gpu_from_host
,
host_from_gpu
,
GpuFromHost
,
HostFromGpu
,
GpuContiguous
,
GpuContiguous
,
GpuElemwise
,
GpuDimShuffle
,
GpuReshape
,
GpuCAReduce
,
GpuElemwise
,
GpuDimShuffle
,
GpuReshape
,
GpuCAReduce
,
GpuFlatten
,
gpu_flatten
,
gpu_flatten
,
GpuSubtensor
,
GpuAdvancedSubtensor1
,
GpuSubtensor
,
GpuAdvancedSubtensor1
,
GpuAdvancedIncSubtensor1
,
GpuAdvancedIncSubtensor1_dev20
,
GpuAdvancedIncSubtensor1
,
GpuAdvancedIncSubtensor1_dev20
,
GpuIncSubtensor
,
gpu_alloc
,
GpuAlloc
,
gpu_shape
,
GpuSplit
,
GpuAllocEmpty
)
GpuIncSubtensor
,
gpu_alloc
,
GpuAlloc
,
gpu_shape
,
GpuSplit
,
GpuAllocEmpty
)
...
@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')(
...
@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')(
# This is a partial list of CPU ops that can be in some circonstance
# This is a partial list of CPU ops that can be in some circonstance
# moved to the GPU. This list is used by an optimization.
# moved to the GPU. This list is used by an optimization.
# Hopefully, we can keep this list up to date.
# Hopefully, we can keep this list up to date.
import
theano.tensor.signal.pool
import
theano.tensor.nnet.neighbours
cpu_ops_moved_to_gpu
=
[
cpu_ops_moved_to_gpu
=
[
tensor
.
blas
.
Dot22
,
tensor
.
blas
.
Dot22Scalar
,
tensor
.
blas
.
Gemm
,
tensor
.
blas
.
Dot22
,
tensor
.
blas
.
Dot22Scalar
,
tensor
.
blas
.
Gemm
,
tensor
.
blas
.
Gemv
,
tensor
.
blas
.
Ger
,
tensor
.
nnet
.
conv
.
ConvOp
,
tensor
.
blas
.
Gemv
,
tensor
.
blas
.
Ger
,
tensor
.
nnet
.
conv
.
ConvOp
,
...
@@ -850,8 +858,8 @@ def local_gpu_careduce(node):
...
@@ -850,8 +858,8 @@ def local_gpu_careduce(node):
if
x
.
type
==
node
.
outputs
[
0
]
.
type
:
if
x
.
type
==
node
.
outputs
[
0
]
.
type
:
return
[
x
]
return
[
x
]
elif
(
all
([
c
!=
"output"
and
isinstance
(
c
.
op
,
GpuFromHost
)
elif
(
all
([
c
!=
"output"
and
isinstance
(
c
.
op
,
GpuFromHost
)
for
c
,
i
in
node
.
outputs
[
0
]
.
clients
])
for
c
,
i
in
node
.
outputs
[
0
]
.
clients
])
and
and
x
.
owner
and
x
.
owner
.
op
.
__class__
in
x
.
owner
and
x
.
owner
.
op
.
__class__
in
cpu_ops_moved_to_gpu
):
cpu_ops_moved_to_gpu
):
# It is not always good to transfer the reduction to
# It is not always good to transfer the reduction to
# the GPU when the clients are on the GPU but not the
# the GPU when the clients are on the GPU but not the
...
@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node):
...
@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node):
return
[
gpu_flatten
(
host_input
.
owner
.
inputs
[
0
],
outdim
)(
return
[
gpu_flatten
(
host_input
.
owner
.
inputs
[
0
],
outdim
)(
as_cuda_ndarray_variable
(
host_input
.
owner
.
inputs
[
0
]))]
as_cuda_ndarray_variable
(
host_input
.
owner
.
inputs
[
0
]))]
if
isinstance
(
node
.
op
,
tensor
.
Flatten
):
if
isinstance
(
node
.
op
,
tensor
.
Flatten
):
x
,
=
node
.
inputs
x
,
shp
=
node
.
inputs
outdim
=
node
.
op
.
outdim
if
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
):
if
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
):
outdim
=
node
.
op
.
outdim
outdim
=
node
.
op
.
outdim
gpu_x
,
=
x
.
owner
.
inputs
gpu_x
,
=
x
.
owner
.
inputs
...
@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node):
...
@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node):
*
coords
)]
*
coords
)]
if
isinstance
(
node
.
op
,
tensor
.
Subtensor
):
if
isinstance
(
node
.
op
,
tensor
.
Subtensor
):
x
=
node
.
inputs
[
0
]
x
=
node
.
inputs
[
0
]
if
(
x
.
owner
and
if
(
x
.
owner
and
x
.
dtype
==
"float32"
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
x
.
dtype
==
"float32"
):
gpu_x
=
x
.
owner
.
inputs
[
0
]
gpu_x
=
x
.
owner
.
inputs
[
0
]
if
(
gpu_x
.
owner
and
if
(
gpu_x
.
owner
and
# And it is a shared var or an input of the graph.
isinstance
(
gpu_x
.
owner
.
op
,
GpuFromHost
)
and
not
(
gpu_x
.
owner
.
inputs
[
0
]
.
owner
)
and
# And it is a shared var or an input of the graph.
isinstance
(
gpu_x
.
owner
.
op
,
GpuFromHost
)):
not
gpu_x
.
owner
.
inputs
[
0
]
.
owner
):
if
len
(
x
.
clients
)
==
1
:
if
len
(
x
.
clients
)
==
1
:
if
any
([
n
==
'output'
or
isinstance
(
n
.
op
,
GpuOp
)
if
any
([
n
==
'output'
or
isinstance
(
n
.
op
,
GpuOp
)
...
@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node):
...
@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node):
'least
\'
0.6
\'
.'
,
stacklevel
=
1
)
'least
\'
0.6
\'
.'
,
stacklevel
=
1
)
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
()
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
()
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
if
(
compute_capability
<
2
or
if
(
compute_capability
<
2
or
y
.
ndim
!=
2
or
x
.
ndim
!=
2
):
x
.
ndim
!=
2
or
y
.
ndim
!=
2
):
gpu_op
=
GpuAdvancedIncSubtensor1
(
gpu_op
=
GpuAdvancedIncSubtensor1
(
set_instead_of_inc
=
set_instead_of_inc
)
set_instead_of_inc
=
set_instead_of_inc
)
...
@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node):
...
@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node):
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
()
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
()
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
if
(
compute_capability
<
2
or
if
(
compute_capability
<
2
or
y
.
ndim
!=
2
or
x
.
ndim
!=
2
):
x
.
ndim
!=
2
or
y
.
ndim
!=
2
):
gpu_op
=
GpuAdvancedIncSubtensor1
(
gpu_op
=
GpuAdvancedIncSubtensor1
(
set_instead_of_inc
=
set_instead_of_inc
)
set_instead_of_inc
=
set_instead_of_inc
)
else
:
else
:
...
@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node):
...
@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node):
# Incrementing a float32 x results in a float32
# Incrementing a float32 x results in a float32
# output even if y is float64, so we can downcast
# output even if y is float64, so we can downcast
# y to put it on GPU
# y to put it on GPU
elif
type
(
node
.
op
)
==
tensor
.
IncSubtensor
and
\
elif
(
type
(
node
.
op
)
==
tensor
.
IncSubtensor
and
node
.
inputs
[
0
]
.
dtype
==
"float32"
:
node
.
inputs
[
0
]
.
dtype
==
"float32"
)
:
x
,
y
=
node
.
inputs
[
0
:
2
]
x
,
y
=
node
.
inputs
[
0
:
2
]
assert
isinstance
(
x
.
type
,
tensor
.
TensorType
)
assert
isinstance
(
x
.
type
,
tensor
.
TensorType
)
assert
isinstance
(
y
.
type
,
tensor
.
TensorType
)
assert
isinstance
(
y
.
type
,
tensor
.
TensorType
)
...
@@ -1346,8 +1349,6 @@ def cast(x, dtype):
...
@@ -1346,8 +1349,6 @@ def cast(x, dtype):
cast_op
=
theano
.
tensor
.
Elemwise
(
scal
.
Identity
(
scal
.
specific_out
(
stype
)))
cast_op
=
theano
.
tensor
.
Elemwise
(
scal
.
Identity
(
scal
.
specific_out
(
stype
)))
return
cast_op
(
x
)
return
cast_op
(
x
)
import
theano.tensor.nnet
@register_opt
()
@register_opt
()
@local_optimizer
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
])
@local_optimizer
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
])
...
@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node):
...
@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node):
return
False
return
False
# Convolution
from
theano.tensor.nnet
import
conv
def
_gpu_conv_to_fftconv
(
node
):
def
_gpu_conv_to_fftconv
(
node
):
# shared helper function for local_conv_fft_valid and local_conv_fft_full.
# shared helper function for local_conv_fft_valid and local_conv_fft_full.
# we import conv2d_fft locally to avoid pycuda warnings
# we import conv2d_fft locally to avoid pycuda warnings
from
theano.sandbox.cuda.fftconv
import
conv2d_fft
from
theano.sandbox.cuda.fftconv
import
conv2d_fft
kwargs
=
{
'border_mode'
:
node
.
op
.
border_mode
}
kwargs
=
{
'border_mode'
:
node
.
op
.
border_mode
}
if
(
node
.
op
.
imshp
is
not
None
and
if
(
node
.
op
.
imshp
is
not
None
and
node
.
op
.
imshp
[
-
1
]
%
2
==
1
and
node
.
op
.
imshp
[
-
1
]
is
not
None
and
node
.
op
.
imshp
[
-
1
]
is
not
None
):
node
.
op
.
imshp
[
-
1
]
%
2
==
1
):
kwargs
[
'pad_last_dim'
]
=
True
kwargs
[
'pad_last_dim'
]
=
True
# If the user supplied the full nonsymbolic image_shape and
# If the user supplied the full nonsymbolic image_shape and
...
@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node):
...
@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node):
@local_optimizer
([
GpuConv
])
@local_optimizer
([
GpuConv
])
def
local_conv_fft_valid
(
node
):
def
local_conv_fft_valid
(
node
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
(
node
.
op
.
border_mode
==
'valid'
and
if
(
node
.
op
.
border_mode
==
'valid'
and
node
.
op
.
fft_opt
and
node
.
op
.
subsample
==
(
1
,
1
)
and
node
.
op
.
subsample
==
(
1
,
1
)):
node
.
op
.
fft_opt
):
return
[
_gpu_conv_to_fftconv
(
node
)]
return
[
_gpu_conv_to_fftconv
(
node
)]
return
False
return
False
...
@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node):
...
@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node):
@local_optimizer
([
GpuConv
])
@local_optimizer
([
GpuConv
])
def
local_conv_fft_full
(
node
):
def
local_conv_fft_full
(
node
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
(
node
.
op
.
border_mode
==
'full'
and
if
(
node
.
op
.
border_mode
==
'full'
and
node
.
op
.
fft_opt
and
node
.
op
.
subsample
==
(
1
,
1
)
and
node
.
op
.
subsample
==
(
1
,
1
)):
node
.
op
.
fft_opt
):
return
[
_gpu_conv_to_fftconv
(
node
)]
return
[
_gpu_conv_to_fftconv
(
node
)]
return
return
...
@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10,
...
@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10,
'conv_fft'
)
'conv_fft'
)
# cuDNN is the second, but only registered if cuDNN is available.
# cuDNN is the second, but only registered if cuDNN is available.
# It can be disabled by excluding 'conv_dnn' or 'cudnn'.
# It can be disabled by excluding 'conv_dnn' or 'cudnn'.
from
.
import
dnn
# We can't check at import if dnn is available, so we must always
# We can't check at import if dnn is available, so we must always
# register it. This do not cause problem as if it is not avail, the
# register it. This do not cause problem as if it is not avail, the
# opt will do nothing.
# opt will do nothing.
...
@@ -1708,8 +1701,7 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
...
@@ -1708,8 +1701,7 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
shapes
=
((
node
.
op
.
bsize
,)
+
node
.
op
.
imshp
,
shapes
=
((
node
.
op
.
bsize
,)
+
node
.
op
.
imshp
,
(
node
.
op
.
nkern
,
nchannels
)
+
node
.
op
.
kshp
)
(
node
.
op
.
nkern
,
nchannels
)
+
node
.
op
.
kshp
)
for
(
var
,
shape
)
in
zip
(
vars
,
shapes
):
for
(
var
,
shape
)
in
zip
(
vars
,
shapes
):
if
((
var
in
inputs
)
and
if
((
var
in
inputs
)
and
(
shape
is
not
None
)
and
(
shape
is
not
None
)
and
not
any
(
s
is
None
for
s
in
shape
)):
not
any
(
s
is
None
for
s
in
shape
)):
result
[
var
]
=
theano
.
shared
(
result
[
var
]
=
theano
.
shared
(
...
@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node):
...
@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node):
gpu_optimizer
.
register
(
"conv3d_fft"
,
local_conv3d_fft
)
gpu_optimizer
.
register
(
"conv3d_fft"
,
local_conv3d_fft
)
from
theano.tensor.nnet.ConvGrad3D
import
ConvGrad3D
@local_optimizer
([
ConvGrad3D
])
@local_optimizer
([
ConvGrad3D
])
def
local_convgrad3d_fft
(
node
):
def
local_convgrad3d_fft
(
node
):
...
@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node):
...
@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node):
gpu_optimizer
.
register
(
"convgrad3d_fft"
,
local_convgrad3d_fft
)
gpu_optimizer
.
register
(
"convgrad3d_fft"
,
local_convgrad3d_fft
)
from
theano.tensor.nnet.ConvTransp3D
import
ConvTransp3D
@local_optimizer
([
ConvTransp3D
])
@local_optimizer
([
ConvTransp3D
])
def
local_convtransp3d_fft
(
node
):
def
local_convtransp3d_fft
(
node
):
...
@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node):
...
@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node):
gpu_optimizer
.
register
(
"convtransp3d_gemm"
,
local_convtransp3d_gemm
)
gpu_optimizer
.
register
(
"convtransp3d_gemm"
,
local_convtransp3d_gemm
)
# Pooling
import
theano.tensor.signal.pool
as
pool
@register_opt
()
@register_opt
()
@local_optimizer
([
pool
.
Pool
])
@local_optimizer
([
pool
.
Pool
])
def
local_gpu_downsample_factor_max
(
node
):
def
local_gpu_downsample_factor_max
(
node
):
if
(
isinstance
(
node
.
op
,
pool
.
Pool
)
if
(
isinstance
(
node
.
op
,
pool
.
Pool
)
and
and
node
.
op
.
ds
==
node
.
op
.
st
):
node
.
op
.
ds
==
node
.
op
.
st
):
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
'mode'
)
'mode'
)
...
@@ -1917,9 +1901,7 @@ def local_gpu_downsample_factor_max(node):
...
@@ -1917,9 +1901,7 @@ def local_gpu_downsample_factor_max(node):
@register_opt
()
@register_opt
()
@local_optimizer
([
pool
.
MaxPoolGrad
])
@local_optimizer
([
pool
.
MaxPoolGrad
])
def
local_gpu_downsample_factor_max_grad
(
node
):
def
local_gpu_downsample_factor_max_grad
(
node
):
if
(
isinstance
(
node
.
op
,
pool
.
MaxPoolGrad
)
and
if
(
isinstance
(
node
.
op
,
pool
.
MaxPoolGrad
)
and
node
.
op
.
ds
==
node
.
op
.
st
):
node
.
op
.
ds
==
node
.
op
.
st
):
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
'mode'
)
'mode'
)
if
(
node
.
op
.
padding
!=
(
0
,
0
)
or
if
(
node
.
op
.
padding
!=
(
0
,
0
)
or
...
@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node):
...
@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node):
as_cuda_ndarray_variable
(
gx
)))]
as_cuda_ndarray_variable
(
gx
)))]
from
theano.sandbox.cuda.basic_ops
import
gpu_join
,
GpuJoin
@register_opt
()
@register_opt
()
@local_optimizer
([
tensor
.
Join
])
@local_optimizer
([
tensor
.
Join
])
def
local_gpu_join
(
node
):
def
local_gpu_join
(
node
):
...
@@ -2310,6 +2289,7 @@ def local_gpu_eye(node):
...
@@ -2310,6 +2289,7 @@ def local_gpu_eye(node):
if
(
host_input
.
owner
and
if
(
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
tensor
.
Eye
)
and
isinstance
(
host_input
.
owner
.
op
,
tensor
.
Eye
)
and
host_input
.
owner
.
op
.
dtype
==
"float32"
):
host_input
.
owner
.
op
.
dtype
==
"float32"
):
if
tensor
.
extract_constant
(
host_input
.
owner
.
inputs
[
2
])
!=
0
:
if
tensor
.
extract_constant
(
host_input
.
owner
.
inputs
[
2
])
!=
0
:
return
return
return
[
gpu_eye
(
*
host_input
.
owner
.
inputs
)]
return
[
gpu_eye
(
*
host_input
.
owner
.
inputs
)]
...
@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node):
...
@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node):
return
_outputs
return
_outputs
# scan(host_from_gpu) -> host_from_gpu(GPUscan)
# scan(host_from_gpu) -> host_from_gpu(GPUscan)
if
(
type
(
node
.
op
)
==
scan_op
.
Scan
if
(
type
(
node
.
op
)
==
scan_op
.
Scan
and
and
not
node
.
op
.
info
[
'gpu'
]):
not
node
.
op
.
info
[
'gpu'
]):
if
any
([(
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
))
if
any
([(
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
))
for
i
in
node
.
inputs
]):
for
i
in
node
.
inputs
]):
...
...
theano/sandbox/cuda/rng_curand.py
浏览文件 @
b69ad54d
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
__authors__
=
"James Bergstra"
__copyright__
=
"(c) 2011, University of Montreal"
__license__
=
"3-clause BSD License"
__contact__
=
"theano-dev@googlegroups.com"
import
numpy
import
numpy
import
theano.gof
import
theano.gof
from
theano.compat
import
PY3
from
theano.compat
import
PY3
...
@@ -17,6 +7,15 @@ from theano.tensor import (get_vector_length, cast, opt)
...
@@ -17,6 +7,15 @@ from theano.tensor import (get_vector_length, cast, opt)
from
theano.compile
import
optdb
from
theano.compile
import
optdb
from
theano.gof
import
local_optimizer
,
Variable
from
theano.gof
import
local_optimizer
,
Variable
__authors__
=
"James Bergstra"
__copyright__
=
"(c) 2011, University of Montreal"
__license__
=
"3-clause BSD License"
__contact__
=
"theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
config
=
theano
.
config
config
=
theano
.
config
...
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
...
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
Return a tuple of attributes that define the Op.
Return a tuple of attributes that define the Op.
"""
"""
return
(
return
(
self
.
destructive
,
self
.
destructive
,
self
.
output_type
,
self
.
output_type
,
self
.
seed
,
self
.
seed
,
)
)
...
@@ -101,8 +99,7 @@ class CURAND_Base(GpuOp):
...
@@ -101,8 +99,7 @@ class CURAND_Base(GpuOp):
v_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
v_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
if
ndim
is
None
:
if
ndim
is
None
:
ndim
=
get_vector_length
(
v_size
)
ndim
=
get_vector_length
(
v_size
)
self
=
cls
(
self
=
cls
(
output_type
=
CudaNdarrayType
((
False
,)
*
ndim
),
output_type
=
CudaNdarrayType
((
False
,)
*
ndim
),
seed
=
seed
,
seed
=
seed
,
destructive
=
False
)
destructive
=
False
)
...
@@ -386,5 +383,5 @@ def local_destructive(node):
...
@@ -386,5 +383,5 @@ def local_destructive(node):
return
new_op
.
make_node
(
*
node
.
inputs
)
.
outputs
return
new_op
.
make_node
(
*
node
.
inputs
)
.
outputs
return
False
return
False
optdb
.
register
(
'CURAND_destructive'
,
optdb
.
register
(
'CURAND_destructive'
,
opt
.
in2out
(
local_destructive
,
ignore_newtrees
=
True
),
99
,
'fast_run'
,
opt
.
in2out
(
local_destructive
,
ignore_newtrees
=
True
)
,
'inplace'
)
99
,
'fast_run'
,
'inplace'
)
theano/sandbox/cuda/tests/test_basic_ops.py
浏览文件 @
b69ad54d
...
@@ -9,19 +9,20 @@ import numpy
...
@@ -9,19 +9,20 @@ import numpy
from
six.moves
import
xrange
from
six.moves
import
xrange
import
theano
import
theano
import
theano.tensor
as
T
import
theano.tensor
as
T
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.tools
import
assert_raises
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.sandbox.cuda
as
tcn
import
theano.sandbox.cuda
as
tcn
import
theano.sandbox.cuda
as
cuda
import
theano.sandbox.cuda
as
cuda
import
theano.sandbox.cuda.basic_ops
as
B
import
theano.sandbox.cuda.basic_ops
as
B
from
theano.tensor.basic
import
_allclose
from
theano.tensor.basic
import
_allclose
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
import
theano.tensor.tests.test_basic
import
theano.tensor.tests.test_subtensor
import
theano.tensor.tests.test_sharedvar
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
@@ -75,8 +76,8 @@ def test_careduce():
...
@@ -75,8 +76,8 @@ def test_careduce():
# The following 2 cases could work if the scalar_op.c_code work with float* dtype.
# The following 2 cases could work if the scalar_op.c_code work with float* dtype.
# Currently we have this error:
# Currently we have this error:
# error: invalid operands of types 'npy_float32' and 'npy_float32' to binary 'operator&'
# error: invalid operands of types 'npy_float32' and 'npy_float32' to binary 'operator&'
#(theano.scalar.and_, tensor.elemwise.CAReduce),
#
(theano.scalar.and_, tensor.elemwise.CAReduce),
#(theano.scalar.or_, tensor.elemwise.CAReduce),
#
(theano.scalar.or_, tensor.elemwise.CAReduce),
]:
]:
for
shape
,
pattern
in
[((
1
,
1
),
(
1
,)),
for
shape
,
pattern
in
[((
1
,
1
),
(
1
,)),
((
1
,
0
),
(
1
,)),
((
1
,
0
),
(
1
,)),
...
@@ -113,7 +114,7 @@ def test_careduce():
...
@@ -113,7 +114,7 @@ def test_careduce():
((
4100
,
4
,
3
),
[
2
]),
((
5
,
4100
,
3
),
[
2
]),
((
5
,
4
,
4100
),
[
2
]),
# 001
((
4100
,
4
,
3
),
[
2
]),
((
5
,
4100
,
3
),
[
2
]),
((
5
,
4
,
4100
),
[
2
]),
# 001
((
4100
,
4
,
3
),
[
0
,
1
]),
((
5
,
4100
,
3
),
[
0
,
1
]),
((
5
,
4
,
4100
),
[
0
,
1
]),
# 110
((
4100
,
4
,
3
),
[
0
,
1
]),
((
5
,
4100
,
3
),
[
0
,
1
]),
((
5
,
4
,
4100
),
[
0
,
1
]),
# 110
((
4100
,
4
,
3
),
[
1
,
2
]),
((
5
,
4100
,
3
),
[
1
,
2
]),
((
5
,
4
,
4100
),
[
1
,
2
]),
# 011
((
4100
,
4
,
3
),
[
1
,
2
]),
((
5
,
4100
,
3
),
[
1
,
2
]),
((
5
,
4
,
4100
),
[
1
,
2
]),
# 011
((
4100
,
4
,
3
),[
0
,
2
]),((
5
,
4100
,
3
),[
0
,
2
]),((
5
,
4
,
4100
),[
0
,
2
]),
((
4100
,
4
,
3
),
[
0
,
2
]),
((
5
,
4100
,
3
),
[
0
,
2
]),
((
5
,
4
,
4100
),
[
0
,
2
]),
((
4100
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
4100
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
4100
),
[
0
,
1
,
2
]),
# 111
((
4100
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
4100
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
4100
),
[
0
,
1
,
2
]),
# 111
((
65
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
65
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
65
),
[
0
,
1
,
2
]),
# 111
((
65
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
65
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
65
),
[
0
,
1
,
2
]),
# 111
...
@@ -127,15 +128,15 @@ def test_careduce():
...
@@ -127,15 +128,15 @@ def test_careduce():
((
4100
,
4
,
3
,
2
),
[
2
,
3
]),
((
4
,
4100
,
3
,
2
),
[
2
,
3
]),
((
4
,
3
,
4100
,
2
),
[
2
,
3
]),
((
4
,
3
,
2
,
4100
),
[
2
,
3
]),
# 0011
((
4100
,
4
,
3
,
2
),
[
2
,
3
]),
((
4
,
4100
,
3
,
2
),
[
2
,
3
]),
((
4
,
3
,
4100
,
2
),
[
2
,
3
]),
((
4
,
3
,
2
,
4100
),
[
2
,
3
]),
# 0011
((
4100
,
4
,
3
,
2
),
[
1
,
3
]),
((
4
,
4100
,
3
,
2
),
[
1
,
3
]),
((
4
,
3
,
4100
,
2
),
[
1
,
3
]),
((
4
,
3
,
2
,
4100
),
[
1
,
3
]),
# 0101
((
4100
,
4
,
3
,
2
),
[
1
,
3
]),
((
4
,
4100
,
3
,
2
),
[
1
,
3
]),
((
4
,
3
,
4100
,
2
),
[
1
,
3
]),
((
4
,
3
,
2
,
4100
),
[
1
,
3
]),
# 0101
((
4100
,
4
,
3
,
2
),
[
1
,
2
]),
((
4
,
4100
,
3
,
2
),
[
1
,
2
]),
((
4
,
3
,
4100
,
2
),
[
1
,
2
]),
((
4
,
3
,
2
,
4100
),
[
1
,
2
]),
# 0110
((
4100
,
4
,
3
,
2
),
[
1
,
2
]),
((
4
,
4100
,
3
,
2
),
[
1
,
2
]),
((
4
,
3
,
4100
,
2
),
[
1
,
2
]),
((
4
,
3
,
2
,
4100
),
[
1
,
2
]),
# 0110
((
4100
,
4
,
3
,
2
),[
0
,
3
]),((
4
,
4100
,
3
,
2
),[
0
,
3
]),((
4
,
3
,
4100
,
2
),[
0
,
3
]),((
4
,
3
,
2
,
4100
),[
0
,
3
]),
#
1001
((
4100
,
4
,
3
,
2
),
[
0
,
3
]),
((
4
,
4100
,
3
,
2
),
[
0
,
3
]),
((
4
,
3
,
4100
,
2
),
[
0
,
3
]),
((
4
,
3
,
2
,
4100
),
[
0
,
3
]),
#
1001
#
((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
#
((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
((
4100
,
4
,
3
,
2
),
[
0
,
1
]),
((
4
,
4100
,
3
,
2
),
[
0
,
1
]),
((
4
,
3
,
4100
,
2
),
[
0
,
1
]),
((
4
,
3
,
2
,
4100
),
[
0
,
1
]),
# 1100
((
4100
,
4
,
3
,
2
),
[
0
,
1
]),
((
4
,
4100
,
3
,
2
),
[
0
,
1
]),
((
4
,
3
,
4100
,
2
),
[
0
,
1
]),
((
4
,
3
,
2
,
4100
),
[
0
,
1
]),
# 1100
# reduce over 3d
# reduce over 3d
# 3d not tested: 1101, 1110, 1111
# 3d not tested: 1101, 1110, 1111
((
4100
,
4
,
3
,
2
),[
0
,
1
,
3
]),((
4
,
4100
,
3
,
2
),[
0
,
1
,
3
]),((
4
,
3
,
4100
,
2
),[
0
,
1
,
3
]),((
4
,
3
,
2
,
4100
),[
0
,
1
,
3
]),
#
1101
((
4100
,
4
,
3
,
2
),
[
0
,
1
,
3
]),
((
4
,
4100
,
3
,
2
),
[
0
,
1
,
3
]),
((
4
,
3
,
4100
,
2
),
[
0
,
1
,
3
]),
((
4
,
3
,
2
,
4100
),
[
0
,
1
,
3
]),
#
1101
((
4100
,
4
,
3
,
2
),
[
0
,
1
,
2
]),
((
4
,
4100
,
3
,
2
),
[
0
,
1
,
2
]),
((
4
,
3
,
4100
,
2
),
[
0
,
1
,
2
]),
((
4
,
3
,
2
,
4100
),
[
0
,
1
,
2
]),
# 1110
((
4100
,
4
,
3
,
2
),
[
0
,
1
,
2
]),
((
4
,
4100
,
3
,
2
),
[
0
,
1
,
2
]),
((
4
,
3
,
4100
,
2
),
[
0
,
1
,
2
]),
((
4
,
3
,
2
,
4100
),
[
0
,
1
,
2
]),
# 1110
((
4100
,
4
,
3
,
2
),
[
0
,
2
,
3
]),
((
4
,
4100
,
3
,
2
),
[
0
,
2
,
3
]),
((
4
,
3
,
4100
,
2
),
[
0
,
2
,
3
]),
# ((4,
3,2,4100),[0,2,3]),#
1011
((
4100
,
4
,
3
,
2
),
[
0
,
2
,
3
]),
((
4
,
4100
,
3
,
2
),
[
0
,
2
,
3
]),
((
4
,
3
,
4100
,
2
),
[
0
,
2
,
3
]),
# ((4,
3, 2, 4100), [0, 2, 3]), #
1011
((
4100
,
4
,
3
,
2
),
[
1
,
2
,
3
]),
((
4
,
4100
,
3
,
2
),
[
1
,
2
,
3
]),
((
4
,
3
,
4100
,
2
),
[
1
,
2
,
3
]),
((
4
,
3
,
2
,
4100
),
[
1
,
2
,
3
]),
# 0111
((
4100
,
4
,
3
,
2
),
[
1
,
2
,
3
]),
((
4
,
4100
,
3
,
2
),
[
1
,
2
,
3
]),
((
4
,
3
,
4100
,
2
),
[
1
,
2
,
3
]),
((
4
,
3
,
2
,
4100
),
[
1
,
2
,
3
]),
# 0111
((
65
,
4
,
3
,
2
),
[
1
,
2
,
3
]),
((
4
,
65
,
3
,
2
),
[
1
,
2
,
3
]),
((
4
,
3
,
65
,
2
),
[
1
,
2
,
3
]),
((
4
,
3
,
2
,
65
),
[
1
,
2
,
3
]),
# 0111
((
65
,
4
,
3
,
2
),
[
1
,
2
,
3
]),
((
4
,
65
,
3
,
2
),
[
1
,
2
,
3
]),
((
4
,
3
,
65
,
2
),
[
1
,
2
,
3
]),
((
4
,
3
,
2
,
65
),
[
1
,
2
,
3
]),
# 0111
...
@@ -148,25 +149,25 @@ def test_careduce():
...
@@ -148,25 +149,25 @@ def test_careduce():
]:
]:
op
=
careduce_op
(
scalar_op
,
axis
=
pattern
)
op
=
careduce_op
(
scalar_op
,
axis
=
pattern
)
pat
=
tensor_pattern_to_gpu_pattern
(
shape
,
pattern
)
tensor_pattern_to_gpu_pattern
(
shape
,
pattern
)
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
b
=
op
(
a
*
a
)
b
=
op
(
a
*
a
)
val
=
numpy
.
random
.
rand
(
numpy
.
prod
(
shape
))
.
reshape
(
shape
)
val
=
numpy
.
random
.
rand
(
numpy
.
prod
(
shape
))
.
reshape
(
shape
)
# val = numpy.ones(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val
=
theano
.
_asarray
(
val
,
dtype
=
'float32'
)
val
=
theano
.
_asarray
(
val
,
dtype
=
'float32'
)
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
f2
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
assert
tcn
.
GpuCAReduce
in
[
x
.
op
.
__class__
assert
tcn
.
GpuCAReduce
in
[
for
x
in
f
.
maker
.
fgraph
.
toposort
()],
(
x
.
op
.
__class__
for
x
in
f
.
maker
.
fgraph
.
toposort
()],
(
scalar_op
,
shape
,
pattern
)
scalar_op
,
shape
,
pattern
)
if
tcn
.
GpuElemwise
in
[
x
.
op
.
__class__
if
(
tcn
.
GpuElemwise
in
[
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
:
x
.
op
.
__class__
for
x
in
f
.
maker
.
fgraph
.
toposort
()])
:
assert
tcn
.
GpuReshape
in
[
x
.
op
.
__class__
assert
tcn
.
GpuReshape
in
[
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
x
.
op
.
__class__
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
assert
op
.
__class__
in
[
x
.
op
.
__class__
assert
op
.
__class__
in
[
for
x
in
f2
.
maker
.
fgraph
.
toposort
()],
(
x
.
op
.
__class__
for
x
in
f2
.
maker
.
fgraph
.
toposort
()],
(
scalar_op
,
shape
,
pattern
)
scalar_op
,
shape
,
pattern
)
f_caused_value_error
=
False
f_caused_value_error
=
False
try
:
try
:
...
@@ -176,7 +177,8 @@ def test_careduce():
...
@@ -176,7 +177,8 @@ def test_careduce():
f_caused_value_error
=
True
f_caused_value_error
=
True
except
NotImplementedError
:
except
NotImplementedError
:
if
(
numpy
.
prod
(
shape
)
==
0
and
if
(
numpy
.
prod
(
shape
)
==
0
and
getattr
(
scalar_op
,
'identity'
,
None
)
!=
0
):
getattr
(
scalar_op
,
'identity'
,
None
)
!=
0
):
continue
continue
raise
raise
...
@@ -208,9 +210,11 @@ def test_careduce():
...
@@ -208,9 +210,11 @@ def test_careduce():
# example in debug mode with unittests.rseed=9275
# example in debug mode with unittests.rseed=9275
orig_rtol
=
theano
.
tensor
.
basic
.
float32_rtol
orig_rtol
=
theano
.
tensor
.
basic
.
float32_rtol
theano
.
tensor
.
basic
.
float32_rtol
=
2e-5
theano
.
tensor
.
basic
.
float32_rtol
=
2e-5
assert
_allclose
(
f_out
,
f2_out
),
(
'shape'
,
shape
,
assert
_allclose
(
f_out
,
f2_out
),
(
'pattern'
,
pattern
,
'shape'
,
scalar_op
,
shape
,
'pattern'
,
pattern
,
scalar_op
,
sum
([
shape
[
i
]
for
i
in
pattern
]),
sum
([
shape
[
i
]
for
i
in
pattern
]),
f2
(
val
),
f
(
val
),
val
)
f2
(
val
),
f
(
val
),
val
)
finally
:
finally
:
...
@@ -218,34 +222,36 @@ def test_careduce():
...
@@ -218,34 +222,36 @@ def test_careduce():
# test with dimshuffle
# test with dimshuffle
# we shuffle the 2 outer dims.
# we shuffle the 2 outer dims.
for
shape
,
pattern
in
[
# ((5,),[0]),
# for shape, pattern in [((5,), [0]),
((
5
,
4
),
[
0
,
1
]),
((
5
,
4
),
[
0
]),
for
shape
,
pattern
in
[((
5
,
4
),
[
0
,
1
]),
((
5
,
4
),
[
0
]),
((
5
,
4
,
3
),
[
0
]),
((
5
,
4
,
3
),
[
0
,
1
]),
((
5
,
4
,
3
),
[
2
]),
((
5
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
3
),
[
0
]),
((
5
,
4
,
3
),
[
0
,
1
]),
((
5
,
4
,
3
,
2
),
[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
0
,
2
,
3
]),
((
5
,
4
,
3
),
[
2
]),
((
5
,
4
,
3
),
[
0
,
1
,
2
]),
((
128
,
1
,
3
,
3
),
[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
0
,
1
,
2
,
3
]),
]:
((
5
,
4
,
3
,
2
),
[
0
,
2
,
3
]),
((
128
,
1
,
3
,
3
),
[
0
,
1
,
2
,
3
]),
]:
op
=
careduce_op
(
scalar_op
,
axis
=
pattern
)
op
=
careduce_op
(
scalar_op
,
axis
=
pattern
)
pat
=
tensor_pattern_to_gpu_pattern
(
shape
,
pattern
)
tensor_pattern_to_gpu_pattern
(
shape
,
pattern
)
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
dim_pattern
=
list
(
range
(
len
(
shape
)))
dim_pattern
=
list
(
range
(
len
(
shape
)))
dim_pattern
[
0
]
=
1
dim_pattern
[
0
]
=
1
dim_pattern
[
1
]
=
0
dim_pattern
[
1
]
=
0
a
=
a
.
dimshuffle
(
dim_pattern
)
a
=
a
.
dimshuffle
(
dim_pattern
)
b
=
op
(
a
*
a
)
b
=
op
(
a
*
a
)
val
=
numpy
.
random
.
rand
(
numpy
.
prod
(
shape
))
.
reshape
(
shape
)
val
=
numpy
.
random
.
rand
(
numpy
.
prod
(
shape
))
.
reshape
(
shape
)
# val = numpy.ones(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val
=
theano
.
_asarray
(
val
,
dtype
=
'float32'
)
val
=
theano
.
_asarray
(
val
,
dtype
=
'float32'
)
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
f2
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
assert
tcn
.
GpuCAReduce
in
[
x
.
op
.
__class__
assert
tcn
.
GpuCAReduce
in
[
for
x
in
f
.
maker
.
fgraph
.
toposort
()],
(
x
.
op
.
__class__
for
x
in
f
.
maker
.
fgraph
.
toposort
()],
(
scalar_op
,
shape
,
pattern
)
scalar_op
,
shape
,
pattern
)
assert
tcn
.
GpuElemwise
not
in
[
x
.
op
.
__class__
assert
tcn
.
GpuElemwise
not
in
[
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
x
.
op
.
__class__
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
assert
op
.
__class__
in
[
x
.
op
.
__class__
assert
op
.
__class__
in
[
for
x
in
f2
.
maker
.
fgraph
.
toposort
()],
(
x
.
op
.
__class__
for
x
in
f2
.
maker
.
fgraph
.
toposort
()],
(
scalar_op
,
shape
,
pattern
)
scalar_op
,
shape
,
pattern
)
assert
_allclose
(
f2
(
val
),
f
(
val
)),
(
'shape'
,
shape
,
assert
_allclose
(
f2
(
val
),
f
(
val
)),
(
'shape'
,
shape
,
'pattern'
,
pattern
,
'pattern'
,
pattern
,
...
@@ -258,16 +264,15 @@ def test_careduce():
...
@@ -258,16 +264,15 @@ def test_careduce():
((
5
,
4
,
3
),
[
0
]),
((
5
,
4
,
3
),
[
0
,
1
]),
((
5
,
4
,
3
),
[
0
]),
((
5
,
4
,
3
),
[
0
,
1
]),
((
5
,
4
,
3
),
[
2
]),
((
5
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
3
),
[
2
]),
((
5
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
3
,
2
),
[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
0
,
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
0
,
2
,
3
]),
((
128
,
1
,
3
,
3
),
[
0
,
1
,
2
,
3
]),
((
128
,
1
,
3
,
3
),
[
0
,
1
,
2
,
3
]),
]:
]:
op
=
careduce_op
(
scalar_op
,
axis
=
pattern
)
op
=
careduce_op
(
scalar_op
,
axis
=
pattern
)
pat
=
tensor_pattern_to_gpu_pattern
(
shape
,
pattern
)
tensor_pattern_to_gpu_pattern
(
shape
,
pattern
)
shape
=
numpy
.
asarray
(
shape
)
*
2
shape
=
numpy
.
asarray
(
shape
)
*
2
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
a2
=
tcn
.
CudaNdarrayType
((
False
,)
*
len
(
shape
))()
a2
=
tcn
.
CudaNdarrayType
((
False
,)
*
len
(
shape
))()
b
=
op
(
a
*
a
)
b
=
op
(
a
*
a
)
b2
=
op
(
a2
*
a2
)
b2
=
op
(
a2
*
a2
)
val
=
numpy
.
random
.
rand
(
numpy
.
prod
(
shape
))
.
reshape
(
shape
)
val
=
numpy
.
random
.
rand
(
numpy
.
prod
(
shape
))
.
reshape
(
shape
)
# val = numpy.ones(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
...
@@ -287,8 +292,8 @@ def test_careduce():
...
@@ -287,8 +292,8 @@ def test_careduce():
val2
=
val2
[::
2
,
::
2
,
::
2
,
::
2
]
val2
=
val2
[::
2
,
::
2
,
::
2
,
::
2
]
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
f2
=
theano
.
function
([
a2
],
b2
,
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
a2
],
b2
,
mode
=
mode_with_gpu
)
assert
tcn
.
GpuCAReduce
in
[
x
.
op
.
__class__
assert
tcn
.
GpuCAReduce
in
[
for
x
in
f2
.
maker
.
fgraph
.
toposort
()],
(
x
.
op
.
__class__
for
x
in
f2
.
maker
.
fgraph
.
toposort
()],
(
scalar_op
,
shape
,
pattern
)
scalar_op
,
shape
,
pattern
)
assert
tcn
.
GpuElemwise
not
in
[
x
.
op
.
__class__
assert
tcn
.
GpuElemwise
not
in
[
x
.
op
.
__class__
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
...
@@ -374,8 +379,10 @@ def test_reshape():
...
@@ -374,8 +379,10 @@ def test_reshape():
# Test zero dimensions are allowed
# Test zero dimensions are allowed
x
=
T
.
vector
(
'x'
)
x
=
T
.
vector
(
'x'
)
f_reshp
=
theano
.
function
([
x
],
x
.
reshape
((
0
,
100
)),
mode
=
mode_with_gpu
)
f_reshp
=
theano
.
function
(
assert
f_reshp
(
numpy
.
ndarray
((
0
,),
dtype
=
'float32'
))
.
shape
==
(
0
,
100
)
[
x
],
x
.
reshape
((
0
,
100
)),
mode
=
mode_with_gpu
)
assert
f_reshp
(
numpy
.
ndarray
((
0
,
),
dtype
=
'float32'
))
.
shape
==
(
0
,
100
)
def
test_alloc_empty
():
def
test_alloc_empty
():
...
@@ -406,7 +413,7 @@ def test_elemwise_empty():
...
@@ -406,7 +413,7 @@ def test_elemwise_empty():
b
=
tensor
.
fmatrix
()
b
=
tensor
.
fmatrix
()
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_with_gpu
)
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_with_gpu
)
f2
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_without_gpu
)
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_without_gpu
)
a0
=
a
.
get_value
()
*
1.0
a0
=
a
.
get_value
()
*
1.0
f
(
numpy
.
ones
((
0
,
0
),
dtype
=
'float32'
))
f
(
numpy
.
ones
((
0
,
0
),
dtype
=
'float32'
))
...
@@ -424,8 +431,9 @@ def test_elemwise0():
...
@@ -424,8 +431,9 @@ def test_elemwise0():
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_with_gpu
)
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_with_gpu
)
# check that we work inplace.
# check that we work inplace.
assert
(
list
(
f
.
maker
.
fgraph
.
toposort
()[
1
]
.
op
.
destroy_map
.
items
())
assert
(
list
(
==
[(
0
,
[
0
])])
f
.
maker
.
fgraph
.
toposort
()[
1
]
.
op
.
destroy_map
.
items
())
==
[
(
0
,
[
0
])])
a0
=
a
.
get_value
()
*
1.0
a0
=
a
.
get_value
()
*
1.0
f
(
numpy
.
ones
((
4
,
4
),
dtype
=
'float32'
))
f
(
numpy
.
ones
((
4
,
4
),
dtype
=
'float32'
))
...
@@ -495,7 +503,8 @@ def test_elemwise2():
...
@@ -495,7 +503,8 @@ def test_elemwise2():
dtype
=
'float32'
),
'a'
)
dtype
=
'float32'
),
'a'
)
b
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
[
0
]
*
len
(
shape
))()
b
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
[
0
]
*
len
(
shape
))()
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
(
a
+
b
)
.
dimshuffle
([
2
,
0
,
3
,
1
])
*
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
(
a
+
b
)
.
dimshuffle
([
2
,
0
,
3
,
1
])
*
tensor
.
exp
(
b
**
a
)
.
dimshuffle
([
2
,
0
,
3
,
1
]))],
mode
=
mode_with_gpu
)
tensor
.
exp
(
b
**
a
)
.
dimshuffle
([
2
,
0
,
3
,
1
]))],
mode
=
mode_with_gpu
)
has_elemwise
=
False
has_elemwise
=
False
for
i
,
node
in
enumerate
(
f
.
maker
.
fgraph
.
toposort
()):
for
i
,
node
in
enumerate
(
f
.
maker
.
fgraph
.
toposort
()):
has_elemwise
=
has_elemwise
or
isinstance
(
node
.
op
,
tensor
.
Elemwise
)
has_elemwise
=
has_elemwise
or
isinstance
(
node
.
op
,
tensor
.
Elemwise
)
...
@@ -585,10 +594,11 @@ def test_elemwise_composite_float64():
...
@@ -585,10 +594,11 @@ def test_elemwise_composite_float64():
return
l
return
l
for
mode
in
[
mode_with_gpu
,
mode_with_gpu
.
excluding
(
'gpu_after_fusion'
),
for
mode
in
[
mode_with_gpu
,
mode_with_gpu
.
excluding
(
'gpu_after_fusion'
),
mode_with_gpu
.
excluding
(
'elemwise_fusion'
)]:
mode_with_gpu
.
excluding
(
'elemwise_fusion'
)]:
f
=
pfunc
([
a
,
b
],
f
=
pfunc
(
tensor
.
cast
(
tensor
.
lt
(
tensor
.
cast
(
a
,
'float64'
)
**
2
,
[
a
,
b
],
b
),
tensor
.
cast
(
'float32'
),
mode
=
mode
)
tensor
.
lt
(
tensor
.
cast
(
a
,
'float64'
)
**
2
,
b
),
'float32'
),
mode
=
mode
)
out
=
f
(
av
,
bv
)
out
=
f
(
av
,
bv
)
assert
numpy
.
all
(
out
==
((
av
**
2
)
<
bv
))
assert
numpy
.
all
(
out
==
((
av
**
2
)
<
bv
))
...
@@ -648,11 +658,11 @@ def speed_elemwise_collapse():
...
@@ -648,11 +658,11 @@ def speed_elemwise_collapse():
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
v
=
v
[:,
::
2
,
:,
:]
v
=
v
[:,
::
2
,
:,
:]
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
t
1
=
t
ime
.
time
()
time
.
time
()
for
i
in
range
(
100
):
for
i
in
range
(
100
):
# let debugmode catch errors
# let debugmode catch errors
f
(
v
)
f
(
v
)
t
2
=
t
ime
.
time
()
time
.
time
()
def
speed_elemwise_collapse2
():
def
speed_elemwise_collapse2
():
...
@@ -672,11 +682,11 @@ def speed_elemwise_collapse2():
...
@@ -672,11 +682,11 @@ def speed_elemwise_collapse2():
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
v
=
v
[:,
:,
:,
::
2
]
v
=
v
[:,
:,
:,
::
2
]
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
t
1
=
t
ime
.
time
()
time
.
time
()
for
i
in
range
(
100
):
for
i
in
range
(
100
):
# let debugmode catch errors
# let debugmode catch errors
f
(
v
)
f
(
v
)
t
2
=
t
ime
.
time
()
time
.
time
()
def
test_elemwise_collapse
():
def
test_elemwise_collapse
():
...
@@ -848,8 +858,8 @@ def test_hostfromgpu_shape_i():
...
@@ -848,8 +858,8 @@ def test_hostfromgpu_shape_i():
ca
=
theano
.
sandbox
.
cuda
.
var
.
CudaNdarrayType
((
False
,
False
))()
ca
=
theano
.
sandbox
.
cuda
.
var
.
CudaNdarrayType
((
False
,
False
))()
av
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
5
,
4
),
dtype
=
'float32'
)
av
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
5
,
4
),
dtype
=
'float32'
)
cv
=
cuda
.
CudaNdarray
(
numpy
.
asarray
(
numpy
.
random
.
rand
(
5
,
4
),
cv
=
cuda
.
CudaNdarray
(
numpy
.
asarray
(
dtype
=
'float32'
))
numpy
.
random
.
rand
(
5
,
4
),
dtype
=
'float32'
))
f
=
theano
.
function
([
a
],
cuda
.
basic_ops
.
gpu_from_host
(
a
),
mode
=
m
)
f
=
theano
.
function
([
a
],
cuda
.
basic_ops
.
gpu_from_host
(
a
),
mode
=
m
)
assert
cuda
.
basic_ops
.
gpu_from_host
in
[
x
.
op
assert
cuda
.
basic_ops
.
gpu_from_host
in
[
x
.
op
...
@@ -880,7 +890,7 @@ def test_gpujoin_assert_cndas():
...
@@ -880,7 +890,7 @@ def test_gpujoin_assert_cndas():
a
=
theano
.
shared
(
_a
)
a
=
theano
.
shared
(
_a
)
try
:
try
:
c
=
c
uda
.
basic_ops
.
gpu_join
(
1
,
a
)
cuda
.
basic_ops
.
gpu_join
(
1
,
a
)
# can't "assert False" here, as we want the assertion
# can't "assert False" here, as we want the assertion
# error from gpu_join
# error from gpu_join
except
TypeError
:
except
TypeError
:
...
@@ -921,12 +931,17 @@ def test_gpujoin_gpualloc():
...
@@ -921,12 +931,17 @@ def test_gpujoin_gpualloc():
b
=
T
.
fmatrix
(
'b'
)
b
=
T
.
fmatrix
(
'b'
)
b_val
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
3
,
5
),
dtype
=
'float32'
)
b_val
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
3
,
5
),
dtype
=
'float32'
)
f
=
theano
.
function
([
a
,
b
],
T
.
join
(
0
,
T
.
zeros_like
(
a
),
T
.
ones_like
(
b
))
+
4
,
f
=
theano
.
function
(
[
a
,
b
],
T
.
join
(
0
,
T
.
zeros_like
(
a
),
T
.
ones_like
(
b
))
+
4
,
mode
=
mode_without_gpu
)
mode
=
mode_without_gpu
)
f_gpu
=
theano
.
function
([
a
,
b
],
T
.
join
(
0
,
T
.
zeros_like
(
a
),
T
.
ones_like
(
b
)),
f_gpu
=
theano
.
function
(
[
a
,
b
],
T
.
join
(
0
,
T
.
zeros_like
(
a
),
T
.
ones_like
(
b
)),
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
f_gpu2
=
theano
.
function
([
a
,
b
],
T
.
join
(
0
,
T
.
zeros_like
(
a
),
f_gpu2
=
theano
.
function
(
T
.
ones_like
(
b
))
+
4
,
[
a
,
b
],
T
.
join
(
0
,
T
.
zeros_like
(
a
),
T
.
ones_like
(
b
))
+
4
,
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
assert
sum
([
node
.
op
==
T
.
alloc
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
==
2
assert
sum
([
node
.
op
==
T
.
alloc
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
==
2
...
@@ -963,9 +978,6 @@ def test_gpualloc_output_to_gpu():
...
@@ -963,9 +978,6 @@ def test_gpualloc_output_to_gpu():
assert
numpy
.
allclose
(
f
(
5
),
f_gpu
(
5
))
assert
numpy
.
allclose
(
f
(
5
),
f_gpu
(
5
))
import
theano.tensor.tests.test_basic
class
TestAlloc
(
theano
.
tensor
.
tests
.
test_basic
.
TestAlloc
):
class
TestAlloc
(
theano
.
tensor
.
tests
.
test_basic
.
TestAlloc
):
dtype
=
"float32"
dtype
=
"float32"
mode
=
mode_with_gpu
mode
=
mode_with_gpu
...
@@ -987,7 +999,6 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
...
@@ -987,7 +999,6 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
self
.
shared
=
cuda
.
shared_constructor
self
.
shared
=
cuda
.
shared_constructor
import
theano.tensor.tests.test_subtensor
# This is to don't duplicate test.
# This is to don't duplicate test.
...
@@ -1035,7 +1046,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
...
@@ -1035,7 +1046,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
((
3
,
10
,
68000
),
[
1
,
2
],
True
),
((
3
,
10
,
68000
),
[
1
,
2
],
True
),
((
3
,
69000
,
11
),
[
1
,
2
],
True
),
((
3
,
69000
,
11
),
[
1
,
2
],
True
),
# much memory, will be disabled if needed
# much memory, will be disabled if needed
((
2
*
10e7
,),
[
-
1
,
199999999
],
True
),
((
2
*
10e7
,),
[
-
1
,
199999999
],
True
),
((
4
,
5
),
[
2
,
3
],
True
),
((
4
,
5
),
[
2
,
3
],
True
),
((
4
,
2
,
3
),
[
0
,
3
],
True
),
((
4
,
2
,
3
),
[
0
,
3
],
True
),
((
4
,
2
,
3
),
[
3
,
3
,
1
,
1
,
2
,
((
4
,
2
,
3
),
[
3
,
3
,
1
,
1
,
2
,
...
@@ -1047,8 +1058,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
...
@@ -1047,8 +1058,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
# optimized for that case.
# optimized for that case.
((
4
,
4
,
2
,
3
),
[
3
,
3
,
1
,
1
,
2
,
2
,
0
,
0
,
((
4
,
4
,
2
,
3
),
[
3
,
3
,
1
,
1
,
2
,
2
,
0
,
0
,
-
1
,
-
2
,
-
3
,
-
4
],
False
),
-
1
,
-
2
,
-
3
,
-
4
],
False
),
((
1
,
10
),
[
0
,
0
],
True
),
((
1
,
10
),
[
0
,
0
],
True
),
]:
]:
# If there is not enough memory on the GPU, skip the test
# If there is not enough memory on the GPU, skip the test
size_needed
=
numpy
.
prod
(
shape
)
*
(
4
+
1
)
size_needed
=
numpy
.
prod
(
shape
)
*
(
4
+
1
)
if
isinstance
(
theano
.
compile
.
get_default_mode
(),
if
isinstance
(
theano
.
compile
.
get_default_mode
(),
...
@@ -1106,13 +1116,14 @@ def test_advinc_subtensor1():
...
@@ -1106,13 +1116,14 @@ def test_advinc_subtensor1():
rep
[[
0
,
2
]]
+=
yval
rep
[[
0
,
2
]]
+=
yval
utt
.
assert_allclose
(
rval
,
rep
)
utt
.
assert_allclose
(
rval
,
rep
)
def
test_advset_subtensor1
():
def
test_advset_subtensor1
():
""" Test GPU version of set_subtensor on vectors (uses GpuAdvancedIncSubtensor1) """
""" Test GPU version of set_subtensor on vectors (uses GpuAdvancedIncSubtensor1) """
shp
=
(
10
,)
shp
=
(
10
,)
shared
=
cuda
.
shared_constructor
shared
=
cuda
.
shared_constructor
xval
=
numpy
.
arange
(
shp
[
0
],
dtype
=
'float32'
)
.
reshape
(
shp
)
+
1
xval
=
numpy
.
arange
(
shp
[
0
],
dtype
=
'float32'
)
.
reshape
(
shp
)
+
1
idxs
=
numpy
.
array
([
0
,
2
,
5
,
7
,
3
],
dtype
=
'int32'
)
idxs
=
numpy
.
array
([
0
,
2
,
5
,
7
,
3
],
dtype
=
'int32'
)
yval
=
numpy
.
ones
(
len
(
idxs
),
dtype
=
'float32'
)
*
10
yval
=
numpy
.
ones
(
len
(
idxs
),
dtype
=
'float32'
)
*
10
x
=
shared
(
xval
,
name
=
'x'
)
x
=
shared
(
xval
,
name
=
'x'
)
y
=
T
.
tensor
(
dtype
=
'float32'
,
broadcastable
=
(
False
,)
*
len
(
shp
),
name
=
'y'
)
y
=
T
.
tensor
(
dtype
=
'float32'
,
broadcastable
=
(
False
,)
*
len
(
shp
),
name
=
'y'
)
expr
=
T
.
advanced_set_subtensor1
(
x
,
y
,
idxs
)
expr
=
T
.
advanced_set_subtensor1
(
x
,
y
,
idxs
)
...
@@ -1124,13 +1135,14 @@ def test_advset_subtensor1():
...
@@ -1124,13 +1135,14 @@ def test_advset_subtensor1():
rep
[
idxs
]
=
yval
rep
[
idxs
]
=
yval
utt
.
assert_allclose
(
rval
,
rep
)
utt
.
assert_allclose
(
rval
,
rep
)
def
test_advset_subtensor1_2d
():
def
test_advset_subtensor1_2d
():
""" Test GPU version of set_subtensor on matrices (uses GpuAdvancedIncSubtensor1_dev20 if compute capability >= 2.0) """
""" Test GPU version of set_subtensor on matrices (uses GpuAdvancedIncSubtensor1_dev20 if compute capability >= 2.0) """
shp
=
(
10
,
5
)
shp
=
(
10
,
5
)
shared
=
cuda
.
shared_constructor
shared
=
cuda
.
shared_constructor
xval
=
numpy
.
arange
(
numpy
.
prod
(
shp
),
dtype
=
'float32'
)
.
reshape
(
shp
)
+
1
xval
=
numpy
.
arange
(
numpy
.
prod
(
shp
),
dtype
=
'float32'
)
.
reshape
(
shp
)
+
1
idxs
=
numpy
.
array
([
0
,
2
,
5
,
7
,
3
],
dtype
=
'int32'
)
idxs
=
numpy
.
array
([
0
,
2
,
5
,
7
,
3
],
dtype
=
'int32'
)
yval
=
numpy
.
ones
((
len
(
idxs
),
shp
[
1
]),
dtype
=
'float32'
)
*
10
yval
=
numpy
.
ones
((
len
(
idxs
),
shp
[
1
]),
dtype
=
'float32'
)
*
10
x
=
shared
(
xval
,
name
=
'x'
)
x
=
shared
(
xval
,
name
=
'x'
)
y
=
T
.
tensor
(
dtype
=
'float32'
,
broadcastable
=
(
False
,)
*
len
(
shp
),
name
=
'y'
)
y
=
T
.
tensor
(
dtype
=
'float32'
,
broadcastable
=
(
False
,)
*
len
(
shp
),
name
=
'y'
)
expr
=
T
.
advanced_set_subtensor1
(
x
,
y
,
idxs
)
expr
=
T
.
advanced_set_subtensor1
(
x
,
y
,
idxs
)
...
@@ -1142,37 +1154,38 @@ def test_advset_subtensor1_2d():
...
@@ -1142,37 +1154,38 @@ def test_advset_subtensor1_2d():
rep
[
idxs
]
=
yval
rep
[
idxs
]
=
yval
utt
.
assert_allclose
(
rval
,
rep
)
utt
.
assert_allclose
(
rval
,
rep
)
def
test_inc_subtensor
():
def
test_inc_subtensor
():
shared
=
cuda
.
shared_constructor
cuda
.
shared_constructor
#shared = tensor.shared
#
shared = tensor.shared
x
,
y
=
T
.
fmatrices
(
'x'
,
'y'
)
x
,
y
=
T
.
fmatrices
(
'x'
,
'y'
)
xval
=
numpy
.
asarray
(
[[
1
,
2
,
3
],
[
4
,
5
,
6
],
[
7
,
8
,
9
]],
xval
=
numpy
.
asarray
(
dtype
=
'float32'
)
[[
1
,
2
,
3
],
[
4
,
5
,
6
],
[
7
,
8
,
9
]],
dtype
=
'float32'
)
yval
=
numpy
.
asarray
(
[[
10
,
10
,
10
],
[
10
,
10
,
10
],
[
10
,
10
,
10
]],
yval
=
numpy
.
asarray
(
dtype
=
'float32'
)
[[
10
,
10
,
10
],
[
10
,
10
,
10
],
[
10
,
10
,
10
]],
dtype
=
'float32'
)
expr
=
T
.
inc_subtensor
(
x
[:,
1
:
3
],
y
[:,
1
:
3
])
expr
=
T
.
inc_subtensor
(
x
[:,
1
:
3
],
y
[:,
1
:
3
])
f
=
theano
.
function
([
x
,
y
],
expr
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
x
,
y
],
expr
,
mode
=
mode_with_gpu
)
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuIncSubtensor
)
and
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuIncSubtensor
)
and
node
.
op
.
set_instead_of_inc
==
False
node
.
op
.
set_instead_of_inc
is
False
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
==
1
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
==
1
utt
.
assert_allclose
(
f
(
xval
,
yval
),
[[
1.
,
12.
,
13.
],
utt
.
assert_allclose
(
f
(
xval
,
yval
),
[[
1.
,
12.
,
13.
],
[
4.
,
15.
,
16.
],
[
7.
,
18.
,
19.
]])
[
4.
,
15.
,
16.
],
[
7.
,
18.
,
19.
]])
def
test_set_subtensor
():
def
test_set_subtensor
():
shared
=
cuda
.
shared_constructor
cuda
.
shared_constructor
#shared = tensor.shared
#
shared = tensor.shared
x
,
y
=
T
.
fmatrices
(
'x'
,
'y'
)
x
,
y
=
T
.
fmatrices
(
'x'
,
'y'
)
xval
=
numpy
.
asarray
(
[[
1
,
2
,
3
],
[
4
,
5
,
6
],
[
7
,
8
,
9
]],
xval
=
numpy
.
asarray
(
dtype
=
'float32'
)
[[
1
,
2
,
3
],
[
4
,
5
,
6
],
[
7
,
8
,
9
]],
dtype
=
'float32'
)
yval
=
numpy
.
asarray
(
[[
10
,
10
,
10
],
[
10
,
10
,
10
],
[
10
,
10
,
10
]],
yval
=
numpy
.
asarray
(
dtype
=
'float32'
)
[[
10
,
10
,
10
],
[
10
,
10
,
10
],
[
10
,
10
,
10
]],
dtype
=
'float32'
)
expr
=
T
.
set_subtensor
(
x
[:,
1
:
3
],
y
[:,
1
:
3
])
expr
=
T
.
set_subtensor
(
x
[:,
1
:
3
],
y
[:,
1
:
3
])
f
=
theano
.
function
([
x
,
y
],
expr
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
x
,
y
],
expr
,
mode
=
mode_with_gpu
)
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuIncSubtensor
)
and
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuIncSubtensor
)
and
node
.
op
.
set_instead_of_inc
==
True
node
.
op
.
set_instead_of_inc
is
True
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
==
1
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
==
1
f
(
xval
,
yval
)
f
(
xval
,
yval
)
...
@@ -1191,7 +1204,7 @@ def test_many_arg_elemwise():
...
@@ -1191,7 +1204,7 @@ def test_many_arg_elemwise():
for
arg
in
xrange
(
0
,
num_args
)]
for
arg
in
xrange
(
0
,
num_args
)]
symb_args
=
[
theano
.
tensor
.
TensorType
(
'float32'
,
symb_args
=
[
theano
.
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
nb_dim
)()
(
False
,)
*
nb_dim
)()
for
arg
in
xrange
(
0
,
num_args
)]
for
arg
in
xrange
(
0
,
num_args
)]
outputs
=
[]
outputs
=
[]
...
@@ -1313,7 +1326,6 @@ class test_size(unittest.TestCase):
...
@@ -1313,7 +1326,6 @@ class test_size(unittest.TestCase):
assert
y
.
size
==
theano
.
function
([],
x
.
size
)()
assert
y
.
size
==
theano
.
function
([],
x
.
size
)()
import
theano.tensor.tests.test_sharedvar
# This test the case when the shared constructor view an CudaNdarray as input
# This test the case when the shared constructor view an CudaNdarray as input
test_shared_options
=
theano
.
tensor
.
tests
.
test_sharedvar
.
makeSharedTester
(
test_shared_options
=
theano
.
tensor
.
tests
.
test_sharedvar
.
makeSharedTester
(
shared_constructor_
=
tcn
.
shared_constructor
,
shared_constructor_
=
tcn
.
shared_constructor
,
...
@@ -1374,7 +1386,7 @@ def speed_reduce10():
...
@@ -1374,7 +1386,7 @@ def speed_reduce10():
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
#test_many_arg_elemwise()
#
test_many_arg_elemwise()
#test_gpujoin_assert_cndas()
#
test_gpujoin_assert_cndas()
test_advset_subtensor1
()
test_advset_subtensor1
()
test_advset_subtensor1_2d
()
test_advset_subtensor1_2d
()
theano/sandbox/cuda/tests/test_bench_loopfusion.py
浏览文件 @
b69ad54d
...
@@ -10,7 +10,7 @@ from __future__ import absolute_import, print_function, division
...
@@ -10,7 +10,7 @@ from __future__ import absolute_import, print_function, division
# so state is ignored
# so state is ignored
# since this job is not restartable, channel is also ignored
# since this job is not restartable, channel is also ignored
import
logging
,
time
,
sys
import
logging
import
numpy
import
numpy
from
six.moves
import
xrange
from
six.moves
import
xrange
...
@@ -18,8 +18,12 @@ from six.moves import xrange
...
@@ -18,8 +18,12 @@ from six.moves import xrange
import
theano
import
theano
from
theano.compile
import
shared
,
pfunc
from
theano.compile
import
shared
,
pfunc
from
theano
import
tensor
from
theano
import
tensor
from
theano.tensor.nnet
import
softplus
from
theano.tensor.nnet.nnet
import
softsign
from
theano.tensor.nnet.nnet
import
softsign
try
:
from
PIL
import
Image
except
ImportError
:
Image
=
None
# from PIL import Image
_logger
=
logging
.
getLogger
(
'theano.sandbox.cuda.tests.test_bench_loopfusion'
)
_logger
=
logging
.
getLogger
(
'theano.sandbox.cuda.tests.test_bench_loopfusion'
)
...
@@ -28,7 +32,8 @@ def _shared_uniform(rng, low, high, size, dtype, name=None):
...
@@ -28,7 +32,8 @@ def _shared_uniform(rng, low, high, size, dtype, name=None):
return
shared
(
return
shared
(
theano
.
_asarray
(
theano
.
_asarray
(
rng
.
uniform
(
low
=
low
,
high
=
high
,
size
=
size
),
rng
.
uniform
(
low
=
low
,
high
=
high
,
size
=
size
),
dtype
=
dtype
),
name
)
dtype
=
dtype
),
name
)
class
Kouh2008
(
object
):
class
Kouh2008
(
object
):
...
@@ -49,8 +54,10 @@ class Kouh2008(object):
...
@@ -49,8 +54,10 @@ class Kouh2008(object):
"""
"""
if
len
(
w_list
)
!=
len
(
x_list
):
if
len
(
w_list
)
!=
len
(
x_list
):
raise
ValueError
(
'w_list must have same len as x_list'
)
raise
ValueError
(
'w_list must have same len as x_list'
)
output
=
(
sum
(
w
*
tensor
.
pow
(
x
,
p
)
for
(
w
,
x
)
in
zip
(
w_list
,
x_list
)))
\
output
=
((
sum
(
w
*
tensor
.
pow
(
x
,
p
)
/
(
theano
.
_asarray
(
eps
,
dtype
=
k
.
type
.
dtype
)
+
k
+
tensor
.
pow
(
sum
(
tensor
.
pow
(
x
,
q
)
for
x
in
x_list
),
r
))
for
(
w
,
x
)
in
zip
(
w_list
,
x_list
)))
/
(
theano
.
_asarray
(
eps
,
dtype
=
k
.
type
.
dtype
)
+
k
+
tensor
.
pow
(
sum
(
tensor
.
pow
(
x
,
q
)
for
x
in
x_list
),
r
)))
assert
output
.
type
.
ndim
==
2
assert
output
.
type
.
ndim
==
2
self
.
__dict__
.
update
(
locals
())
self
.
__dict__
.
update
(
locals
())
...
@@ -80,9 +87,14 @@ class Kouh2008(object):
...
@@ -80,9 +87,14 @@ class Kouh2008(object):
w_sm
=
theano
.
tensor
.
nnet
.
softmax
(
w
)
w_sm
=
theano
.
tensor
.
nnet
.
softmax
(
w
)
w_list
=
[
w_sm
[:,
i
]
for
i
in
xrange
(
n_terms
)]
w_list
=
[
w_sm
[:,
i
]
for
i
in
xrange
(
n_terms
)]
w_l1
=
abs
(
w
)
.
sum
()
w_l1
=
abs
(
w
)
.
sum
()
w_l2_sqr
=
(
w
**
2
)
.
sum
()
w_l2_sqr
=
(
w
**
2
)
.
sum
()
else
:
else
:
w_list
=
[
shared_uniform
(
low
=-
2.0
/
n_terms
,
high
=
2.0
/
n_terms
,
size
=
(
n_out
,),
name
=
'w_
%
i'
%
i
)
w_list
=
[
shared_uniform
(
low
=-
2.0
/
n_terms
,
high
=
2.0
/
n_terms
,
size
=
(
n_out
,),
name
=
'w_
%
i'
%
i
)
for
i
in
xrange
(
n_terms
)]
for
i
in
xrange
(
n_terms
)]
w_l1
=
sum
(
abs
(
wi
)
.
sum
()
for
wi
in
w_list
)
w_l1
=
sum
(
abs
(
wi
)
.
sum
()
for
wi
in
w_list
)
w_l2_sqr
=
sum
((
wi
**
2
)
.
sum
()
for
wi
in
w_list
)
w_l2_sqr
=
sum
((
wi
**
2
)
.
sum
()
for
wi
in
w_list
)
...
@@ -102,18 +114,26 @@ class Kouh2008(object):
...
@@ -102,18 +114,26 @@ class Kouh2008(object):
p
=
tensor
.
nnet
.
sigmoid
(
p_unbounded
)
*
e_range_mag
+
e_range_low
p
=
tensor
.
nnet
.
sigmoid
(
p_unbounded
)
*
e_range_mag
+
e_range_low
q
=
tensor
.
nnet
.
sigmoid
(
q_unbounded
)
*
e_range_mag
+
e_range_low
q
=
tensor
.
nnet
.
sigmoid
(
q_unbounded
)
*
e_range_mag
+
e_range_low
r
=
tensor
.
nnet
.
sigmoid
(
r_unbounded
)
*
\
r
=
tensor
.
nnet
.
sigmoid
(
r_unbounded
)
*
\
theano
.
_asarray
(
1.0
/
e_range_low
-
1.0
/
e_range_high
,
dtype
=
dtype
)
\
theano
.
_asarray
(
1.0
/
e_range_low
-
1.0
/
e_range_high
,
+
theano
.
_asarray
(
1.0
/
e_range_high
,
dtype
=
dtype
)
dtype
=
dtype
)
+
\
theano
.
_asarray
(
1.0
/
e_range_high
,
dtype
=
dtype
)
k
=
softsign
(
k_unbounded
)
k
=
softsign
(
k_unbounded
)
if
use_softmax_w
:
if
use_softmax_w
:
rval
=
cls
(
w_list
,
x_list
,
p
,
q
,
r
,
k
,
rval
=
cls
(
w_list
,
x_list
,
p
,
q
,
r
,
k
,
params
=
[
p_unbounded
,
q_unbounded
,
r_unbounded
,
k_unbounded
,
w
]
+
params
,
params
=
[
p_unbounded
,
q_unbounded
,
r_unbounded
,
k_unbounded
,
w
]
+
params
,
updates
=
updates
)
updates
=
updates
)
else
:
else
:
rval
=
cls
(
w_list
,
x_list
,
p
,
q
,
r
,
k
,
rval
=
cls
(
w_list
,
x_list
,
p
,
q
,
r
,
k
,
params
=
[
p_unbounded
,
q_unbounded
,
r_unbounded
,
k_unbounded
]
+
w_list
+
params
,
params
=
[
p_unbounded
,
q_unbounded
,
r_unbounded
,
k_unbounded
]
+
w_list
+
params
,
updates
=
updates
)
updates
=
updates
)
rval
.
p_unbounded
=
p_unbounded
rval
.
p_unbounded
=
p_unbounded
rval
.
q_unbounded
=
q_unbounded
rval
.
q_unbounded
=
q_unbounded
...
@@ -126,8 +146,10 @@ class Kouh2008(object):
...
@@ -126,8 +146,10 @@ class Kouh2008(object):
return
rval
return
rval
@classmethod
@classmethod
def
new_filters_expbounds
(
cls
,
rng
,
input
,
n_in
,
n_out
,
n_terms
,
dtype
=
None
,
eps
=
1e-1
,
def
new_filters_expbounds
(
cls
,
rng
,
input
,
n_in
,
n_out
,
n_terms
,
exponent_range
=
(
1.0
,
3.0
),
filter_range
=
1.0
):
dtype
=
None
,
eps
=
1e-1
,
exponent_range
=
(
1.0
,
3.0
),
filter_range
=
1.0
):
"""Return a KouhLayer instance with random parameters
"""Return a KouhLayer instance with random parameters
The parameters are drawn on a range [typically] suitable for fine-tuning by gradient
The parameters are drawn on a range [typically] suitable for fine-tuning by gradient
...
@@ -161,18 +183,29 @@ class Kouh2008(object):
...
@@ -161,18 +183,29 @@ class Kouh2008(object):
def
shared_uniform
(
low
,
high
,
size
,
name
):
def
shared_uniform
(
low
,
high
,
size
,
name
):
return
_shared_uniform
(
rng
,
low
,
high
,
size
,
dtype
,
name
)
return
_shared_uniform
(
rng
,
low
,
high
,
size
,
dtype
,
name
)
f_list
=
[
shared_uniform
(
low
=-
2.0
/
numpy
.
sqrt
(
n_in
),
high
=
2.0
/
numpy
.
sqrt
(
n_in
),
size
=
(
n_in
,
n_out
),
name
=
'f_
%
i'
%
i
)
f_list
=
[
shared_uniform
(
low
=-
2.0
/
numpy
.
sqrt
(
n_in
),
high
=
2.0
/
numpy
.
sqrt
(
n_in
),
size
=
(
n_in
,
n_out
),
name
=
'f_
%
i'
%
i
)
for
i
in
xrange
(
n_terms
)]
for
i
in
xrange
(
n_terms
)]
b_list
=
[
shared_uniform
(
low
=
0
,
high
=.
01
,
size
=
(
n_out
,),
name
=
'b_
%
i'
%
i
)
b_list
=
[
shared_uniform
(
low
=
0
,
high
=.
01
,
size
=
(
n_out
,),
name
=
'b_
%
i'
%
i
)
for
i
in
xrange
(
n_terms
)]
for
i
in
xrange
(
n_terms
)]
#
x_list = [theano._asarray(eps, dtype=dtype)+
softplus(tensor.dot(input, f_list[i])) for i in xrange(n_terms)]
#
x_list = [theano._asarray(eps, dtype=dtype) +
softplus(tensor.dot(input, f_list[i])) for i in xrange(n_terms)]
filter_range
=
theano
.
_asarray
(
filter_range
,
dtype
=
dtype
)
filter_range
=
theano
.
_asarray
(
filter_range
,
dtype
=
dtype
)
half_filter_range
=
theano
.
_asarray
(
filter_range
/
2
,
dtype
=
dtype
)
half_filter_range
=
theano
.
_asarray
(
filter_range
/
2
,
x_list
=
[
theano
.
_asarray
(
filter_range
+
eps
,
dtype
=
dtype
)
+
half_filter_range
*
softsign
(
tensor
.
dot
(
input
,
f_list
[
i
])
+
dtype
=
dtype
)
b_list
[
i
])
for
i
in
xrange
(
n_terms
)]
x_list
=
[
theano
.
_asarray
(
filter_range
+
eps
,
dtype
=
dtype
)
+
half_filter_range
*
softsign
(
tensor
.
dot
(
input
,
f_list
[
i
])
+
b_list
[
i
])
for
i
in
xrange
(
n_terms
)]
rval
=
cls
.
new_expbounds
(
rng
,
x_list
,
n_out
,
dtype
=
dtype
,
params
=
f_list
+
b_list
,
rval
=
cls
.
new_expbounds
(
rng
,
x_list
,
n_out
,
dtype
=
dtype
,
params
=
f_list
+
b_list
,
exponent_range
=
exponent_range
)
exponent_range
=
exponent_range
)
rval
.
f_list
=
f_list
rval
.
f_list
=
f_list
rval
.
input
=
input
# add the input to the returned object
rval
.
input
=
input
# add the input to the returned object
...
@@ -183,6 +216,8 @@ class Kouh2008(object):
...
@@ -183,6 +216,8 @@ class Kouh2008(object):
def
img_from_weights
(
self
,
rows
=
None
,
cols
=
None
,
row_gap
=
1
,
col_gap
=
1
,
eps
=
1e-4
):
def
img_from_weights
(
self
,
rows
=
None
,
cols
=
None
,
row_gap
=
1
,
col_gap
=
1
,
eps
=
1e-4
):
""" Return an image that visualizes all the weights in the layer.
""" Return an image that visualizes all the weights in the layer.
"""
"""
if
Image
is
None
:
raise
ImportError
(
"No module named PIL"
)
n_in
,
n_out
=
self
.
f_list
[
0
]
.
value
.
shape
n_in
,
n_out
=
self
.
f_list
[
0
]
.
value
.
shape
...
@@ -190,10 +225,12 @@ class Kouh2008(object):
...
@@ -190,10 +225,12 @@ class Kouh2008(object):
rows
=
int
(
numpy
.
sqrt
(
n_out
))
rows
=
int
(
numpy
.
sqrt
(
n_out
))
if
cols
is
None
:
if
cols
is
None
:
cols
=
n_out
//
rows
cols
=
n_out
//
rows
if
n_out
%
rows
:
cols
+=
1
if
n_out
%
rows
:
cols
+=
1
if
rows
is
None
:
if
rows
is
None
:
rows
=
n_out
//
cols
rows
=
n_out
//
cols
if
n_out
%
cols
:
rows
+=
1
if
n_out
%
cols
:
rows
+=
1
filter_shape
=
self
.
filter_shape
filter_shape
=
self
.
filter_shape
height
=
rows
*
(
row_gap
+
filter_shape
[
0
])
-
row_gap
height
=
rows
*
(
row_gap
+
filter_shape
[
0
])
-
row_gap
...
@@ -203,34 +240,40 @@ class Kouh2008(object):
...
@@ -203,34 +240,40 @@ class Kouh2008(object):
w
=
self
.
w
.
value
w
=
self
.
w
.
value
w_col
=
0
w_col
=
0
def
pixel_range
(
x
):
def
pixel_range
(
x
):
return
255
*
(
x
-
x
.
min
())
/
(
x
.
max
()
-
x
.
min
()
+
eps
)
return
255
*
(
x
-
x
.
min
())
/
(
x
.
max
()
-
x
.
min
()
+
eps
)
for
r
in
xrange
(
rows
):
for
r
in
xrange
(
rows
):
out_r_low
=
r
*
(
row_gap
+
filter_shape
[
0
])
out_r_low
=
r
*
(
row_gap
+
filter_shape
[
0
])
out_r_high
=
out_r_low
+
filter_shape
[
0
]
out_r_high
=
out_r_low
+
filter_shape
[
0
]
for
c
in
xrange
(
cols
):
for
c
in
xrange
(
cols
):
out_c_low
=
c
*
(
col_gap
+
filter_shape
[
1
])
out_c_low
=
c
*
(
col_gap
+
filter_shape
[
1
])
out_c_high
=
out_c_low
+
filter_shape
[
1
]
out_c_high
=
out_c_low
+
filter_shape
[
1
]
out_tile
=
out_array
[
out_r_low
:
out_r_high
,
out_c_low
:
out_c_high
,
:]
out_tile
=
out_array
[
out_r_low
:
out_r_high
,
out_c_low
:
out_c_high
,
:]
if
c
%
3
==
0
:
# linear filter
if
c
%
3
==
0
:
# linear filter
if
w_col
<
w
.
shape
[
1
]:
if
w_col
<
w
.
shape
[
1
]:
out_tile
[
...
]
=
pixel_range
(
w
[:,
w_col
])
.
reshape
(
filter_shape
+
(
1
,))
out_tile
[
...
]
=
pixel_range
(
w
[:,
w_col
])
.
reshape
(
filter_shape
+
(
1
,))
w_col
+=
1
w_col
+=
1
if
c
%
3
==
1
:
# E filters
if
c
%
3
==
1
:
# E filters
if
w_col
<
w
.
shape
[
1
]:
if
w_col
<
w
.
shape
[
1
]:
# filters after the 3rd do not get rendered, but are skipped over.
# filters after the 3rd do not get rendered, but are skipped over.
# there are only 3 colour channels.
# there are only 3 colour channels.
for
i
in
xrange
(
min
(
self
.
n_E_quadratic
,
3
)):
for
i
in
xrange
(
min
(
self
.
n_E_quadratic
,
3
)):
out_tile
[:,
:,
i
]
=
pixel_range
(
w
[:,
w_col
+
i
])
.
reshape
(
filter_shape
)
out_tile
[:,
:,
i
]
=
pixel_range
(
w
[:,
w_col
+
i
])
.
reshape
(
filter_shape
)
w_col
+=
self
.
n_E_quadratic
w_col
+=
self
.
n_E_quadratic
if
c
%
3
==
2
:
# S filters
if
c
%
3
==
2
:
# S filters
if
w_col
<
w
.
shape
[
1
]:
if
w_col
<
w
.
shape
[
1
]:
# filters after the 3rd do not get rendered, but are skipped over.
# filters after the 3rd do not get rendered, but are skipped over.
# there are only 3 colour channels.
# there are only 3 colour channels.
for
i
in
xrange
(
min
(
self
.
n_S_quadratic
,
3
)):
for
i
in
xrange
(
min
(
self
.
n_S_quadratic
,
3
)):
out_tile
[:,
:,
2
-
i
]
=
pixel_range
(
w
[:,
w_col
+
i
])
.
reshape
(
filter_shape
)
out_tile
[:,
:,
2
-
i
]
=
pixel_range
(
w
[:,
w_col
+
i
])
.
reshape
(
filter_shape
)
w_col
+=
self
.
n_S_quadratic
w_col
+=
self
.
n_S_quadratic
return
Image
.
fromarray
(
out_array
,
'RGB'
)
return
Image
.
fromarray
(
out_array
,
'RGB'
)
...
@@ -264,8 +307,9 @@ class Config(object):
...
@@ -264,8 +307,9 @@ class Config(object):
ft_batchsize
=
30
ft_batchsize
=
30
ft_epoch_len
=
50000
ft_epoch_len
=
50000
ft_status_interval
=
50
# property( lambda s:s.ft_epoch_len/s.ft_batchsize)
ft_status_interval
=
50
# property(lambda s:s.ft_epoch_len/s.ft_batchsize)
ft_validation_interval
=
property
(
lambda
s
:
s
.
ft_epoch_len
/
s
.
ft_batchsize
)
ft_validation_interval
=
property
(
lambda
s
:
s
.
ft_epoch_len
/
s
.
ft_batchsize
)
ft_ntrain_limit
=
0
ft_ntrain_limit
=
0
ft_test_lag1
=
True
ft_test_lag1
=
True
...
@@ -297,7 +341,8 @@ if 0:
...
@@ -297,7 +341,8 @@ if 0:
s_lr
=
theano
.
tensor
.
fscalar
()
s_lr
=
theano
.
tensor
.
fscalar
()
if
not
debug
:
if
not
debug
:
sshape
=
(
None
,
784
)
sshape
=
(
None
,
784
)
else
:
sshape
=
(
None
,
3
)
else
:
sshape
=
(
None
,
3
)
x
=
theano
.
tensor
.
TensorType
(
dtype
=
conf
.
dtype
,
broadcastable
=
(
0
,
0
),
shape
=
sshape
)()
x
=
theano
.
tensor
.
TensorType
(
dtype
=
conf
.
dtype
,
broadcastable
=
(
0
,
0
),
shape
=
sshape
)()
y
=
theano
.
tensor
.
lvector
()
y
=
theano
.
tensor
.
lvector
()
...
@@ -315,7 +360,8 @@ if 0:
...
@@ -315,7 +360,8 @@ if 0:
print
(
layer
.
params
)
print
(
layer
.
params
)
gparams
=
theano
.
tensor
.
grad
(
cost
,
layer
.
params
)
gparams
=
theano
.
tensor
.
grad
(
cost
,
layer
.
params
)
updates
=
[(
p
,
p
-
s_lr
*
gp
)
for
p
,
gp
in
zip
(
layer
.
params
,
gparams
)]
updates
=
[
(
p
,
p
-
s_lr
*
gp
)
for
p
,
gp
in
zip
(
layer
.
params
,
gparams
)]
train_nll
=
pfunc
([
x
,
y
,
s_lr
],
[],
updates
=
updates
)
train_nll
=
pfunc
([
x
,
y
,
s_lr
],
[],
updates
=
updates
)
...
...
theano/sandbox/cuda/tests/test_blas.py
浏览文件 @
b69ad54d
...
@@ -8,31 +8,31 @@ from theano import tensor
...
@@ -8,31 +8,31 @@ from theano import tensor
from
theano.tests
import
unittest_tools
from
theano.tests
import
unittest_tools
import
numpy
import
numpy
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.sandbox.cuda
as
tcn
import
theano.sandbox.cuda
as
tcn
from
theano.tensor.signal.pool
import
(
Pool
,
PoolGrad
,
DownsampleFactorMaxGradGrad
)
import
theano.compile.mode
import
theano.compile.mode
from
theano.tensor.tests.test_blas
import
BaseGemv
,
TestBlasStrides
,
TestGer
from
theano.tensor.tests.test_blas
import
BaseGemv
,
TestBlasStrides
,
TestGer
from
theano.sandbox.cuda.blas
import
gpu_gemv_no_inplace
,
gpu_gemv_inplace
from
theano.sandbox.cuda.blas
import
gpu_gemv_no_inplace
,
gpu_gemv_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_inplace
,
gpu_ger_no_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_inplace
,
gpu_ger_no_inplace
from
theano.sandbox.cuda.blas
import
batched_dot
,
GpuBatchedDot
from
theano.sandbox.cuda.blas
import
batched_dot
,
GpuBatchedDot
from
theano.tensor.signal.pool
import
(
Pool
,
PoolGrad
,
DownsampleFactorMaxGradGrad
)
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpu'
)
'FAST_RUN'
)
.
excluding
(
'gpu'
)
else
:
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
(
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpu'
)
)
.
including
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
(
)
.
excluding
(
'gpu'
)
# The CPU tests already compare C/Py, so we only check C/GPU
# The CPU tests already compare C/Py, so we only check C/GPU
mode_with_gpu
=
copy
.
copy
(
mode_with_gpu
)
mode_with_gpu
=
copy
.
copy
(
mode_with_gpu
)
...
@@ -55,73 +55,81 @@ class TestBatchedDot(unittest_tools.InferShapeTester):
...
@@ -55,73 +55,81 @@ class TestBatchedDot(unittest_tools.InferShapeTester):
def
cmp
(
a_shp
,
b_shp
):
def
cmp
(
a_shp
,
b_shp
):
a
=
numpy
.
random
.
randn
(
*
a_shp
)
.
astype
(
numpy
.
float32
)
a
=
numpy
.
random
.
randn
(
*
a_shp
)
.
astype
(
numpy
.
float32
)
b
=
numpy
.
random
.
randn
(
*
b_shp
)
.
astype
(
numpy
.
float32
)
b
=
numpy
.
random
.
randn
(
*
b_shp
)
.
astype
(
numpy
.
float32
)
x
=
tensor
.
ftensor3
()
x
=
tensor
.
ftensor3
()
y
=
tensor
.
ftensor3
()
y
=
tensor
.
ftensor3
()
f
=
theano
.
function
([
x
,
y
],
batched_dot
(
x
,
y
),
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
x
,
y
],
batched_dot
(
x
,
y
),
mode
=
mode_with_gpu
)
z0
=
numpy
.
asarray
(
f
(
a
,
b
))
z0
=
numpy
.
asarray
(
f
(
a
,
b
))
ga
=
cuda_ndarray
.
CudaNdarray
(
a
)
ga
=
cuda_ndarray
.
CudaNdarray
(
a
)
gb
=
cuda_ndarray
.
CudaNdarray
(
b
)
gb
=
cuda_ndarray
.
CudaNdarray
(
b
)
z1
=
numpy
.
asarray
(
f
(
ga
,
gb
))
z1
=
numpy
.
asarray
(
f
(
ga
,
gb
))
z_test
=
numpy
.
sum
(
a
[:,
:,
:,
None
]
*
b
[:,
None
,
:,
:],
axis
=-
2
)
z1
=
numpy
.
asarray
(
f
(
ga
,
gb
))
z_test
=
numpy
.
sum
(
a
[:,:,:,
None
]
*
b
[:,
None
,:,:],
axis
=-
2
)
z_test
=
numpy
.
sum
(
a
[:,
:,
:,
None
]
*
b
[:,
None
,
:,
:],
axis
=-
2
)
unittest_tools
.
assert_allclose
(
z0
,
z_test
)
unittest_tools
.
assert_allclose
(
z0
,
z_test
)
unittest_tools
.
assert_allclose
(
z1
,
z_test
)
unittest_tools
.
assert_allclose
(
z1
,
z_test
)
cmp
((
5
,
4
,
3
),
(
5
,
3
,
2
))
cmp
((
5
,
4
,
3
),
(
5
,
3
,
2
))
cmp
((
5
,
3
,
3
),
(
5
,
3
,
3
))
cmp
((
5
,
3
,
3
),
(
5
,
3
,
3
))
cmp
((
5
,
2
,
6
),
(
5
,
6
,
3
))
cmp
((
5
,
2
,
6
),
(
5
,
6
,
3
))
# Test dimensions of 0
# Test dimensions of 0
cmp
((
0
,
2
,
6
),
(
0
,
6
,
3
))
cmp
((
0
,
2
,
6
),
(
0
,
6
,
3
))
cmp
((
5
,
0
,
3
),
(
5
,
3
,
2
))
cmp
((
5
,
0
,
3
),
(
5
,
3
,
2
))
cmp
((
5
,
4
,
0
),
(
5
,
0
,
2
))
cmp
((
5
,
4
,
0
),
(
5
,
0
,
2
))
cmp
((
5
,
4
,
3
),
(
5
,
3
,
0
))
cmp
((
5
,
4
,
3
),
(
5
,
3
,
0
))
cmp
((
0
,
0
,
0
),
(
0
,
0
,
0
))
cmp
((
0
,
0
,
0
),
(
0
,
0
,
0
))
# Test dimensions of 1
# Test dimensions of 1
cmp
((
1
,
2
,
6
),
(
1
,
6
,
3
))
cmp
((
1
,
2
,
6
),
(
1
,
6
,
3
))
cmp
((
5
,
1
,
3
),
(
5
,
3
,
2
))
cmp
((
5
,
1
,
3
),
(
5
,
3
,
2
))
cmp
((
5
,
4
,
1
),
(
5
,
1
,
2
))
cmp
((
5
,
4
,
1
),
(
5
,
1
,
2
))
cmp
((
5
,
4
,
3
),
(
5
,
3
,
1
))
cmp
((
5
,
4
,
3
),
(
5
,
3
,
1
))
def
test_batched_dot_errors
(
self
):
def
test_batched_dot_errors
(
self
):
def
fail
(
a_shp
,
b_shp
):
def
fail
(
a_shp
,
b_shp
):
a
=
numpy
.
random
.
randn
(
*
a_shp
)
.
astype
(
numpy
.
float32
)
a
=
numpy
.
random
.
randn
(
*
a_shp
)
.
astype
(
numpy
.
float32
)
b
=
numpy
.
random
.
randn
(
*
b_shp
)
.
astype
(
numpy
.
float32
)
b
=
numpy
.
random
.
randn
(
*
b_shp
)
.
astype
(
numpy
.
float32
)
x
=
tensor
.
ftensor3
()
x
=
tensor
.
ftensor3
()
y
=
tensor
.
ftensor3
()
y
=
tensor
.
ftensor3
()
f
=
theano
.
function
([
x
,
y
],
batched_dot
(
x
,
y
),
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
x
,
y
],
batched_dot
(
x
,
y
),
mode
=
mode_with_gpu
)
z
=
f
(
a
,
b
)
f
(
a
,
b
)
# Different batch size
# Different batch size
self
.
assertRaises
(
RuntimeError
,
fail
,
(
5
,
4
,
3
),
(
6
,
3
,
2
))
self
.
assertRaises
(
RuntimeError
,
fail
,
(
5
,
4
,
3
),
(
6
,
3
,
2
))
# Shape mismatch
# Shape mismatch
self
.
assertRaises
(
RuntimeError
,
fail
,
(
5
,
4
,
3
),
(
5
,
2
,
2
))
self
.
assertRaises
(
RuntimeError
,
fail
,
(
5
,
4
,
3
),
(
5
,
2
,
2
))
def
test_batched_dot_gradient
(
self
):
def
test_batched_dot_gradient
(
self
):
for
threshold
in
[
0
,
100
]:
unittest_tools
.
verify_grad
(
unittest_tools
.
verify_grad
(
GpuBatchedDot
(
stream_threshold
=
threshold
),
batched_dot
,
[
[
numpy
.
random
.
randn
(
5
,
7
,
2
)
.
astype
(
numpy
.
float32
),
numpy
.
random
.
randn
(
5
,
7
,
2
)
.
astype
(
numpy
.
float32
),
numpy
.
random
.
randn
(
5
,
2
,
6
)
.
astype
(
numpy
.
float32
)],
numpy
.
random
.
randn
(
5
,
2
,
6
)
.
astype
(
numpy
.
float32
)],
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
def
test_infer_shape
(
self
):
def
test_infer_shape
(
self
):
# only matrix
/
matrix is supported
# only matrix
/
matrix is supported
admat
=
tensor
.
ftensor3
()
admat
=
tensor
.
ftensor3
()
bdmat
=
tensor
.
ftensor3
()
bdmat
=
tensor
.
ftensor3
()
admat_val
=
my_rand
(
7
,
4
,
5
)
admat_val
=
my_rand
(
7
,
4
,
5
)
...
@@ -134,22 +142,21 @@ class TestBatchedDot(unittest_tools.InferShapeTester):
...
@@ -134,22 +142,21 @@ class TestBatchedDot(unittest_tools.InferShapeTester):
def
test_dot22
():
def
test_dot22
():
def
cmp
(
a_shp
,
b_shp
):
def
cmp
(
a_shp
,
b_shp
):
a0
=
my_rand
(
*
a_shp
)
a0
=
my_rand
(
*
a_shp
)
a
=
tcn
.
shared_constructor
(
a0
,
'a'
)
a
=
tcn
.
shared_constructor
(
a0
,
'a'
)
b
=
tensor
.
fmatrix
()
b
=
tensor
.
fmatrix
()
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
tensor
.
dot
(
a
,
b
))],
mode
=
mode_with_gpu
)
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
tensor
.
dot
(
a
,
b
))],
mode
=
mode_with_gpu
)
bval
=
my_rand
(
*
b_shp
)
bval
=
my_rand
(
*
b_shp
)
f
(
bval
)
f
(
bval
)
assert
numpy
.
allclose
(
numpy
.
dot
(
a0
,
bval
),
a
.
get_value
())
assert
numpy
.
allclose
(
numpy
.
dot
(
a0
,
bval
),
a
.
get_value
())
# Try with a matrix equal to a0, but with strides in both dims
# Try with a matrix equal to a0, but with strides in both dims
a
.
set_value
(
a0
)
a
.
set_value
(
a0
)
a
.
set_value
(
a
.
set_value
(
a
.
get_value
(
borrow
=
True
,
a
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
borrow
=
True
)
f
(
bval
)
f
(
bval
)
...
@@ -224,7 +231,7 @@ def test_gemm():
...
@@ -224,7 +231,7 @@ def test_gemm():
assert
any
([
node
.
op
==
tcn
.
blas
.
gpu_gemm_inplace
assert
any
([
node
.
op
==
tcn
.
blas
.
gpu_gemm_inplace
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
bval
=
my_rand
(
*
b_shp
)
bval
=
my_rand
(
*
b_shp
)
cval
=
my_rand
(
a_shp
[
0
],
b_shp
[
1
])
cval
=
my_rand
(
a_shp
[
0
],
b_shp
[
1
])
f
(
bval
,
cval
)
f
(
bval
,
cval
)
...
@@ -233,8 +240,7 @@ def test_gemm():
...
@@ -233,8 +240,7 @@ def test_gemm():
# Try with a matrix equal to a0, but with strides in both dims
# Try with a matrix equal to a0, but with strides in both dims
a
.
set_value
(
a0
)
a
.
set_value
(
a0
)
a
.
set_value
(
a
.
set_value
(
a
.
get_value
(
borrow
=
True
,
a
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
borrow
=
True
)
f
(
bval
,
cval
)
f
(
bval
,
cval
)
...
@@ -250,7 +256,7 @@ def test_gemm():
...
@@ -250,7 +256,7 @@ def test_gemm():
def
test_gemm_no_inplace
():
def
test_gemm_no_inplace
():
def
cmp
(
a_shp
,
b_shp
):
def
cmp
(
a_shp
,
b_shp
):
a0
=
my_rand
(
*
a_shp
)
a0
=
my_rand
(
*
a_shp
)
a
=
tcn
.
shared_constructor
(
a0
,
'a'
)
a
=
tcn
.
shared_constructor
(
a0
,
'a'
)
cval
=
my_rand
(
a_shp
[
0
],
b_shp
[
1
])
cval
=
my_rand
(
a_shp
[
0
],
b_shp
[
1
])
c
=
tcn
.
shared_constructor
(
cval
.
copy
(),
'c'
)
c
=
tcn
.
shared_constructor
(
cval
.
copy
(),
'c'
)
...
@@ -258,8 +264,7 @@ def test_gemm_no_inplace():
...
@@ -258,8 +264,7 @@ def test_gemm_no_inplace():
b
=
tcn
.
fmatrix
(
'b'
)
b
=
tcn
.
fmatrix
(
'b'
)
b2
=
tcn
.
fmatrix
(
'b2'
)
b2
=
tcn
.
fmatrix
(
'b2'
)
f
=
pfunc
(
f
=
pfunc
([
b
,
b2
],
[
b
,
b2
],
[
tensor
.
dot
(
a
,
b2
)
+
c
],
[
tensor
.
dot
(
a
,
b2
)
+
c
],
updates
=
[(
a
,
tensor
.
dot
(
a
,
b
)
+
c
)],
updates
=
[(
a
,
tensor
.
dot
(
a
,
b
)
+
c
)],
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
...
@@ -276,7 +281,8 @@ def test_gemm_no_inplace():
...
@@ -276,7 +281,8 @@ def test_gemm_no_inplace():
# Try with a matrix equal to a0, but with strides in both dims
# Try with a matrix equal to a0, but with strides in both dims
a
.
set_value
(
a0
)
a
.
set_value
(
a0
)
a
.
set_value
(
a
.
set_value
(
a
.
get_value
(
borrow
=
True
,
a
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
borrow
=
True
)
f
(
bval
,
bval2
)
f
(
bval
,
bval2
)
...
@@ -303,8 +309,8 @@ if 0:
...
@@ -303,8 +309,8 @@ if 0:
def
test_maxpool
():
def
test_maxpool
():
"""TODO: test the gpu version!!! """
"""TODO: test the gpu version!!! """
for
d0
,
d1
,
r_true
,
r_false
in
[(
4
,
4
,
[[[[
5
,
7
],
[
13
,
15
]]]],
[[[[
5
,
7
],
[
13
,
15
]]]]),
for
d0
,
d1
,
r_true
,
r_false
in
[(
4
,
4
,
[[[[
5
,
7
],
[
13
,
15
]]]],
[[[[
5
,
7
],
[
13
,
15
]]]]),
(
5
,
5
,
[[[[
6
,
8
],
[
16
,
18
],
[
21
,
23
]]]],
(
5
,
5
,
[[[[
6
,
8
],
[
16
,
18
],
[
21
,
23
]]]],
[[[[
6
,
8
,
9
],
[
16
,
18
,
19
],
[
21
,
23
,
24
]]]])]:
[[[[
6
,
8
,
9
],
[
16
,
18
,
19
],
[
21
,
23
,
24
]]]])]:
for
border
,
ret
in
[(
True
,
r_true
),
(
False
,
r_false
)]:
for
border
,
ret
in
[(
True
,
r_true
),
(
False
,
r_false
)]:
ret
=
numpy
.
array
(
ret
)
ret
=
numpy
.
array
(
ret
)
a
=
tcn
.
blas
.
Pool
((
2
,
2
),
border
)
a
=
tcn
.
blas
.
Pool
((
2
,
2
),
border
)
...
@@ -312,7 +318,7 @@ if 0:
...
@@ -312,7 +318,7 @@ if 0:
b
=
dmatrix4
()
b
=
dmatrix4
()
f
=
pfunc
([
b
],
[
a
(
b
)],
mode
=
mode_with_gpu
)
f
=
pfunc
([
b
],
[
a
(
b
)],
mode
=
mode_with_gpu
)
bval
=
numpy
.
arange
(
0
,
d0
*
d1
)
.
reshape
(
1
,
1
,
d0
,
d1
)
bval
=
numpy
.
arange
(
0
,
d0
*
d1
)
.
reshape
(
1
,
1
,
d0
,
d1
)
r
=
f
(
bval
)[
0
]
r
=
f
(
bval
)[
0
]
# print bval, bval.shape, border
# print bval, bval.shape, border
# print r, r.shape
# print r, r.shape
...
@@ -347,8 +353,7 @@ def test_downsample():
...
@@ -347,8 +353,7 @@ def test_downsample():
(
1
,
1
,
1025
,
10
),
(
1
,
1
,
1025
,
10
),
(
1
,
1
,
1023
,
10
),
(
1
,
1
,
1023
,
10
),
(
65536
,
1
,
10
,
10
),
(
65536
,
1
,
10
,
10
),
(
1
,
65536
,
10
,
10
),
(
1
,
65536
,
10
,
10
),
]
]
numpy
.
random
.
RandomState
(
unittest_tools
.
fetch_seed
())
.
shuffle
(
shps
)
numpy
.
random
.
RandomState
(
unittest_tools
.
fetch_seed
())
.
shuffle
(
shps
)
...
@@ -413,10 +418,11 @@ def test_downsample():
...
@@ -413,10 +418,11 @@ def test_downsample():
gg
=
pfunc
([],
ggf
,
mode
=
gpu_mode
)
gg
=
pfunc
([],
ggf
,
mode
=
gpu_mode
)
gg2
=
pfunc
([],
ggf
,
mode
=
ref_mode
)
gg2
=
pfunc
([],
ggf
,
mode
=
ref_mode
)
assert
any
([
isinstance
(
node
.
op
,
assert
any
([
isinstance
(
tcn
.
blas
.
GpuDownsampleFactorMaxGradGrad
)
node
.
op
,
tcn
.
blas
.
GpuDownsampleFactorMaxGradGrad
)
for
node
in
gg
.
maker
.
fgraph
.
toposort
()])
for
node
in
gg
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
DownsampleFactorMaxGradGrad
)
assert
any
([
isinstance
(
node
.
op
,
DownsampleFactorMaxGradGrad
)
for
node
in
gg2
.
maker
.
fgraph
.
toposort
()])
for
node
in
gg2
.
maker
.
fgraph
.
toposort
()])
assert
numpy
.
allclose
(
gg
(),
gg2
()),
shp
assert
numpy
.
allclose
(
gg
(),
gg2
()),
shp
...
@@ -434,6 +440,7 @@ class TestGpuGemv(TestCase, BaseGemv,
...
@@ -434,6 +440,7 @@ class TestGpuGemv(TestCase, BaseGemv,
gemv
=
gpu_gemv_no_inplace
gemv
=
gpu_gemv_no_inplace
gemv_inplace
=
gpu_gemv_inplace
gemv_inplace
=
gpu_gemv_inplace
# Mimic shared constructors registry
# Mimic shared constructors registry
@staticmethod
@staticmethod
def
shared
(
val
):
def
shared
(
val
):
# If we don't put shared on the GPU, we won't be able to test
# If we don't put shared on the GPU, we won't be able to test
...
@@ -531,7 +538,9 @@ class TestVectorMatrixDot(TestCase):
...
@@ -531,7 +538,9 @@ class TestVectorMatrixDot(TestCase):
gpu_f
=
theano
.
function
([],
v2
+
theano
.
dot
(
m
,
v1
),
mode
=
mode_with_gpu
)
gpu_f
=
theano
.
function
([],
v2
+
theano
.
dot
(
m
,
v1
),
mode
=
mode_with_gpu
)
# gpu_f2 is needed to test the case when the input is not on the gpu
# gpu_f2 is needed to test the case when the input is not on the gpu
# but the output is moved to the gpu.
# but the output is moved to the gpu.
gpu_f2
=
theano
.
function
([],
tcn
.
gpu_from_host
(
v2
+
theano
.
dot
(
m
,
v1
)),
gpu_f2
=
theano
.
function
(
[],
tcn
.
gpu_from_host
(
v2
+
theano
.
dot
(
m
,
v1
)),
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
# Assert they produce the same output
# Assert they produce the same output
...
@@ -556,7 +565,8 @@ class TestVectorMatrixDot(TestCase):
...
@@ -556,7 +565,8 @@ class TestVectorMatrixDot(TestCase):
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
# gpu_f2 is needed to test the case when the input is not on the gpu
# gpu_f2 is needed to test the case when the input is not on the gpu
# but the output is moved to the gpu.
# but the output is moved to the gpu.
gpu_f2
=
theano
.
function
([],
tcn
.
gpu_from_host
(
v2
+
theano
.
dot
(
v1
,
m
)),
gpu_f2
=
theano
.
function
(
[],
tcn
.
gpu_from_host
(
v2
+
theano
.
dot
(
v1
,
m
)),
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
# Assert they produce the same output
# Assert they produce the same output
...
...
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
浏览文件 @
b69ad54d
...
@@ -2,14 +2,16 @@
...
@@ -2,14 +2,16 @@
Tests for GPU convolution
Tests for GPU convolution
"""
"""
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
sys
import
time
import
time
import
unittest
import
unittest
import
traceback
import
theano
from
theano
import
tensor
from
theano.tests.unittest_tools
import
seed_rng
,
assert_allclose
from
theano.sandbox
import
cuda
import
numpy
import
numpy
from
six.moves
import
xrange
from
six.moves
import
xrange
from
theano.sandbox.cuda.dnn
import
GpuDnnConv
,
DnnBase
,
dnn_conv
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
from
nose.tools
import
assert_raises
from
nose.tools
import
assert_raises
imported_scipy_convolve2d
=
False
imported_scipy_convolve2d
=
False
...
@@ -19,16 +21,10 @@ try:
...
@@ -19,16 +21,10 @@ try:
except
ImportError
:
except
ImportError
:
pass
pass
import
theano
from
theano
import
tensor
from
theano.tests.unittest_tools
import
seed_rng
,
assert_allclose
# Skip test if cuda is not available.
# Skip test if cuda is not available.
from
theano.sandbox
import
cuda
if
cuda
.
cuda_available
is
False
:
if
cuda
.
cuda_available
==
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
from
theano.sandbox.cuda.dnn
import
GpuDnnConv
,
DnnBase
,
dnn_conv
# needed as the gpu conv don't have a perform implementation.
# needed as the gpu conv don't have a perform implementation.
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
...
@@ -106,11 +102,11 @@ def py_conv(img, kern, mode, subsample):
...
@@ -106,11 +102,11 @@ def py_conv(img, kern, mode, subsample):
if
imported_scipy_convolve2d
:
if
imported_scipy_convolve2d
:
return
py_conv_scipy
(
img
,
kern
,
mode
,
subsample
)
return
py_conv_scipy
(
img
,
kern
,
mode
,
subsample
)
elif
mode
==
'valid'
:
elif
mode
==
'valid'
:
return
py_conv_valid_numpy
(
img
,
kern
)[
:,
:,
::
subsample
[
0
],
return
py_conv_valid_numpy
(
img
,
kern
)[
::
subsample
[
1
]]
:,
:,
::
subsample
[
0
],
::
subsample
[
1
]]
elif
mode
==
'full'
:
elif
mode
==
'full'
:
return
py_conv_full_numpy
(
img
,
kern
)[
:,
:,
::
subsample
[
0
],
return
py_conv_full_numpy
(
img
,
kern
)[
::
subsample
[
1
]]
:,
:,
::
subsample
[
0
],
::
subsample
[
1
]]
else
:
else
:
raise
Exception
(
"Can't execute this kernel."
)
raise
Exception
(
"Can't execute this kernel."
)
...
@@ -129,7 +125,7 @@ def py_conv_scipy(img, kern, mode, subsample):
...
@@ -129,7 +125,7 @@ def py_conv_scipy(img, kern, mode, subsample):
for
b
in
xrange
(
out
.
shape
[
0
]):
for
b
in
xrange
(
out
.
shape
[
0
]):
for
k
in
xrange
(
out
.
shape
[
1
]):
for
k
in
xrange
(
out
.
shape
[
1
]):
for
s
in
xrange
(
img
.
shape
[
1
]):
for
s
in
xrange
(
img
.
shape
[
1
]):
#convolve2d or correlate
#
convolve2d or correlate
out
[
b
,
k
,
:,
:]
+=
convolve2d
(
img
[
b
,
s
,
:,
:],
out
[
b
,
k
,
:,
:]
+=
convolve2d
(
img
[
b
,
s
,
:,
:],
kern
[
k
,
s
,
:,
:],
kern
[
k
,
s
,
:,
:],
mode
)
mode
)
...
@@ -168,10 +164,12 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
...
@@ -168,10 +164,12 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
npy_kern
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
kshape
)
-
2
,
npy_kern
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
kshape
)
-
2
,
dtype
=
'float32'
)
dtype
=
'float32'
)
else
:
else
:
npy_img
=
theano
.
_asarray
(
numpy
.
arange
(
npy_img
=
theano
.
_asarray
(
numpy
.
prod
(
ishape
))
.
reshape
(
ishape
),
dtype
=
'float32'
)
+
1
numpy
.
arange
(
numpy
.
prod
(
ishape
))
.
reshape
(
ishape
),
npy_kern
=
-
(
theano
.
_asarray
(
numpy
.
arange
(
dtype
=
'float32'
)
+
1
numpy
.
prod
(
kshape
))
.
reshape
(
kshape
),
dtype
=
'float32'
)
+
1
)
npy_kern
=
-
(
theano
.
_asarray
(
numpy
.
arange
(
numpy
.
prod
(
kshape
))
.
reshape
(
kshape
),
dtype
=
'float32'
)
+
1
)
img
=
cuda_ndarray
.
CudaNdarray
(
npy_img
)
img
=
cuda_ndarray
.
CudaNdarray
(
npy_img
)
kern
=
cuda_ndarray
.
CudaNdarray
(
npy_kern
)
kern
=
cuda_ndarray
.
CudaNdarray
(
npy_kern
)
...
@@ -281,15 +279,15 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
...
@@ -281,15 +279,15 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
((
3
,
1
)
+
imshp
,
(
1
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
3
,
1
)
+
imshp
,
(
1
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
# nkern only
# nkern only
((
1
,
1
)
+
imshp
,
(
2
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
1
,
1
)
+
imshp
,
(
2
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch and nkern
#
batch and nkern
((
3
,
1
)
+
imshp
,
(
2
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
3
,
1
)
+
imshp
,
(
2
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch and stack
#
batch and stack
((
3
,
2
)
+
imshp
,
(
1
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
3
,
2
)
+
imshp
,
(
1
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#stack and nkern
#
stack and nkern
((
1
,
2
)
+
imshp
,
(
2
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
1
,
2
)
+
imshp
,
(
2
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch, nkern and stack
#
batch, nkern and stack
((
2
,
2
)
+
imshp
,
(
2
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
2
,
2
)
+
imshp
,
(
2
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch, nkern and stack
#
batch, nkern and stack
((
3
,
2
)
+
imshp
,
(
4
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
)
((
3
,
2
)
+
imshp
,
(
4
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
)
]
]
...
@@ -345,37 +343,37 @@ def get_valid_shapes():
...
@@ -345,37 +343,37 @@ def get_valid_shapes():
shapes
+=
[
shapes
+=
[
# other test
# other test
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize,
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image,
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image, non-square kern
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image, non-square kern,
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim,
,
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# a big one
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# a big one
,
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# MNIST LeNET layer 1
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# MNIST LeNET layer 1
,
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# layer 1 backprop to weights
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# layer 1 backprop to weights
,
((
60
,
20
,
28
,
28
),
(
10
,
20
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
))
# added a test case that fail from test_nnet.py.test_conv_nnet2
((
60
,
20
,
28
,
28
),
(
10
,
20
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
)),
# added a test case that fail from test_nnet.py.test_conv_nnet2
,
((
10
,
5
,
28
,
28
),
(
10
,
5
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
))
# test precedent but reduced that triger the error
((
10
,
5
,
28
,
28
),
(
10
,
5
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
)),
# test precedent but reduced that triger the error
# Test more than maxThreadsDim0
# Test more than maxThreadsDim0
,
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
]
]
shapes
+=
[
((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 1 layers
shapes
+=
[
((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 1 layers
,
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 2 layers
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 2 layers
,
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 bprop 1 full
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 bprop 1 full
,
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 bprop 2 valid
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#
test_lenet_28 bprop 2 valid
# ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1)), #
test_lenet_28 bprop 2 valid
,
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 1 layers
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 1 layers
,
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 2 layers
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 2 layers
,
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 full
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#
test_lenet_64 bprop 1
# ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1)), #
test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#
test_lenet_64 bprop 2
# ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1)) #
test_lenet_64 bprop 2
]
]
return
shapes
return
shapes
...
@@ -466,47 +464,46 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[],
...
@@ -466,47 +464,46 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[],
shapes
+=
[
shapes
+=
[
# other test
# other test
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image, non-square kern
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image, non-square kern
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
,
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# a big one
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# a big one
,
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# MNIST LeNET layer 1
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# MNIST LeNET layer 1
,
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# layer 1 backprop to weights
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# layer 1 backprop to weights
]
]
if
test_bigger_kernels
:
if
test_bigger_kernels
:
# Shapes where the kernel is larger than the image in some dimension
# Shapes where the kernel is larger than the image in some dimension
shapes
+=
[
shapes
+=
[
((
3
,
1
,
1
,
1
),
(
2
,
1
,
5
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
3
,
1
,
1
,
1
),
(
2
,
1
,
5
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
1
,
1
),
(
4
,
2
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
3
,
2
,
1
,
1
),
(
4
,
2
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
2
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
3
,
2
,
4
,
4
),
(
4
,
2
,
2
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
8
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
3
,
2
,
4
,
4
),
(
4
,
2
,
8
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
12
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
12
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
]
]
shapes
+=
[
shapes
+=
[((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 1 layers
# ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers
# ((60, 20, 12, 12),(30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 2 layers
# , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 bprop 1 full
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 bprop 1 full
# ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 2 valid
# , ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 1 layers
# , ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 1 layers
# ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 2 layers
# , ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 2 layers
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 full
,
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 full
# ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1)), # test_lenet_64 bprop 1
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1)), # test_lenet_64 bprop 2
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
# Test more than maxThreadsDim0
# Test more than maxThreadsDim0
,
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
1
,
1
,
44800
,
1
),
(
6
,
1
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# This caused crash
((
1
,
1
,
44800
,
1
),
(
6
,
1
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# This caused crash
]
]
verbose
=
0
verbose
=
0
...
@@ -636,7 +633,6 @@ class TestConv2DGPU(unittest.TestCase):
...
@@ -636,7 +633,6 @@ class TestConv2DGPU(unittest.TestCase):
imshp_logical
=
featshp_logical
[
1
:],
imshp_logical
=
featshp_logical
[
1
:],
kshp_logical
=
kshp
[
2
:])
kshp_logical
=
kshp
[
2
:])
def
test_invalid_input_shape
(
self
):
def
test_invalid_input_shape
(
self
):
"""
"""
Tests that when the shape gived at build time is not the same as
Tests that when the shape gived at build time is not the same as
...
@@ -659,7 +655,7 @@ class TestConv2DGPU(unittest.TestCase):
...
@@ -659,7 +655,7 @@ class TestConv2DGPU(unittest.TestCase):
for
mode
in
[
'valid'
,
'full'
]:
for
mode
in
[
'valid'
,
'full'
]:
for
shapes
in
[((
3
,
2
,
8
,
8
),
(
4
,
2
,
5
,
5
),
(
8
,
8
)),
for
shapes
in
[((
3
,
2
,
8
,
8
),
(
4
,
2
,
5
,
5
),
(
8
,
8
)),
((
3
,
2
,
8
,
8
),
(
4
,
2
,
5
,
5
),
(
5
,
8
)),
((
3
,
2
,
8
,
8
),
(
4
,
2
,
5
,
5
),
(
5
,
8
)),
#((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
#
((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
# We use only the number of columns.
# We use only the number of columns.
]:
]:
...
@@ -777,8 +773,8 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
...
@@ -777,8 +773,8 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
gpuval
=
numpy
.
array
(
f
(
gpuval
=
numpy
.
array
(
f
(
npy_img
.
transpose
(
1
,
0
,
2
,
3
),
npy_img
.
transpose
(
1
,
0
,
2
,
3
),
npy_kern
.
transpose
(
1
,
0
,
2
,
3
)[:,
:,
::
-
1
,
::
-
1
]))
.
transpose
(
npy_kern
.
transpose
(
1
,
0
,
2
,
3
)[:,
:,
::
-
1
,
::
-
1
])
1
,
0
,
2
,
3
)
)
.
transpose
(
1
,
0
,
2
,
3
)
assert_allclose
(
cpuval
,
gpuval
,
rtol
=
1e-4
)
assert_allclose
(
cpuval
,
gpuval
,
rtol
=
1e-4
)
...
@@ -892,43 +888,43 @@ def benchmark():
...
@@ -892,43 +888,43 @@ def benchmark():
shapes_valid
=
[
shapes_valid
=
[
# test_lenet_28 shape
# test_lenet_28 shape
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
# valid
,
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
1
,
60
,
28
,
28
),
(
20
,
60
,
24
,
24
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
1
,
60
,
28
,
28
),
(
20
,
60
,
24
,
24
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_32 shape
# test_lenet_32 shape
,
((
20
,
60
,
14
,
14
),
(
30
,
60
,
10
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
60
,
14
,
14
),
(
30
,
60
,
10
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
60
,
20
,
14
,
14
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
60
,
20
,
14
,
14
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
60
,
1
,
32
,
32
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
60
,
1
,
32
,
32
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
1
,
60
,
32
,
32
),
(
20
,
60
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
1
,
60
,
32
,
32
),
(
20
,
60
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_64 shape
# test_lenet_64 shape
,
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
20
,
10
,
29
,
29
),
(
30
,
10
,
23
,
23
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
10
,
29
,
29
),
(
30
,
10
,
23
,
23
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
1
,
10
,
64
,
64
),
(
20
,
10
,
58
,
58
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
1
,
10
,
64
,
64
),
(
20
,
10
,
58
,
58
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_108 shape
# test_lenet_108 shape
,
((
10
,
20
,
51
,
51
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
10
,
20
,
51
,
51
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
20
,
10
,
51
,
51
),
(
30
,
10
,
45
,
45
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
10
,
51
,
51
),
(
30
,
10
,
45
,
45
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
10
,
1
,
108
,
108
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
10
,
1
,
108
,
108
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
1
,
10
,
108
,
108
),
(
20
,
10
,
102
,
102
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
1
,
10
,
108
,
108
),
(
20
,
10
,
102
,
102
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_256 shape
# test_lenet_256 shape
,
((
2
,
20
,
124
,
124
),
(
30
,
20
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
2
,
20
,
124
,
124
),
(
30
,
20
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
20
,
2
,
124
,
124
),
(
30
,
2
,
116
,
116
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
2
,
124
,
124
),
(
30
,
2
,
116
,
116
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
2
,
1
,
256
,
256
),
(
20
,
1
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
2
,
1
,
256
,
256
),
(
20
,
1
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
1
,
2
,
256
,
256
),
(
20
,
2
,
248
,
248
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
1
,
2
,
256
,
256
),
(
20
,
2
,
248
,
248
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
]
]
shapes_full
=
[
shapes_full
=
[
# test_lenet_28 shape
# test_lenet_28 shape
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full
# test_lenet_32 shape
# test_lenet_32 shape
,
((
60
,
30
,
10
,
10
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full conv_full_patch_stack_padded' N=1
((
60
,
30
,
10
,
10
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full conv_full_patch_stack_padded' N=1
# test_lenet_64 shape
# test_lenet_64 shape
,
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full conv_full_patch_stack_padded' N=3
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full conv_full_patch_stack_padded' N=3
# test_lenet_108 shape
# test_lenet_108 shape
,
((
10
,
30
,
45
,
45
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full 'conv_full_patch_stack_padded' N=9
((
10
,
30
,
45
,
45
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full 'conv_full_patch_stack_padded' N=9
# test_lenet_256 shape
# test_lenet_256 shape
,
((
2
,
30
,
116
,
116
),
(
20
,
30
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full conv_reference_full
((
2
,
30
,
116
,
116
),
(
20
,
30
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full conv_reference_full
]
]
version
=
[
-
1
]
version
=
[
-
1
]
...
...
theano/sandbox/cuda/tests/test_cuda_ndarray.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
time
,
copy
,
sys
,
unittest
import
copy
import
unittest
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
...
@@ -32,7 +33,7 @@ def advantage(cpu_dt, gpu_dt):
...
@@ -32,7 +33,7 @@ def advantage(cpu_dt, gpu_dt):
def
test_host_to_device
():
def
test_host_to_device
():
#print >>sys.stdout, 'starting test_host_to_dev'
#
print >>sys.stdout, 'starting test_host_to_dev'
for
shape
in
((),
(
3
,),
(
2
,
3
),
(
3
,
4
,
5
,
6
)):
for
shape
in
((),
(
3
,),
(
2
,
3
),
(
3
,
4
,
5
,
6
)):
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
b
=
cuda_ndarray
.
CudaNdarray
(
a
)
b
=
cuda_ndarray
.
CudaNdarray
(
a
)
...
@@ -52,8 +53,7 @@ def test_host_to_device():
...
@@ -52,8 +53,7 @@ def test_host_to_device():
def
test_add_iadd_idiv
():
def
test_add_iadd_idiv
():
for
shapes
in
(
for
shapes
in
([(
5
,
5
),
(
5
,
1
)],
[(
5
,
5
),
(
5
,
1
)],
[(
5
,
5
),
(
1
,
5
)],
[(
5
,
5
),
(
1
,
5
)],
(),
(
0
,),
(
3
,),
(
2
,
3
),
(),
(
0
,),
(
3
,),
(
2
,
3
),
(
1
,
10000000
),
(
10000
,
1000
),
(
1000000
,
10
),
(
1
,
10000000
),
(
10000
,
1000
),
(
1000000
,
10
),
...
@@ -98,16 +98,10 @@ def test_add_iadd_idiv():
...
@@ -98,16 +98,10 @@ def test_add_iadd_idiv():
# add don't support stride
# add don't support stride
if
shape
==
shape2
:
if
shape
==
shape2
:
t0
=
time
.
time
()
bsum
=
b0
+
b1
bsum
=
b0
+
b1
bsum
=
b0
+
b1
bsum
=
b0
+
b1
t1
=
time
.
time
()
gpu_dt
=
t1
-
t0
t0
=
time
.
time
()
asum
=
a0
+
a1
asum
=
a0
+
a1
asum
=
a0
+
a1
asum
=
a0
+
a1
t1
=
time
.
time
()
cpu_dt
=
t1
-
t0
# print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
# print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
assert
numpy
.
allclose
(
asum
,
numpy
.
asarray
(
bsum
))
assert
numpy
.
allclose
(
asum
,
numpy
.
asarray
(
bsum
))
...
@@ -133,23 +127,9 @@ def test_add_iadd_idiv():
...
@@ -133,23 +127,9 @@ def test_add_iadd_idiv():
raise
Exception
(
"You need to modify this case!"
)
raise
Exception
(
"You need to modify this case!"
)
# TODO: b0[...,::-1] don't work
# TODO: b0[...,::-1] don't work
if
shape
==
shape2
:
t
=
False
try
:
_c
=
_b
+
b1
except
TypeError
:
t
=
True
assert
t
# test inplace version
# test inplace version
t0
=
time
.
time
()
b0
+=
b1
b0
+=
b1
t1
=
time
.
time
()
gpu_dt
=
t1
-
t0
t0
=
time
.
time
()
a0
+=
a1
a0
+=
a1
t1
=
time
.
time
()
cpu_dt
=
t1
-
t0
# print shape, 'adding inplace', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
# print shape, 'adding inplace', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
assert
numpy
.
allclose
(
a0
,
numpy
.
asarray
(
b0
))
assert
numpy
.
allclose
(
a0
,
numpy
.
asarray
(
b0
))
assert
numpy
.
allclose
(
a0
,
a0_orig
+
a1
)
assert
numpy
.
allclose
(
a0
,
a0_orig
+
a1
)
...
@@ -157,14 +137,14 @@ def test_add_iadd_idiv():
...
@@ -157,14 +137,14 @@ def test_add_iadd_idiv():
b0
/=
b1
b0
/=
b1
a0
/=
a1
a0
/=
a1
assert
numpy
.
allclose
(
a0
,
numpy
.
asarray
(
b0
))
assert
numpy
.
allclose
(
a0
,
numpy
.
asarray
(
b0
))
assert
numpy
.
allclose
(
a0
,
(
a0_orig
+
a1
)
/
a1
)
assert
numpy
.
allclose
(
a0
,
(
a0_orig
+
a1
)
/
a1
)
# test inplace version
# test inplace version
# for not contiguous input
# for not contiguous input
b0
+=
_b
b0
+=
_b
a0
+=
a1
[
...
,
::
-
1
]
a0
+=
a1
[
...
,
::
-
1
]
assert
numpy
.
allclose
(
a0
,
numpy
.
asarray
(
b0
))
assert
numpy
.
allclose
(
a0
,
numpy
.
asarray
(
b0
))
assert
numpy
.
allclose
(
a0
,
(
a0_orig
+
a1
)
/
a1
+
a1
[
...
,
::
-
1
])
assert
numpy
.
allclose
(
a0
,
(
a0_orig
+
a1
)
/
a1
+
a1
[
...
,
::
-
1
])
b0
/=
_b
b0
/=
_b
a0
/=
a1
[
...
,
::
-
1
]
a0
/=
a1
[
...
,
::
-
1
]
...
@@ -174,48 +154,42 @@ def test_add_iadd_idiv():
...
@@ -174,48 +154,42 @@ def test_add_iadd_idiv():
def
test_exp
():
def
test_exp
():
#print >>sys.stdout, 'starting test_exp'
#
print >>sys.stdout, 'starting test_exp'
for
shape
in
((),
(
3
,),
(
2
,
3
),
for
shape
in
((),
(
3
,),
(
2
,
3
),
(
1
,
10000000
),
(
10
,
1000000
),
(
1
,
10000000
),
(
10
,
1000000
),
(
100
,
100000
),
(
1000
,
10000
),
(
10000
,
1000
)):
(
100
,
100000
),
(
1000
,
10000
),
(
10000
,
1000
)):
a0
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a0
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a1
=
a0
.
copy
()
a1
=
a0
.
copy
()
b0
=
cuda_ndarray
.
CudaNdarray
(
a0
)
b0
=
cuda_ndarray
.
CudaNdarray
(
a0
)
b1
=
cuda_ndarray
.
CudaNdarray
(
a1
)
cuda_ndarray
.
CudaNdarray
(
a1
)
t0
=
time
.
time
()
bsum
=
b0
.
exp
()
bsum
=
b0
.
exp
()
t1
=
time
.
time
()
gpu_dt
=
t1
-
t0
t0
=
time
.
time
()
asum
=
numpy
.
exp
(
a1
)
asum
=
numpy
.
exp
(
a1
)
t1
=
time
.
time
()
cpu_dt
=
t1
-
t0
# print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
# print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
#c = numpy.asarray(b0+b1)
#
c = numpy.asarray(b0+b1)
if
asum
.
shape
:
if
asum
.
shape
:
assert
numpy
.
allclose
(
asum
,
numpy
.
asarray
(
bsum
))
assert
numpy
.
allclose
(
asum
,
numpy
.
asarray
(
bsum
))
def
test_copy
():
def
test_copy
():
#print >>sys.stdout, 'starting test_copy'
#
print >>sys.stdout, 'starting test_copy'
shape
=
(
500
,
499
)
shape
=
(
500
,
499
)
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
#print >>sys.stdout, '.. creating device object'
#
print >>sys.stdout, '.. creating device object'
b
=
cuda_ndarray
.
CudaNdarray
(
a
)
b
=
cuda_ndarray
.
CudaNdarray
(
a
)
#print >>sys.stdout, '.. copy'
#
print >>sys.stdout, '.. copy'
c
=
copy
.
copy
(
b
)
c
=
copy
.
copy
(
b
)
#print >>sys.stdout, '.. deepcopy'
#
print >>sys.stdout, '.. deepcopy'
d
=
copy
.
deepcopy
(
b
)
d
=
copy
.
deepcopy
(
b
)
#print >>sys.stdout, '.. comparisons'
#
print >>sys.stdout, '.. comparisons'
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
b
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
b
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
c
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
c
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
d
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
d
))
b
+=
b
b
+=
b
assert
numpy
.
allclose
(
a
+
a
,
numpy
.
asarray
(
b
))
assert
numpy
.
allclose
(
a
+
a
,
numpy
.
asarray
(
b
))
assert
numpy
.
allclose
(
a
+
a
,
numpy
.
asarray
(
c
))
assert
numpy
.
allclose
(
a
+
a
,
numpy
.
asarray
(
c
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
d
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
d
))
...
@@ -237,8 +211,8 @@ def test_nvcc_bug():
...
@@ -237,8 +211,8 @@ def test_nvcc_bug():
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
c
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
c
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
d
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
d
))
b
+=
b
b
+=
b
assert
numpy
.
allclose
(
a
+
a
,
numpy
.
asarray
(
b
))
assert
numpy
.
allclose
(
a
+
a
,
numpy
.
asarray
(
b
))
assert
numpy
.
allclose
(
a
+
a
,
numpy
.
asarray
(
c
))
assert
numpy
.
allclose
(
a
+
a
,
numpy
.
asarray
(
c
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
d
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
d
))
...
@@ -318,7 +292,7 @@ class test_DimShuffle(unittest.TestCase):
...
@@ -318,7 +292,7 @@ class test_DimShuffle(unittest.TestCase):
def
test_dot
():
def
test_dot
():
#print >>sys.stdout, 'starting test_dot'
#
print >>sys.stdout, 'starting test_dot'
utt
.
seed_rng
()
utt
.
seed_rng
()
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
...
@@ -347,12 +321,14 @@ def test_dot():
...
@@ -347,12 +321,14 @@ def test_dot():
b0
=
cuda_ndarray
.
CudaNdarray
(
a0
)
b0
=
cuda_ndarray
.
CudaNdarray
(
a0
)
assert
_allclose
(
numpy
.
dot
(
a0
.
T
,
a1
),
assert
_allclose
(
numpy
.
dot
(
a0
.
T
,
a1
),
cuda_ndarray
.
dot
(
cuda_ndarray
.
dimshuffle
(
b0
,
(
1
,
0
)),
b1
))
cuda_ndarray
.
dot
(
cuda_ndarray
.
dimshuffle
(
b0
,
(
1
,
0
)),
b1
))
a1
=
theano
.
_asarray
(
rng
.
randn
(
6
,
7
),
dtype
=
'float32'
)
a1
=
theano
.
_asarray
(
rng
.
randn
(
6
,
7
),
dtype
=
'float32'
)
b1
=
cuda_ndarray
.
CudaNdarray
(
a1
)
b1
=
cuda_ndarray
.
CudaNdarray
(
a1
)
assert
_allclose
(
numpy
.
dot
(
a0
.
T
,
a1
.
T
),
assert
_allclose
(
numpy
.
dot
(
a0
.
T
,
a1
.
T
),
cuda_ndarray
.
dot
(
cuda_ndarray
.
dimshuffle
(
b0
,
(
1
,
0
)),
cuda_ndarray
.
dot
(
cuda_ndarray
.
dimshuffle
(
b0
,
(
1
,
0
)),
cuda_ndarray
.
dimshuffle
(
b1
,
(
1
,
0
))))
cuda_ndarray
.
dimshuffle
(
b1
,
(
1
,
0
))))
...
@@ -367,8 +343,8 @@ def test_sum():
...
@@ -367,8 +343,8 @@ def test_sum():
assert
numpy
.
allclose
(
a0
.
sum
(),
assert
numpy
.
allclose
(
a0
.
sum
(),
numpy
.
asarray
(
b0
.
reduce_sum
([
1
,
1
])))
numpy
.
asarray
(
b0
.
reduce_sum
([
1
,
1
])))
a0
sum
=
a0
.
sum
(
axis
=
0
)
a0
.
sum
(
axis
=
0
)
b0
sum
=
b0
.
reduce_sum
([
1
,
0
])
b0
.
reduce_sum
([
1
,
0
])
# print 'asum\n',a0sum
# print 'asum\n',a0sum
# print 'bsum\n',numpy.asarray(b0sum)
# print 'bsum\n',numpy.asarray(b0sum)
...
@@ -399,8 +375,7 @@ def test_sum():
...
@@ -399,8 +375,7 @@ def test_sum():
def
test_reshape
():
def
test_reshape
():
shapelist
=
[
shapelist
=
[((
1
,
2
,
3
),
(
1
,
2
,
3
)),
((
1
,
2
,
3
),
(
1
,
2
,
3
)),
((
1
,),
(
1
,)),
((
1
,),
(
1
,)),
((
1
,
2
,
3
),
(
3
,
2
,
1
)),
((
1
,
2
,
3
),
(
3
,
2
,
1
)),
((
1
,
2
,
3
),
(
6
,)),
((
1
,
2
,
3
),
(
6
,)),
...
@@ -423,7 +398,7 @@ def test_reshape():
...
@@ -423,7 +398,7 @@ def test_reshape():
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
def
subtest
(
shape_1
,
shape_2
,
rng
):
def
subtest
(
shape_1
,
shape_2
,
rng
):
#print >> sys.stdout, "INFO: shapes", shape_1, shape_2
#
print >> sys.stdout, "INFO: shapes", shape_1, shape_2
a
=
theano
.
_asarray
(
rng
.
randn
(
*
shape_1
),
dtype
=
'float32'
)
a
=
theano
.
_asarray
(
rng
.
randn
(
*
shape_1
),
dtype
=
'float32'
)
b
=
cuda_ndarray
.
CudaNdarray
(
a
)
b
=
cuda_ndarray
.
CudaNdarray
(
a
)
...
@@ -459,8 +434,8 @@ def test_reshape():
...
@@ -459,8 +434,8 @@ def test_reshape():
b
=
cuda_ndarray
.
CudaNdarray
(
a
)
b
=
cuda_ndarray
.
CudaNdarray
(
a
)
try
:
try
:
b
b
=
b
.
reshape
(
shape_2
)
b
.
reshape
(
shape_2
)
except
Exception
as
ValueError
:
except
Exception
:
return
return
assert
False
assert
False
...
@@ -509,7 +484,7 @@ def test_stride_manipulation():
...
@@ -509,7 +484,7 @@ def test_stride_manipulation():
b_strides
=
b
.
_strides
b_strides
=
b
.
_strides
for
i
in
xrange
(
len
(
b
.
shape
)):
for
i
in
xrange
(
len
(
b
.
shape
)):
offset
+=
(
b
.
shape
[
i
]
-
1
)
*
b_strides
[
i
]
offset
+=
(
b
.
shape
[
i
]
-
1
)
*
b_strides
[
i
]
v
.
_set_stride
(
i
,
-
b_strides
[
i
])
v
.
_set_stride
(
i
,
-
b_strides
[
i
])
v
.
_dev_data
+=
offset
*
sizeof_float
v
.
_dev_data
+=
offset
*
sizeof_float
...
@@ -699,8 +674,8 @@ def test_setitem_matrixvector1():
...
@@ -699,8 +674,8 @@ def test_setitem_matrixvector1():
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
_a
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
_a
))
# test direct transfert from numpy
# test direct transfert from numpy
_a
[:,
1
]
=
b
*
100
_a
[:,
1
]
=
b
*
100
a
[:,
1
]
=
b
*
100
a
[:,
1
]
=
b
*
100
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
_a
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
_a
))
row
=
theano
.
_asarray
([
777
,
888
,
999
],
dtype
=
'float32'
)
row
=
theano
.
_asarray
([
777
,
888
,
999
],
dtype
=
'float32'
)
...
@@ -725,8 +700,8 @@ def test_setitem_matrix_tensor3():
...
@@ -725,8 +700,8 @@ def test_setitem_matrix_tensor3():
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
_a
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
_a
))
# test direct transfert from numpy
# test direct transfert from numpy
_a
[:,
1
,
1
]
=
b
*
100
_a
[:,
1
,
1
]
=
b
*
100
a
[:,
1
,
1
]
=
b
*
100
a
[:,
1
,
1
]
=
b
*
100
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
_a
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
_a
))
row
=
theano
.
_asarray
([
777
,
888
,
999
],
dtype
=
'float32'
)
row
=
theano
.
_asarray
([
777
,
888
,
999
],
dtype
=
'float32'
)
...
@@ -752,7 +727,7 @@ def test_setitem_matrix_bad_shape():
...
@@ -752,7 +727,7 @@ def test_setitem_matrix_bad_shape():
# attempt to assign the ndarray b with setitem
# attempt to assign the ndarray b with setitem
_a
[:,
1
,
1
]
=
_b
_a
[:,
1
,
1
]
=
_b
assert
False
assert
False
except
ValueError
as
e
:
except
ValueError
:
# print e
# print e
assert
True
assert
True
...
@@ -761,7 +736,7 @@ def test_setitem_matrix_bad_shape():
...
@@ -761,7 +736,7 @@ def test_setitem_matrix_bad_shape():
# attempt to assign the ndarray b with setitem
# attempt to assign the ndarray b with setitem
_a
[
1
,
1
,
:]
=
b
_a
[
1
,
1
,
:]
=
b
assert
False
assert
False
except
ValueError
as
e
:
except
ValueError
:
# print e
# print e
assert
True
assert
True
...
@@ -779,7 +754,7 @@ def test_setitem_matrix_bad_ndim():
...
@@ -779,7 +754,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem
# attempt to assign the ndarray b with setitem
_a
[:,
:,
1
]
=
_b
_a
[:,
:,
1
]
=
_b
assert
False
assert
False
except
ValueError
as
e
:
except
ValueError
:
# print e
# print e
assert
True
assert
True
...
@@ -788,7 +763,7 @@ def test_setitem_matrix_bad_ndim():
...
@@ -788,7 +763,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem
# attempt to assign the ndarray b with setitem
_a
[
1
,
:,
:]
=
b
_a
[
1
,
:,
:]
=
b
assert
False
assert
False
except
ValueError
as
e
:
except
ValueError
:
# print e
# print e
assert
True
assert
True
...
@@ -806,7 +781,7 @@ def test_setitem_matrix_bad_type():
...
@@ -806,7 +781,7 @@ def test_setitem_matrix_bad_type():
# attempt to assign the ndarray b with setitem
# attempt to assign the ndarray b with setitem
_a
[
1
,
:,
:]
=
b
_a
[
1
,
:,
:]
=
b
assert
False
assert
False
except
TypeError
as
e
:
except
TypeError
:
# print e
# print e
assert
True
assert
True
...
@@ -832,8 +807,8 @@ def test_setitem_assign_to_slice():
...
@@ -832,8 +807,8 @@ def test_setitem_assign_to_slice():
# test direct transfert from numpy
# test direct transfert from numpy
_d
=
_a
[
1
,
:,
:]
_d
=
_a
[
1
,
:,
:]
_d
[
1
,
:]
=
b
*
10
_d
[
1
,
:]
=
b
*
10
a
[
1
,
:,
:][
1
,
:]
=
b
*
10
a
[
1
,
:,
:][
1
,
:]
=
b
*
10
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
_a
))
assert
numpy
.
allclose
(
a
,
numpy
.
asarray
(
_a
))
...
@@ -923,7 +898,7 @@ def test_setitem_rightvalue_ndarray_fails():
...
@@ -923,7 +898,7 @@ def test_setitem_rightvalue_ndarray_fails():
b
=
theano
.
_asarray
([
7
,
8
,
9
,
10
],
dtype
=
'float32'
)
b
=
theano
.
_asarray
([
7
,
8
,
9
,
10
],
dtype
=
'float32'
)
_b
=
cuda_ndarray
.
CudaNdarray
(
b
)
_b
=
cuda_ndarray
.
CudaNdarray
(
b
)
b5
=
theano
.
_asarray
([
7
,
8
,
9
,
10
,
11
],
dtype
=
'float32'
)
b5
=
theano
.
_asarray
([
7
,
8
,
9
,
10
,
11
],
dtype
=
'float32'
)
_b5
=
cuda_ndarray
.
CudaNdarray
(
b
)
cuda_ndarray
.
CudaNdarray
(
b
)
# attempt to assign the ndarray b with setitem
# attempt to assign the ndarray b with setitem
_a
[:,
:,
1
]
=
_b
_a
[:,
:,
1
]
=
_b
...
@@ -941,9 +916,9 @@ def test_setitem_rightvalue_ndarray_fails():
...
@@ -941,9 +916,9 @@ def test_setitem_rightvalue_ndarray_fails():
# without same number of dim
# without same number of dim
try
:
try
:
_a
[
0
,
:,
:]
=
mat
_a
[
0
,
:,
:]
=
mat
#a[0, :, :] = mat
#
a[0, :, :] = mat
#assert numpy.allclose(numpy.asarray(_a), a)
#
assert numpy.allclose(numpy.asarray(_a), a)
except
ValueError
as
e
:
except
ValueError
:
pass
pass
# test direct transfert from numpy with broadcast
# test direct transfert from numpy with broadcast
...
@@ -964,7 +939,7 @@ def test_zeros_basic():
...
@@ -964,7 +939,7 @@ def test_zeros_basic():
_n
=
numpy
.
zeros
(
shp
,
dtype
=
"float32"
)
_n
=
numpy
.
zeros
(
shp
,
dtype
=
"float32"
)
assert
numpy
.
allclose
(
numpy
.
asarray
(
_a
),
_n
)
assert
numpy
.
allclose
(
numpy
.
asarray
(
_a
),
_n
)
assert
_a
.
shape
==
_n
.
shape
assert
_a
.
shape
==
_n
.
shape
assert
all
(
_a
.
_strides
==
numpy
.
asarray
(
_n
.
strides
)
/
4
)
assert
all
(
_a
.
_strides
==
numpy
.
asarray
(
_n
.
strides
)
/
4
)
# TODO:The following don't have the same stride!
# TODO:The following don't have the same stride!
# This should be fixed with the new GpuNdArray.
# This should be fixed with the new GpuNdArray.
...
@@ -1039,10 +1014,7 @@ def test_is_c_contiguous():
...
@@ -1039,10 +1014,7 @@ def test_is_c_contiguous():
assert
not
a
[::
2
]
.
is_c_contiguous
()
assert
not
a
[::
2
]
.
is_c_contiguous
()
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
test_zeros_basic_3d_tensor
()
test_zeros_basic_vector
()
test_setitem_matrixvector1
()
test_setitem_matrixvector1
()
test_setitem_matrix_tensor3
()
test_setitem_matrix_tensor3
()
test_setitem_broadcast_must_fail
()
test_setitem_assign_to_slice
()
test_setitem_assign_to_slice
()
test_setitem_rightvalue_ndarray_fails
()
test_setitem_rightvalue_ndarray_fails
()
theano/sandbox/cuda/tests/test_driver.py
浏览文件 @
b69ad54d
...
@@ -6,7 +6,7 @@ import theano
...
@@ -6,7 +6,7 @@ import theano
try
:
try
:
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
except
ImportError
:
except
ImportError
:
# To have the GPU back-end work without nose, we need this file to
# To have the GPU back-end work without nose, we need this file to
...
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
...
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
2
assert
len
(
topo
)
==
2
if
sum
(
isinstance
(
node
.
op
,
B
.
GpuCAReduce
)
for
node
in
topo
)
!=
1
:
if
sum
(
isinstance
(
node
.
op
,
B
.
GpuCAReduce
)
for
node
in
topo
)
!=
1
:
msg
=
'
\n\t
'
.
join
([
'Expected exactly one occurrence of GpuCAReduce '
+
msg
=
'
\n\t
'
.
join
(
'but got:'
]
+
[
str
(
app
)
for
app
in
topo
])
[
'Expected exactly one occurrence of GpuCAReduce '
+
'but got:'
]
+
[
str
(
app
)
for
app
in
topo
])
raise
AssertionError
(
msg
)
raise
AssertionError
(
msg
)
if
not
numpy
.
allclose
(
f
(),
a
.
sum
()):
if
not
numpy
.
allclose
(
f
(),
a
.
sum
()):
raise
Exception
(
"The nvidia driver version installed with this OS "
raise
Exception
(
"The nvidia driver version installed with this OS "
...
...
theano/sandbox/cuda/tests/test_extra_ops.py
浏览文件 @
b69ad54d
...
@@ -5,24 +5,22 @@ import itertools
...
@@ -5,24 +5,22 @@ import itertools
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
numpy
as
np
import
numpy
as
np
from
six.moves
import
xrange
from
six.moves
import
xrange
from
theano
import
tensor
as
T
import
theano
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests
import
unittest_tools
as
utt
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
is
False
:
if
cuda_ndarray
.
cuda_available
:
import
theano.tensor.tests.test_extra_ops
from
theano.sandbox.cuda.extra_ops
import
GpuCumsum
else
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor.tests.test_extra_ops
from
theano.sandbox.cuda.extra_ops
import
GpuCumsum
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
else
:
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
from
theano
import
tensor
as
T
import
theano
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests
import
unittest_tools
as
utt
class
TestGpuCumsum
(
theano
.
tensor
.
tests
.
test_extra_ops
.
TestCumsumOp
):
class
TestGpuCumsum
(
theano
.
tensor
.
tests
.
test_extra_ops
.
TestCumsumOp
):
mode
=
mode_with_gpu
mode
=
mode_with_gpu
...
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt
.
assert_allclose
(
np
.
cumsum
(
a
[:
i
]),
f
(
a
[:
i
]))
utt
.
assert_allclose
(
np
.
cumsum
(
a
[:
i
]),
f
(
a
[:
i
]))
# Use multiple GPU threadblocks
# Use multiple GPU threadblocks
a
=
np
.
random
.
random
((
block_max_size
+
2
,))
.
astype
(
"float32"
)
a
=
np
.
random
.
random
((
block_max_size
+
2
,))
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
# Use recursive cumsum
# Use recursive cumsum
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
dtype
=
"float32"
)
dtype
=
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
...
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks
# Use multiple GPU threadblocks
a_shape
=
[
5
,
5
]
a_shape
=
[
5
,
5
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks
# Use multiple GPU gridblocks
a_shape
=
[
4
,
4
]
a_shape
=
[
4
,
4
]
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
),
rtol
=
5e-5
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
),
rtol
=
5e-5
)
# Use recursive cumsum
# Use recursive cumsum
a_shape
=
[
3
,
3
]
a_shape
=
[
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum3D
(
self
):
def
test_GpuCumsum3D
(
self
):
...
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis)
# Use multiple GPU threadblocks (along accumulation axis)
a_shape
=
[
2
,
2
,
2
]
a_shape
=
[
2
,
2
,
2
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks (not along accumulation axis)
# Use multiple GPU gridblocks (not along accumulation axis)
a_shape
=
[
5
,
5
,
5
]
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
if
axis
is
None
:
# Avoid floating point error
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
a_shape
=
[
5
,
5
,
5
]
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
if
axis
is
None
:
# Avoid floating point error
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use recursive cumsum (along accumulation axis)
# Use recursive cumsum (along accumulation axis)
a_shape
=
[
3
,
3
,
3
]
a_shape
=
[
3
,
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum4D
(
self
):
def
test_GpuCumsum4D
(
self
):
...
...
theano/sandbox/cuda/tests/test_gemmcorr3d.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
unittest
import
unittest
import
numpy
import
numpy
import
copy
import
theano
import
theano
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
# Skip tests if cuda_ndarray is not available.
# Skip tests if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda not available'
)
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
from
theano.sandbox.cuda.blas
import
(
from
theano.sandbox.cuda.blas
import
(
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
)
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
)
from
theano.sandbox.cuda.basic_ops
import
gpu_contiguous
from
theano.sandbox.cuda.basic_ops
import
gpu_contiguous
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda not available'
)
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
...
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
inputs
=
shared
(
inputs_val
)
inputs
=
shared
(
inputs_val
)
filters
=
shared
(
filters_val
)
filters
=
shared
(
filters_val
)
bias
=
shared
(
numpy
.
zeros
(
filters_shape
[
4
])
.
astype
(
'float32'
))
bias
=
shared
(
numpy
.
zeros
(
filters_shape
[
4
])
.
astype
(
'float32'
))
conv
=
theano
.
tensor
.
nnet
.
convTransp3D
(
W
=
filters
,
b
=
bias
,
d
=
subsample
,
conv
=
theano
.
tensor
.
nnet
.
convTransp3D
(
W
=
filters
,
b
=
bias
,
d
=
subsample
,
H
=
inputs
)
H
=
inputs
)
f_ref
=
theano
.
function
([],
conv
)
f_ref
=
theano
.
function
([],
conv
)
res_ref
=
f_ref
()
res_ref
=
f_ref
()
...
...
theano/sandbox/cuda/tests/test_gradient.py
浏览文件 @
b69ad54d
...
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
...
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
theano/sandbox/cuda/tests/test_memory.py
浏览文件 @
b69ad54d
...
@@ -11,7 +11,7 @@ from theano import ifelse
...
@@ -11,7 +11,7 @@ from theano import ifelse
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
...
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
theano_alloc
=
cuda
.
cuda_ndarray
.
cuda_ndarray
.
theano_allocated
()
theano_alloc
=
cuda
.
cuda_ndarray
.
cuda_ndarray
.
theano_allocated
()
return
(
"(n malloc/theano mem allocated in KB)"
,
return
(
"(n malloc/theano mem allocated in KB)"
,
n_mallocs
+
extra_alloc
,
n_mallocs
+
extra_alloc
,
int
(
theano_alloc
/
1024
)
+
extra_size
)
int
(
theano_alloc
/
1024
))
return
(
"n malloc on the gpu"
,
n_mallocs
+
extra_alloc
)
return
(
"n malloc on the gpu"
,
n_mallocs
+
extra_alloc
)
# I don't use the following by default as if there is other stuff running
# I don't use the following by default as if there is other stuff running
...
@@ -83,9 +83,12 @@ def test_memory():
...
@@ -83,9 +83,12 @@ def test_memory():
variables
=
cuda
.
shared_constructor
(
np
.
ones
((
shapes
[
1
],),
variables
=
cuda
.
shared_constructor
(
np
.
ones
((
shapes
[
1
],),
dtype
=
'float32'
))
dtype
=
'float32'
))
derp
=
tensor
.
sum
(
tensor
.
dot
(
some_matrix
[:
shapes
[
0
]],
variables
))
derp
=
tensor
.
sum
(
tensor
.
dot
(
some_matrix
[:
shapes
[
0
]],
variables
))
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
borrow
=
True
,
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
mem2
=
freemem
()
mem2
=
freemem
()
print
(
"Before compilation"
,
mem2
)
print
(
"Before compilation"
,
mem2
)
...
@@ -112,7 +115,7 @@ def test_memory():
...
@@ -112,7 +115,7 @@ def test_memory():
del
obj
del
obj
# print "After deleting function 1", freemem()
# print "After deleting function 1", freemem()
#assert mem2 == freemem(), (mem2, freemem())
#
assert mem2 == freemem(), (mem2, freemem())
del
grad
del
grad
print
(
"After deleting function 2"
,
freemem
())
print
(
"After deleting function 2"
,
freemem
())
...
@@ -155,16 +158,19 @@ def test_memory_lazy():
...
@@ -155,16 +158,19 @@ def test_memory_lazy():
derp
=
ifelse
.
IfElse
(
1
)(
branch_select
,
derp
=
ifelse
.
IfElse
(
1
)(
branch_select
,
derp
,
some_matrix
[:
shapes
[
0
]]
.
sum
())
derp
,
some_matrix
[:
shapes
[
0
]]
.
sum
())
derp
+=
1
derp
+=
1
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
borrow
=
True
,
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
mem2
=
freemem
()
mem2
=
freemem
()
print
(
"Before compilation"
,
mem2
)
print
(
"Before compilation"
,
mem2
)
mem2_1
=
freemem
(
extra_alloc
=
more_alloc1
)
mem2_1
=
freemem
(
extra_alloc
=
more_alloc1
)
obj
=
theano
.
function
([
some_vector
,
branch_select
],
derp
,
obj
=
theano
.
function
([
some_vector
,
branch_select
],
derp
,
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
#theano.printing.debugprint(obj, print_type=True)
#
theano.printing.debugprint(obj, print_type=True)
mem3
=
freemem
()
mem3
=
freemem
()
print
(
"After function compilation 1"
,
mem3
)
print
(
"After function compilation 1"
,
mem3
)
assert
mem2_1
==
mem3
,
(
mem2_1
,
mem3
)
assert
mem2_1
==
mem3
,
(
mem2_1
,
mem3
)
...
...
theano/sandbox/cuda/tests/test_mlp.py
浏览文件 @
b69ad54d
...
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
...
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
'otherwise it is too slow!'
)
'otherwise it is too slow!'
)
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
if
tcn
.
cuda_available
==
False
:
if
tcn
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
@@ -147,19 +147,20 @@ def test_run_nnet():
...
@@ -147,19 +147,20 @@ def test_run_nnet():
rtol
=
1e-4
rtol
=
1e-4
if
n_in
*
n_hid
>=
2048
*
4096
:
if
n_in
*
n_hid
>=
2048
*
4096
:
rtol
=
7e-4
rtol
=
7e-4
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
rtol
,
atol
=
1e-6
),
\
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
rtol
,
atol
=
1e-6
),
\
(
"max_abs_diff, max_rel_diff, n_in, n_hid"
,
max_abs_diff
,
(
"max_abs_diff, max_rel_diff, n_in, n_hid"
,
max_abs_diff
,
rel_diff
.
max
(),
n_in
,
n_hid
)
rel_diff
.
max
(),
n_in
,
n_hid
)
def
test_run_nnet_med
():
def
test_run_nnet_med
():
utt
.
seed_rng
()
utt
.
seed_rng
()
r
val_cpu
=
r
un_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
run_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
def
test_run_nnet_small
():
def
test_run_nnet_small
():
utt
.
seed_rng
()
utt
.
seed_rng
()
r
val_cpu
=
r
un_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
run_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
def
run_conv_nnet1
(
use_gpu
):
def
run_conv_nnet1
(
use_gpu
):
...
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
...
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
mode
=
get_mode
(
use_gpu
)
mode
=
get_mode
(
use_gpu
)
# print 'building pfunc ...'
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
train
=
pfunc
(
g
in
zip
(
params
,
gparams
)])
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
# print i, n
...
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
...
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op1
=
conv
.
ConvOp
((
n_kern
,
logical_hid_shape
[
0
]
//
2
,
conv_op1
=
conv
.
ConvOp
((
n_kern
,
logical_hid_shape
[
0
]
//
2
,
logical_hid_shape
[
1
]
//
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
logical_hid_shape
[
1
]
//
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
[:,
:,
::
2
,
::
2
],
w1
)
+
b1
.
dimshuffle
((
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
[:,
:,
::
2
,
::
2
],
w1
)
+
b1
.
dimshuffle
((
...
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
...
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
mode
=
get_mode
(
use_gpu
)
mode
=
get_mode
(
use_gpu
)
# print 'building pfunc ...'
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
train
=
pfunc
(
g
in
zip
(
params
,
gparams
)])
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
# print i, n
...
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
...
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
if
downsample_ops
:
if
downsample_ops
:
hid
=
tensor
.
tanh
(
ds_op
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
))))
hid
=
tensor
.
tanh
(
ds_op
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
))))
else
:
else
:
hid
=
tensor
.
tanh
((
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)
hid
=
tensor
.
tanh
(
))[:,
:,
::
2
,
::
2
])
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
(
(
0
,
'x'
,
'x'
)))[:,
:,
::
2
,
::
2
])
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
,
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
,
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
out
=
tensor
.
nnet
.
softmax
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
out
=
tensor
.
nnet
.
softmax
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
out
,
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
out
,
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
# print 'loss type', loss.type
# print 'loss type', loss.type
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
...
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
...
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
mode
=
get_mode
(
use_gpu
,
check_isfinite
)
mode
=
get_mode
(
use_gpu
,
check_isfinite
)
# print 'building pfunc ...'
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
train
=
pfunc
(
g
in
zip
(
params
,
gparams
)])
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
if
verbose
:
if
verbose
:
theano
.
printing
.
debugprint
(
train
)
theano
.
printing
.
debugprint
(
train
)
...
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
...
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
rvals
=
my_zeros
(
n_train
)
rvals
=
my_zeros
(
n_train
)
t0
=
time
.
time
()
for
i
in
xrange
(
n_train
):
for
i
in
xrange
(
n_train
):
rvals
[
i
]
=
train
(
xval
,
yval
,
lr
)[
0
]
rvals
[
i
]
=
train
(
xval
,
yval
,
lr
)[
0
]
t1
=
time
.
time
()
print_mode
(
mode
)
print_mode
(
mode
)
if
pickle
and
isinstance
(
mode
,
theano
.
compile
.
ProfileMode
):
if
pickle
and
isinstance
(
mode
,
theano
.
compile
.
ProfileMode
):
...
@@ -495,7 +506,8 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
...
@@ -495,7 +506,8 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
compare
=
True
compare
=
True
if
not
compare
:
if
not
compare
:
return
run_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
return
run_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
seed
=
seed
,
isize
=
isize
,
ksize
=
ksize
,
bsize
=
bsize
,
seed
=
seed
,
isize
=
isize
,
ksize
=
ksize
,
bsize
=
bsize
,
n_train
=
n_train
,
n_train
=
n_train
,
check_isfinite
=
check_isfinite
,
check_isfinite
=
check_isfinite
,
...
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
...
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
finally
:
finally
:
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
if
pickle
:
if
isinstance
(
cpu_mode
,
theano
.
compile
.
ProfileMode
):
import
pickle
print
(
"BEGIN CPU profile mode dump"
)
print
(
pickle
.
dumps
(
cpu_mode
))
print
(
"END CPU profile mode dump"
)
if
isinstance
(
gpu_mode
,
theano
.
compile
.
ProfileMode
):
import
pickle
print
(
"BEGIN GPU profile mode dump"
)
print
(
pickle
.
dumps
(
gpu_mode
))
print
(
"END GPU profile mode dump"
)
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# (time_cpu, time_gpu, time_cpu/time_gpu))
# (time_cpu, time_gpu, time_cpu/time_gpu))
# print "Estimated time for one pass through MNIST with CPU: %f" % (
# print "Estimated time for one pass through MNIST with CPU: %f" % (
...
...
theano/sandbox/cuda/tests/test_neighbours.py
浏览文件 @
b69ad54d
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
unittest
import
theano.tensor.nnet.tests.test_neighbours
from
theano.sandbox.cuda.neighbours
import
GpuImages2Neibs
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor.nnet.tests.test_neighbours
from
theano.sandbox.cuda.neighbours
import
GpuImages2Neibs
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
...
theano/sandbox/cuda/tests/test_opt.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
operator
import
operator
import
sys
import
sys
import
unittest
import
numpy
import
numpy
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
...
@@ -9,39 +8,28 @@ from nose.plugins.skip import SkipTest
...
@@ -9,39 +8,28 @@ from nose.plugins.skip import SkipTest
from
nose.tools
import
assert_raises
from
nose.tools
import
assert_raises
import
theano
import
theano
import
theano.sandbox.cuda.cula
as
cula
from
theano.sandbox.cuda
import
basic_ops
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.scalar.basic_scipy
import
erfinv
from
six.moves
import
reduce
from
six.moves
import
reduce
from
theano.compile.pfunc
import
pfunc
from
theano.compile.pfunc
import
pfunc
from
theano
import
config
,
tensor
from
theano
import
config
,
tensor
import
theano.tensor.tests.test_nlinalg
import
theano.tensor.tests.test_nlinalg
import
theano.tensor.tests.test_opt
as
test_opt
import
theano.tensor.tests.test_opt
as
test_opt
from
theano.tensor.nnet.blocksparse
import
sparse_block_dot
from
theano.sandbox.cuda.blocksparse
import
GpuSparseBlockGemv
from
theano.sandbox.cuda.blocksparse
import
GpuSparseBlockOuter
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
import
theano.tests.test_ifelse
import
theano.sandbox.cuda
as
cuda
import
theano.sandbox.cuda
as
cuda
if
not
cuda
.
cuda_available
:
if
not
cuda
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.sandbox.cuda.cula
as
cula
from
theano.sandbox.cuda
import
basic_ops
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.scalar.basic_scipy
import
erfinv
from
theano.tensor.nnet.blocksparse
import
sparse_block_dot
from
theano.sandbox.cuda.blocksparse
import
GpuSparseBlockGemv
,
GpuSparseBlockOuter
imported_scipy_special
=
False
try
:
import
scipy.special
imported_scipy_special
=
True
# Importing scipy.special may raise ValueError.
# See http://projects.scipy.org/scipy/ticket/1739
except
(
ImportError
,
ValueError
):
pass
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpu'
)
...
@@ -152,7 +140,7 @@ def test_local_assert_no_cpu_op():
...
@@ -152,7 +140,7 @@ def test_local_assert_no_cpu_op():
def
test_int_pow
():
def
test_int_pow
():
a
=
CudaNdarrayType
([
False
])()
a
=
CudaNdarrayType
([
False
])()
f
=
theano
.
function
([
a
],
(
a
*
4
)
.
sum
(),
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
a
],
(
a
*
4
)
.
sum
(),
mode
=
mode_with_gpu
)
op_names
=
[
n
.
op
.
__class__
.
__name__
for
n
in
f
.
maker
.
fgraph
.
toposort
()]
op_names
=
[
n
.
op
.
__class__
.
__name__
for
n
in
f
.
maker
.
fgraph
.
toposort
()]
assert
op_names
==
[
'GpuCAReduce'
,
'GpuElemwise'
,
'HostFromGpu'
]
assert
op_names
==
[
'GpuCAReduce'
,
'GpuElemwise'
,
'HostFromGpu'
]
...
@@ -175,23 +163,30 @@ def test_gpualloc():
...
@@ -175,23 +163,30 @@ def test_gpualloc():
x
=
theano
.
shared
(
numpy
.
ones
(
3
,
dtype
=
'float32'
),
'x'
)
x
=
theano
.
shared
(
numpy
.
ones
(
3
,
dtype
=
'float32'
),
'x'
)
m
=
(
x
)
.
dimshuffle
([
'x'
,
0
])
m
=
(
x
)
.
dimshuffle
([
'x'
,
0
])
v
=
tensor
.
alloc
(
1.
,
*
m
.
shape
)
v
=
tensor
.
alloc
(
1.
,
*
m
.
shape
)
f
=
theano
.
function
([],
v
+
x
,
f
=
theano
.
function
([],
mode
=
mode_with_gpu
.
excluding
(
"local_elemwise_alloc"
))
v
+
x
,
mode
=
mode_with_gpu
.
excluding
(
"local_elemwise_alloc"
))
l
=
f
.
maker
.
fgraph
.
toposort
()
l
=
f
.
maker
.
fgraph
.
toposort
()
assert
numpy
.
any
([
isinstance
(
x
.
op
,
cuda
.
GpuAlloc
)
for
x
in
l
])
assert
numpy
.
any
([
isinstance
(
x
.
op
,
cuda
.
GpuAlloc
)
for
y
in
l
])
def
test_gpuallocempty
():
def
test_gpuallocempty
():
f_gpu
=
theano
.
function
([],
tensor
.
AllocEmpty
(
'float32'
)(
2
,
3
),
f_gpu
=
theano
.
function
(
[],
tensor
.
AllocEmpty
(
'float32'
)(
2
,
3
),
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
l_gpu
=
f_gpu
.
maker
.
fgraph
.
toposort
()
l_gpu
=
f_gpu
.
maker
.
fgraph
.
toposort
()
assert
numpy
.
any
([
isinstance
(
x
.
op
,
basic_ops
.
GpuAllocEmpty
)
for
x
in
l_gpu
])
assert
numpy
.
any
(
[
isinstance
(
x
.
op
,
basic_ops
.
GpuAllocEmpty
)
for
x
in
l_gpu
])
f_cpu
=
theano
.
function
([],
tensor
.
AllocEmpty
(
'int32'
)(
2
,
3
))
f_cpu
=
theano
.
function
([],
tensor
.
AllocEmpty
(
'int32'
)(
2
,
3
))
l_cpu
=
f_cpu
.
maker
.
fgraph
.
toposort
()
l_cpu
=
f_cpu
.
maker
.
fgraph
.
toposort
()
assert
not
numpy
.
any
([
isinstance
(
x
.
op
,
basic_ops
.
GpuAllocEmpty
)
for
x
in
l_cpu
])
assert
not
numpy
.
any
(
[
isinstance
(
x
.
op
,
basic_ops
.
GpuAllocEmpty
)
for
x
in
l_cpu
])
class
Test_local_elemwise_alloc
(
test_opt
.
Test_local_elemwise_alloc
):
class
Test_local_elemwise_alloc
(
test_opt
.
Test_local_elemwise_alloc
):
dtype
=
'float32'
dtype
=
'float32'
...
@@ -269,7 +264,8 @@ def test_gpuspecifyshape():
...
@@ -269,7 +264,8 @@ def test_gpuspecifyshape():
f
=
theano
.
function
([],
updates
=
[(
x
,
m
*
numpy
.
float32
(
2
))],
f
=
theano
.
function
([],
updates
=
[(
x
,
m
*
numpy
.
float32
(
2
))],
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
l
=
f
.
maker
.
fgraph
.
toposort
()
l
=
f
.
maker
.
fgraph
.
toposort
()
assert
not
numpy
.
any
([
isinstance
(
x
.
op
,
cuda
.
HostFromGpu
)
for
x
in
l
])
assert
not
numpy
.
any
(
[
isinstance
(
x
.
op
,
cuda
.
HostFromGpu
)
for
y
in
l
])
def
test_softmax
():
def
test_softmax
():
...
@@ -430,7 +426,7 @@ def test_local_gpu_subtensor():
...
@@ -430,7 +426,7 @@ def test_local_gpu_subtensor():
# Test multiple use of the input
# Test multiple use of the input
# We want the subtensor to be on the GPU to prevent multiple transfer.
# We want the subtensor to be on the GPU to prevent multiple transfer.
t
=
tensor
.
fmatrix
()
t
=
tensor
.
fmatrix
()
f
=
theano
.
function
([
t
],
[
t
[
3
:
4
],
t
+
1
],
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
t
],
[
t
[
3
:
4
],
t
+
1
],
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
cuda
.
GpuSubtensor
)
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
cuda
.
GpuSubtensor
)
for
node
in
topo
])
...
@@ -438,7 +434,7 @@ def test_local_gpu_subtensor():
...
@@ -438,7 +434,7 @@ def test_local_gpu_subtensor():
# Test multiple use of the input + input as output
# Test multiple use of the input + input as output
# We want the subtensor to be on the GPU to prevent multiple transfer.
# We want the subtensor to be on the GPU to prevent multiple transfer.
t
=
tensor
.
fmatrix
()
t
=
tensor
.
fmatrix
()
f
=
theano
.
function
([
t
],
[
t
[
3
:
4
],
t
+
1
,
t
],
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
t
],
[
t
[
3
:
4
],
t
+
1
,
t
],
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
cuda
.
GpuSubtensor
)
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
cuda
.
GpuSubtensor
)
for
node
in
topo
])
...
@@ -446,7 +442,7 @@ def test_local_gpu_subtensor():
...
@@ -446,7 +442,7 @@ def test_local_gpu_subtensor():
# Test shared forced on CPU end we do computation on the output of
# Test shared forced on CPU end we do computation on the output of
# the subtensor.
# the subtensor.
t
=
tensor
.
_shared
(
numpy
.
zeros
(
20
,
"float32"
))
t
=
tensor
.
_shared
(
numpy
.
zeros
(
20
,
"float32"
))
f
=
theano
.
function
([],
t
[
3
:
4
]
+
1
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([],
t
[
3
:
4
]
+
1
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
isinstance
(
node
.
op
,
cuda
.
GpuSubtensor
)
for
node
in
topo
])
assert
not
any
([
isinstance
(
node
.
op
,
cuda
.
GpuSubtensor
)
for
node
in
topo
])
...
@@ -507,10 +503,11 @@ def test_local_gpu_split():
...
@@ -507,10 +503,11 @@ def test_local_gpu_split():
def
test_print_op
():
def
test_print_op
():
""" Test that print ops don't block gpu optimization"""
""" Test that print ops don't block gpu optimization"""
b
=
tensor
.
fmatrix
()
b
=
tensor
.
fmatrix
()
f
=
theano
.
function
([
b
],
theano
.
printing
.
Print
()(
b
)
*
2
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
(
[
b
],
theano
.
printing
.
Print
()(
b
)
*
2
,
mode
=
mode_with_gpu
)
# theano.printing.debugprint(f)
# theano.printing.debugprint(f)
# print f.maker.fgraph.toposort()
# print f.maker.fgraph.toposort()
#
[GpuFromHost(<TensorType(float32, matrix)>), <theano.printing.Print object at 0x3581210>(GpuFromHost.0), GpuElemwise{mul}(CudaNdarray{[[ 2.]]}, <theano.printing.Print object at 0x3581210>.0), HostFromGpu(GpuElemwise{mul}.0)]
#
[GpuFromHost(<TensorType(float32, matrix)>), <theano.printing.Print object at 0x3581210>(GpuFromHost.0), GpuElemwise{mul}(CudaNdarray{[[ 2.]]}, <theano.printing.Print object at 0x3581210>.0), HostFromGpu(GpuElemwise{mul}.0)]
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
topo
[
0
]
.
op
==
cuda
.
gpu_from_host
assert
topo
[
0
]
.
op
==
cuda
.
gpu_from_host
assert
isinstance
(
topo
[
1
]
.
op
,
theano
.
printing
.
Print
)
assert
isinstance
(
topo
[
1
]
.
op
,
theano
.
printing
.
Print
)
...
@@ -563,8 +560,10 @@ def test_huge_elemwise_fusion():
...
@@ -563,8 +560,10 @@ def test_huge_elemwise_fusion():
bytes limits.
bytes limits.
"""
"""
shape
=
(
2
,
3
,
4
,
5
,
6
)
shape
=
(
2
,
3
,
4
,
5
,
6
)
ttype
=
tensor
.
tensor
(
dtype
=
'float32'
,
broadcastable
=
(
False
,)
*
len
(
shape
))
ttype
=
tensor
.
tensor
(
dtype
=
'float32'
,
gpu_ptr_size
=
theano
.
sandbox
.
cuda
.
opt
.
get_device_type_sizes
()[
'gpu_ptr_size'
]
broadcastable
=
(
False
,)
*
len
(
shape
))
gpu_ptr_size
=
theano
.
sandbox
.
cuda
.
opt
.
get_device_type_sizes
()[
'gpu_ptr_size'
]
if
gpu_ptr_size
==
8
:
if
gpu_ptr_size
==
8
:
nb_in
=
7
nb_in
=
7
len_topo
=
10
len_topo
=
10
...
@@ -582,14 +581,19 @@ def test_huge_elemwise_fusion():
...
@@ -582,14 +581,19 @@ def test_huge_elemwise_fusion():
assert
isinstance
(
topo
[
-
3
]
.
op
.
scalar_op
,
theano
.
scalar
.
basic
.
Sub
)
assert
isinstance
(
topo
[
-
3
]
.
op
.
scalar_op
,
theano
.
scalar
.
basic
.
Sub
)
assert
isinstance
(
topo
[
-
2
]
.
op
.
scalar_op
,
theano
.
scalar
.
basic
.
Composite
)
assert
isinstance
(
topo
[
-
2
]
.
op
.
scalar_op
,
theano
.
scalar
.
basic
.
Composite
)
# let debugmode catch errors
# let debugmode catch errors
gen
=
lambda
:
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
# gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
def
gen
():
return
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
f
(
*
[
gen
()
for
i
in
range
(
nb_in
)])
f
(
*
[
gen
()
for
i
in
range
(
nb_in
)])
# Test the case where we can't put the computation on the gpu! their is too
# Test the case where we can't put the computation on the gpu! their is too
# many dimensions to the input to have 2 inputs to the op!
# many dimensions to the input to have 2 inputs to the op!
shape
=
(
1
,
2
,
3
,
4
,
5
,
6
,
7
,
2
,
2
,
3
,
2
,
1
,
2
,
2
,
2
,)
shape
=
(
1
,
2
,
3
,
4
,
5
,
6
,
7
,
2
,
2
,
3
,
2
,
1
,
2
,
2
,
2
,)
ttype
=
tensor
.
tensor
(
dtype
=
'float32'
,
broadcastable
=
(
False
,)
*
len
(
shape
))
ttype
=
tensor
.
tensor
(
dtype
=
'float32'
,
broadcastable
=
(
False
,)
*
len
(
shape
))
vars
=
[
tensor
.
tanh
(
ttype
)
for
x
in
range
(
7
)]
vars
=
[
tensor
.
tanh
(
ttype
)
for
x
in
range
(
7
)]
f
=
pfunc
(
vars
,
[
vars
[
0
]
-
vars
[
1
]
-
vars
[
2
]
-
vars
[
3
]
-
vars
[
4
]
-
f
=
pfunc
(
vars
,
[
vars
[
0
]
-
vars
[
1
]
-
vars
[
2
]
-
vars
[
3
]
-
vars
[
4
]
-
vars
[
5
]
-
vars
[
6
]],
mode
=
mode_with_gpu
)
vars
[
5
]
-
vars
[
6
]],
mode
=
mode_with_gpu
)
...
@@ -598,7 +602,9 @@ def test_huge_elemwise_fusion():
...
@@ -598,7 +602,9 @@ def test_huge_elemwise_fusion():
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuElemwise
)
for
node
in
topo
])
==
0
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuElemwise
)
for
node
in
topo
])
==
0
assert
sum
([
isinstance
(
node
.
op
,
tensor
.
Elemwise
)
for
node
in
topo
])
==
1
assert
sum
([
isinstance
(
node
.
op
,
tensor
.
Elemwise
)
for
node
in
topo
])
==
1
# let debugmode catch errors
# let debugmode catch errors
gen
=
lambda
:
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
def
gen
():
return
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
f
(
gen
(),
gen
(),
gen
(),
gen
(),
gen
(),
gen
(),
gen
())
f
(
gen
(),
gen
(),
gen
(),
gen
(),
gen
(),
gen
(),
gen
())
def
gen
(
shape
):
def
gen
(
shape
):
...
@@ -611,9 +617,9 @@ def test_huge_elemwise_fusion():
...
@@ -611,9 +617,9 @@ def test_huge_elemwise_fusion():
(
2
,
2
,
2
,
2
),
(
2
,
2
,
2
,
2
),
(
2
,
2
,
2
,
2
,
2
),
# 5d
(
2
,
2
,
2
,
2
,
2
),
# 5d
(
2
,
2
,
2
,
2
,
2
,
2
),
(
2
,
2
,
2
,
2
,
2
,
2
),
#
(2, 2, 2, 2, 2, 2, 2),
#
(2, 2, 2, 2, 2, 2, 2),
#
(2, 2, 2, 2, 2, 2, 2, 2),
#
(2, 2, 2, 2, 2, 2, 2, 2),
#
(2, 2, 2, 1, 1, 1, 1, 2, 2), # 9d
#
(2, 2, 2, 1, 1, 1, 1, 2, 2), # 9d
]:
]:
vals
=
[
cuda
.
shared_constructor
(
gen
(
shape
))
for
x
in
range
(
max_var
)]
vals
=
[
cuda
.
shared_constructor
(
gen
(
shape
))
for
x
in
range
(
max_var
)]
for
use_tan
in
[
True
,
False
]:
for
use_tan
in
[
True
,
False
]:
...
@@ -676,7 +682,9 @@ def test_local_gpu_elemwise_0():
...
@@ -676,7 +682,9 @@ def test_local_gpu_elemwise_0():
a
=
tensor
.
fmatrix
()
a
=
tensor
.
fmatrix
()
from
theano.scalar.basic
import
identity
from
theano.scalar.basic
import
identity
out_s
=
theano
.
scalar
.
Composite
([
a_s
,
b_s
,
c_s
],
out_s
=
theano
.
scalar
.
Composite
([
a_s
,
b_s
,
c_s
],
[
identity
(
a_s
),
identity
(
c_s
),
identity
(
b_s
)])
[
identity
(
a_s
),
identity
(
c_s
),
identity
(
b_s
)])
outs_op
=
tensor
.
Elemwise
(
out_s
)
outs_op
=
tensor
.
Elemwise
(
out_s
)
f
=
theano
.
function
([
a
,
b
,
c
],
outs_op
(
a
,
b
,
c
),
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
a
,
b
,
c
],
outs_op
(
a
,
b
,
c
),
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
...
@@ -725,9 +733,6 @@ def test_elemwise_fusion():
...
@@ -725,9 +733,6 @@ def test_elemwise_fusion():
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
import
theano.tests.test_ifelse
class
TestIfElse
(
theano
.
tests
.
test_ifelse
.
test_ifelse
):
class
TestIfElse
(
theano
.
tests
.
test_ifelse
.
test_ifelse
):
dtype
=
"float32"
dtype
=
"float32"
mode
=
mode_with_gpu
mode
=
mode_with_gpu
...
@@ -765,15 +770,17 @@ def test_incsubtensor_mixed():
...
@@ -765,15 +770,17 @@ def test_incsubtensor_mixed():
def
test_erfinvgpu
():
def
test_erfinvgpu
():
""" Test that local_gpu_elemwise_0 replaces Erfinv with ErfinvGPU """
""" Test that local_gpu_elemwise_0 replaces Erfinv with ErfinvGPU """
x
=
tensor
.
fmatrix
()
x
=
tensor
.
fmatrix
()
f
=
theano
.
function
([
x
],
tensor
.
Elemwise
(
erfinv
)(
x
),
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
x
],
f2
=
theano
.
function
([
x
],
tensor
.
Elemwise
(
erfinv
)(
x
),
tensor
.
Elemwise
(
erfinv
)(
x
),
mode
=
mode_with_gpu
)
theano
.
function
([
x
],
tensor
.
Elemwise
(
erfinv
)(
x
),
mode
=
mode_without_gpu
)
mode
=
mode_without_gpu
)
assert
isinstance
(
f
.
maker
.
fgraph
.
toposort
()[
1
]
.
op
,
cuda
.
GpuElemwise
)
assert
isinstance
(
f
.
maker
.
fgraph
.
toposort
()[
1
]
.
op
,
cuda
.
GpuElemwise
)
assert
isinstance
(
f
.
maker
.
fgraph
.
toposort
()[
1
]
.
op
.
scalar_op
,
assert
isinstance
(
f
.
maker
.
fgraph
.
toposort
()[
1
]
.
op
.
scalar_op
,
cuda
.
elemwise
.
ErfinvGPU
)
cuda
.
elemwise
.
ErfinvGPU
)
xv
=
numpy
.
random
.
rand
(
7
,
8
)
.
astype
(
'float32'
)
numpy
.
random
.
rand
(
7
,
8
)
.
astype
(
'float32'
)
if
imported_scipy_special
:
assert
numpy
.
allclose
(
f
(
xv
),
f2
(
xv
))
def
test_local_gpu_solve
():
def
test_local_gpu_solve
():
...
...
theano/sandbox/cuda/tests/test_rng_curand.py
浏览文件 @
b69ad54d
...
@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams
...
@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams
# Skip tests if cuda_ndarray is not available.
# Skip tests if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
# The PyCObject that represents the cuda random stream object
# The PyCObject that represents the cuda random stream object
...
...
theano/sandbox/cuda/tests/test_tensor_op.py
浏览文件 @
b69ad54d
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
This file test tensor op that should also operate on CudaNdaray.
This file test tensor op that should also operate on CudaNdaray.
"""
"""
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
copy
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
numpy
import
numpy
...
@@ -14,7 +13,7 @@ import theano.tensor as T
...
@@ -14,7 +13,7 @@ import theano.tensor as T
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
import
theano.sandbox.cuda
as
cuda
import
theano.sandbox.cuda
as
cuda
from
theano.tensor.nnet.tests
import
test_conv3d2d
from
theano.tensor.nnet.tests
import
test_conv3d2d
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
@@ -57,7 +56,7 @@ def test_softmax_optimizations():
...
@@ -57,7 +56,7 @@ def test_softmax_optimizations():
one_of_n
=
tensor
.
lvector
(
'one_of_n'
)
one_of_n
=
tensor
.
lvector
(
'one_of_n'
)
op
=
crossentropy_categorical_1hot
op
=
crossentropy_categorical_1hot
xe
=
op
(
x
,
one_of_n
)
op
(
x
,
one_of_n
)
fgraph
=
theano
.
gof
.
FunctionGraph
(
fgraph
=
theano
.
gof
.
FunctionGraph
(
[
x
,
one_of_n
],
[
x
,
one_of_n
],
...
@@ -84,10 +83,10 @@ def test_may_share_memory_cuda():
...
@@ -84,10 +83,10 @@ def test_may_share_memory_cuda():
# can't test the transpose as ta._strides = is not implemented
# can't test the transpose as ta._strides = is not implemented
# manual transpose of a
# manual transpose of a
#ta = a.reshape((4,3))
#
ta = a.reshape((4,3))
# ta._strides = (ta._strides[1],ta._strides[0])#not implemented
# ta._strides = (ta._strides[1],ta._strides[0])#not implemented
#elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
#
elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
#ta.gpudata += ta.size*elem_size
#
ta.gpudata += ta.size*elem_size
for
a_
,
b_
,
rep
in
[(
a
,
a
,
True
),
(
b
,
b
,
True
),
(
a
,
b
,
False
),
for
a_
,
b_
,
rep
in
[(
a
,
a
,
True
),
(
b
,
b
,
True
),
(
a
,
b
,
False
),
(
a
,
na
,
False
),
(
b
,
nb
,
False
),
(
a
,
na
,
False
),
(
b
,
nb
,
False
),
...
@@ -95,8 +94,7 @@ def test_may_share_memory_cuda():
...
@@ -95,8 +94,7 @@ def test_may_share_memory_cuda():
(
a
,
va
,
True
),
(
b
,
vb
,
True
),
(
a
,
va
,
True
),
(
b
,
vb
,
True
),
(
va
,
b
,
False
),
(
a
,
vb
,
False
),
(
va
,
b
,
False
),
(
a
,
vb
,
False
),
(
a
,
ra
,
True
),
(
b
,
rb
,
True
),
(
a
,
ra
,
True
),
(
b
,
rb
,
True
),
(
ra
,
b
,
False
),
(
a
,
rb
,
False
),
(
ra
,
b
,
False
),
(
a
,
rb
,
False
),
]:
]:
assert
may_share_memory
(
a_
,
b_
)
==
rep
assert
may_share_memory
(
a_
,
b_
)
==
rep
assert
may_share_memory
(
b_
,
a_
)
==
rep
assert
may_share_memory
(
b_
,
a_
)
==
rep
...
...
theano/sandbox/cuda/tests/test_var.py
浏览文件 @
b69ad54d
...
@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc
...
@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc
from
theano.sandbox.cuda
import
CudaNdarrayType
,
cuda_available
from
theano.sandbox.cuda
import
CudaNdarrayType
,
cuda_available
import
theano.sandbox.cuda
as
cuda
import
theano.sandbox.cuda
as
cuda
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
if
cuda_available
==
False
:
if
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
@@ -26,19 +26,18 @@ def test_float32_shared_constructor():
...
@@ -26,19 +26,18 @@ def test_float32_shared_constructor():
# test that broadcastable arg is accepted, and that they
# test that broadcastable arg is accepted, and that they
# don't strictly have to be tuples
# don't strictly have to be tuples
assert
eq
(
assert
eq
(
f32sc
(
npy_row
,
f32sc
(
npy_row
,
broadcastable
=
(
True
,
False
))
.
type
,
broadcastable
=
(
True
,
False
))
.
type
,
CudaNdarrayType
((
True
,
False
)))
CudaNdarrayType
((
True
,
False
)))
assert
eq
(
assert
eq
(
f32sc
(
npy_row
,
f32sc
(
npy_row
,
broadcastable
=
[
True
,
False
])
.
type
,
broadcastable
=
[
True
,
False
])
.
type
,
CudaNdarrayType
((
True
,
False
)))
CudaNdarrayType
((
True
,
False
)))
assert
eq
(
assert
eq
(
f32sc
(
npy_row
,
f32sc
(
npy_row
,
broadcastable
=
numpy
.
array
([
True
,
False
]))
.
type
,
broadcastable
=
numpy
.
array
([
True
,
False
]))
.
type
,
CudaNdarrayType
([
True
,
False
]))
CudaNdarrayType
([
True
,
False
]))
# test that we can make non-matrix shared vars
# test that we can make non-matrix shared vars
assert
eq
(
assert
eq
(
f32sc
(
numpy
.
zeros
((
2
,
3
,
4
,
5
),
dtype
=
'float32'
))
.
type
,
f32sc
(
numpy
.
zeros
((
2
,
3
,
4
,
5
),
dtype
=
'float32'
))
.
type
,
CudaNdarrayType
((
False
,)
*
4
))
CudaNdarrayType
((
False
,)
*
4
))
...
@@ -77,7 +76,8 @@ class T_updates(unittest.TestCase):
...
@@ -77,7 +76,8 @@ class T_updates(unittest.TestCase):
x
=
tensor
.
fmatrix
(
'x'
)
x
=
tensor
.
fmatrix
(
'x'
)
output_updates
=
[(
output_var
,
x
**
2
)]
output_updates
=
[(
output_var
,
x
**
2
)]
output_givens
=
{
x
:
data
}
output_givens
=
{
x
:
data
}
output_func
=
theano
.
function
(
inputs
=
[],
outputs
=
[],
output_func
=
theano
.
function
(
inputs
=
[],
outputs
=
[],
updates
=
output_updates
,
givens
=
output_givens
)
updates
=
output_updates
,
givens
=
output_givens
)
output_func
()
output_func
()
...
...
theano/sandbox/cuda/tests/test_viewop.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
import
numpy
import
unittest
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano
import
theano
...
@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
...
@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def
test_viewop_gpu
():
def
test_viewop_gpu
():
from
theano.sandbox
import
cuda
from
theano.sandbox
import
cuda
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
_x
=
theano
.
tensor
.
fvector
(
'x'
)
_x
=
theano
.
tensor
.
fvector
(
'x'
)
x
=
cuda
.
gpu_from_host
(
_x
)
x
=
cuda
.
gpu_from_host
(
_x
)
...
...
theano/sandbox/cuda/tests/walltime.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
print_function
from
__future__
import
print_function
import
sys
,
time
import
sys
import
time
from
six
import
iteritems
from
six
import
iteritems
from
theano.compile.pfunc
import
pfunc
from
theano.compile.pfunc
import
pfunc
from
theano
import
tensor
from
theano
import
tensor
...
@@ -35,35 +36,47 @@ def showtimes(times):
...
@@ -35,35 +36,47 @@ def showtimes(times):
def
cmp_sigmoids
(
shape
):
def
cmp_sigmoids
(
shape
):
def
numpy_sigmoid
(
input
):
def
numpy_sigmoid
(
input
):
rval
=
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
))
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
))
sinput
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
sinput
=
tensor
.
Tensor
(
shared_input
=
tcn
.
shared_constructor
(
numpy
.
random
.
rand
(
*
shape
),
'shared_input'
)
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
times
=
compare_fns
(
shared_input
=
tcn
.
shared_constructor
(
dict
(
numpy
=
numpy_sigmoid
numpy
.
random
.
rand
(
*
shape
),
,
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
)))
'shared_input'
)
,
theano_gpu_onboard
=
pfunc
([
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
tensor
.
exp
(
-
shared_input
)))])
times
=
compare_fns
(
dict
(
),
numpy
=
numpy_sigmoid
,
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
))),
theano_gpu_onboard
=
pfunc
(
[
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
tensor
.
exp
(
-
shared_input
)))])),
input
=
shared_input
.
value
)
input
=
shared_input
.
value
)
showtimes
(
times
)
showtimes
(
times
)
def
cmp_sigmoids_T
(
shape
):
def
cmp_sigmoids_T
(
shape
):
def
numpy_sigmoid
(
input
):
def
numpy_sigmoid
(
input
):
rval
=
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
.
T
))
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
.
T
))
sinput
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
sinput
=
tensor
.
Tensor
(
shared_input
=
tcn
.
shared_constructor
(
numpy
.
random
.
rand
(
*
shape
),
'shared_input'
)
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
times
=
compare_fns
(
shared_input
=
tcn
.
shared_constructor
(
dict
(
numpy
=
numpy_sigmoid
numpy
.
random
.
rand
(
*
shape
),
,
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
.
T
)))
'shared_input'
)
,
theano_gpu_onboard
=
pfunc
([
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
times
=
compare_fns
(
dict
(
tensor
.
exp
(
-
shared_input
.
T
)))])
numpy
=
numpy_sigmoid
,
),
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
.
T
))),
theano_gpu_onboard
=
pfunc
(
[
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
tensor
.
exp
(
-
shared_input
.
T
)))])),
input
=
shared_input
.
value
)
input
=
shared_input
.
value
)
showtimes
(
times
)
showtimes
(
times
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
eval
(
sys
.
argv
[
1
])
eval
(
sys
.
argv
[
1
])
# cmp_sigmoids((640, 64*64)) # looks great in profiler
# cmp_sigmoids((640, 64*64)) # looks great in profiler
#cmp_sigmoids((173, 74*49))
# cmp_sigmoids((173, 74*49))
#cmp_sigmoids_T((173, 74*49))
# cmp_sigmoids_T((173, 74*49))
theano/sandbox/cuda/type.py
浏览文件 @
b69ad54d
...
@@ -259,8 +259,8 @@ class CudaNdarrayType(Type):
...
@@ -259,8 +259,8 @@ class CudaNdarrayType(Type):
'complex64'
:
(
complex
,
'theano_complex64'
,
'complex64'
:
(
complex
,
'theano_complex64'
,
'NPY_COMPLEX64'
)}[
self
.
dtype
]
'NPY_COMPLEX64'
)}[
self
.
dtype
]
except
KeyError
:
except
KeyError
:
raise
TypeError
(
"Unsupported dtype for
%
s:
%
s"
%
(
raise
TypeError
(
"Unsupported dtype for
%
s:
%
s"
%
self
.
__class__
.
__name__
,
self
.
dtype
))
(
self
.
__class__
.
__name__
,
self
.
dtype
))
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
"""
"""
...
@@ -271,9 +271,10 @@ class CudaNdarrayType(Type):
...
@@ -271,9 +271,10 @@ class CudaNdarrayType(Type):
other
.
broadcastable
==
self
.
broadcastable
)
other
.
broadcastable
==
self
.
broadcastable
)
def
convert_variable
(
self
,
var
):
def
convert_variable
(
self
,
var
):
if
(
type
(
self
)
==
type
(
var
.
type
)
and
if
(
isinstance
(
self
,
type
(
var
.
type
)
)
and
self
.
ndim
==
var
.
type
.
ndim
and
self
.
ndim
==
var
.
type
.
ndim
and
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
var
.
type
.
broadcastable
))):
var
.
type
.
broadcastable
))):
return
theano
.
tensor
.
patternbroadcast
(
var
,
self
.
broadcastable
)
return
theano
.
tensor
.
patternbroadcast
(
var
,
self
.
broadcastable
)
...
@@ -312,7 +313,7 @@ class CudaNdarrayType(Type):
...
@@ -312,7 +313,7 @@ class CudaNdarrayType(Type):
return
self
.
name
return
self
.
name
else
:
else
:
b
=
self
.
broadcastable
b
=
self
.
broadcastable
#bcast = str(self.broadcastable)
#
bcast = str(self.broadcastable)
if
not
numpy
.
any
(
b
):
if
not
numpy
.
any
(
b
):
s
=
"
%
iD"
%
len
(
b
)
s
=
"
%
iD"
%
len
(
b
)
else
:
else
:
...
@@ -327,7 +328,7 @@ class CudaNdarrayType(Type):
...
@@ -327,7 +328,7 @@ class CudaNdarrayType(Type):
def
__repr__
(
self
):
def
__repr__
(
self
):
return
str
(
self
)
return
str
(
self
)
#"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
#
"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
def
c_declare
(
self
,
name
,
sub
,
check_input
=
True
):
def
c_declare
(
self
,
name
,
sub
,
check_input
=
True
):
return
""" CudaNdarray *
%(name)
s;"""
%
locals
()
return
""" CudaNdarray *
%(name)
s;"""
%
locals
()
...
@@ -563,8 +564,7 @@ theano.compile.register_deep_copy_op_c_code(
...
@@ -563,8 +564,7 @@ theano.compile.register_deep_copy_op_c_code(
CudaNdarray_HOST_DIMS(
%(oname)
s)[i]) {
CudaNdarray_HOST_DIMS(
%(oname)
s)[i]) {
alloc = true;
alloc = true;
break;
break;
}
}}
}
if(alloc) {
if(alloc) {
Py_XDECREF(
%(oname)
s);
Py_XDECREF(
%(oname)
s);
%(oname)
s = (CudaNdarray*)CudaNdarray_Copy(
%(iname)
s);
%(oname)
s = (CudaNdarray*)CudaNdarray_Copy(
%(iname)
s);
...
@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code(
...
@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code(
%(fail)
s;
%(fail)
s;
}
}
}
}
"""
,
"""
,
version
=
3
)
version
=
3
)
# THIS WORKS But CudaNdarray instances don't compare equal to one
# THIS WORKS But CudaNdarray instances don't compare equal to one
...
@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda):
...
@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda):
# In case cuda is not imported.
# In case cuda is not imported.
if
cuda
is
not
None
:
if
cuda
is
not
None
:
copyreg
.
pickle
(
cuda
.
CudaNdarray
,
CudaNdarray_pickler
,
copyreg
.
pickle
(
CudaNdarray_unpickler
)
cuda
.
CudaNdarray
,
CudaNdarray_pickler
,
CudaNdarray_unpickler
)
theano/sandbox/cuda/var.py
浏览文件 @
b69ad54d
...
@@ -13,7 +13,7 @@ try:
...
@@ -13,7 +13,7 @@ try:
# We must do those import to be able to create the full doc when nvcc
# We must do those import to be able to create the full doc when nvcc
# is not available
# is not available
from
theano.sandbox.cuda
import
filter
as
type_support_filter
from
theano.sandbox.cuda
import
filter
as
type_support_filter
from
theano.sandbox.cuda.basic_ops
import
HostFromGpu
,
GpuFromHost
from
theano.sandbox.cuda.basic_ops
import
HostFromGpu
except
ImportError
:
except
ImportError
:
pass
pass
...
@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators):
...
@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators):
def
_as_TensorVariable
(
self
):
def
_as_TensorVariable
(
self
):
return
HostFromGpu
()(
self
)
return
HostFromGpu
()(
self
)
def
_as_CudaNdarrayVariable
(
self
):
def
_as_CudaNdarrayVariable
(
self
):
return
self
return
self
...
@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature):
...
@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature):
class
CudaNdarrayConstant
(
_operators
,
Constant
):
class
CudaNdarrayConstant
(
_operators
,
Constant
):
def
signature
(
self
):
def
signature
(
self
):
return
CudaNdarrayConstantSignature
((
self
.
type
,
numpy
.
asarray
(
self
.
data
)))
return
CudaNdarrayConstantSignature
((
self
.
type
,
numpy
.
asarray
(
self
.
data
)))
def
__str__
(
self
):
def
__str__
(
self
):
if
self
.
name
is
not
None
:
if
self
.
name
is
not
None
:
return
self
.
name
return
self
.
name
...
@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant):
...
@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant):
data
=
str
(
numpy
.
asarray
(
self
.
data
))
data
=
str
(
numpy
.
asarray
(
self
.
data
))
except
Exception
as
e
:
except
Exception
as
e
:
data
=
"error while transferring the value: "
+
str
(
e
)
data
=
"error while transferring the value: "
+
str
(
e
)
return
"CudaNdarrayConstant{"
+
data
+
"}"
return
"CudaNdarrayConstant{"
+
data
+
"}"
CudaNdarrayType
.
Constant
=
CudaNdarrayConstant
CudaNdarrayType
.
Constant
=
CudaNdarrayConstant
...
...
theano/tests/test_flake8.py
浏览文件 @
b69ad54d
...
@@ -87,42 +87,8 @@ whitelist_flake8 = [
...
@@ -87,42 +87,8 @@ whitelist_flake8 = [
"sandbox/tests/test_theano_object.py"
,
"sandbox/tests/test_theano_object.py"
,
"sandbox/tests/test_scan.py"
,
"sandbox/tests/test_scan.py"
,
"sandbox/tests/__init__.py"
,
"sandbox/tests/__init__.py"
,
"sandbox/cuda/var.py"
,
"sandbox/cuda/GpuConvGrad3D.py"
,
"sandbox/cuda/basic_ops.py"
,
"sandbox/cuda/nnet.py"
,
"sandbox/cuda/elemwise.py"
,
"sandbox/cuda/type.py"
,
"sandbox/cuda/__init__.py"
,
"sandbox/cuda/__init__.py"
,
"sandbox/cuda/opt.py"
,
"sandbox/cuda/blas.py"
,
"sandbox/cuda/blocksparse.py"
,
"sandbox/cuda/rng_curand.py"
,
"sandbox/cuda/fftconv.py"
,
"sandbox/cuda/kernel_codegen.py"
,
"sandbox/cuda/GpuConvTransp3D.py"
,
"sandbox/cuda/nvcc_compiler.py"
,
"sandbox/cuda/neighbours.py"
,
"sandbox/cuda/tests/__init__.py"
,
"sandbox/cuda/tests/__init__.py"
,
"sandbox/cuda/tests/walltime.py"
,
"sandbox/cuda/tests/test_gradient.py"
,
"sandbox/cuda/tests/test_neighbours.py"
,
"sandbox/cuda/tests/test_conv_cuda_ndarray.py"
,
"sandbox/cuda/tests/test_var.py"
,
"sandbox/cuda/tests/test_opt.py"
,
"sandbox/cuda/tests/test_blas.py"
,
"sandbox/cuda/tests/test_driver.py"
,
"sandbox/cuda/tests/test_rng_curand.py"
,
"sandbox/cuda/tests/test_basic_ops.py"
,
"sandbox/cuda/tests/test_memory.py"
,
"sandbox/cuda/tests/test_mlp.py"
,
"sandbox/cuda/tests/test_bench_loopfusion.py"
,
"sandbox/cuda/tests/test_blocksparse.py"
,
"sandbox/cuda/tests/test_cuda_ndarray.py"
,
"sandbox/cuda/tests/test_tensor_op.py"
,
"sandbox/cuda/tests/test_extra_ops.py"
,
"sandbox/cuda/tests/test_gemmcorr3d.py"
,
"sandbox/cuda/tests/test_viewop.py"
,
"sandbox/gpuarray/tests/__init__.py"
,
"sandbox/gpuarray/tests/__init__.py"
,
"sandbox/scan_module/scan_utils.py"
,
"sandbox/scan_module/scan_utils.py"
,
"sandbox/scan_module/scan.py"
,
"sandbox/scan_module/scan.py"
,
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论