Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
b69ad54d
提交
b69ad54d
authored
5月 05, 2016
作者:
Xavier Bouthillier
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #4244 from ChihebTrabelsi/ccw2.0
flake8 sandbox/cuda/*.py
上级
200babca
58267dc2
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
33 个修改的文件
包含
295 行增加
和
266 行删除
+295
-266
GpuConvGrad3D.py
theano/sandbox/cuda/GpuConvGrad3D.py
+8
-9
GpuConvTransp3D.py
theano/sandbox/cuda/GpuConvTransp3D.py
+40
-17
basic_ops.py
theano/sandbox/cuda/basic_ops.py
+0
-0
blas.py
theano/sandbox/cuda/blas.py
+0
-0
elemwise.py
theano/sandbox/cuda/elemwise.py
+0
-0
fftconv.py
theano/sandbox/cuda/fftconv.py
+14
-8
kernel_codegen.py
theano/sandbox/cuda/kernel_codegen.py
+22
-26
neighbours.py
theano/sandbox/cuda/neighbours.py
+1
-1
nnet.py
theano/sandbox/cuda/nnet.py
+56
-59
nvcc_compiler.py
theano/sandbox/cuda/nvcc_compiler.py
+9
-7
opt.py
theano/sandbox/cuda/opt.py
+0
-0
rng_curand.py
theano/sandbox/cuda/rng_curand.py
+26
-29
test_basic_ops.py
theano/sandbox/cuda/tests/test_basic_ops.py
+0
-0
test_bench_loopfusion.py
theano/sandbox/cuda/tests/test_bench_loopfusion.py
+0
-0
test_blas.py
theano/sandbox/cuda/tests/test_blas.py
+0
-0
test_conv_cuda_ndarray.py
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+0
-0
test_cuda_ndarray.py
theano/sandbox/cuda/tests/test_cuda_ndarray.py
+0
-0
test_driver.py
theano/sandbox/cuda/tests/test_driver.py
+4
-3
test_extra_ops.py
theano/sandbox/cuda/tests/test_extra_ops.py
+24
-23
test_gemmcorr3d.py
theano/sandbox/cuda/tests/test_gemmcorr3d.py
+6
-5
test_gradient.py
theano/sandbox/cuda/tests/test_gradient.py
+1
-1
test_memory.py
theano/sandbox/cuda/tests/test_memory.py
+16
-10
test_mlp.py
theano/sandbox/cuda/tests/test_mlp.py
+64
-64
test_neighbours.py
theano/sandbox/cuda/tests/test_neighbours.py
+4
-4
test_opt.py
theano/sandbox/cuda/tests/test_opt.py
+0
-0
test_rng_curand.py
theano/sandbox/cuda/tests/test_rng_curand.py
+0
-0
test_tensor_op.py
theano/sandbox/cuda/tests/test_tensor_op.py
+0
-0
test_var.py
theano/sandbox/cuda/tests/test_var.py
+0
-0
test_viewop.py
theano/sandbox/cuda/tests/test_viewop.py
+0
-0
walltime.py
theano/sandbox/cuda/tests/walltime.py
+0
-0
type.py
theano/sandbox/cuda/type.py
+0
-0
var.py
theano/sandbox/cuda/var.py
+0
-0
test_flake8.py
theano/tests/test_flake8.py
+0
-0
没有找到文件。
theano/sandbox/cuda/GpuConvGrad3D.py
浏览文件 @
b69ad54d
...
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
...
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
d_
=
T
.
as_tensor_variable
(
d
)
d_
=
T
.
as_tensor_variable
(
d
)
WShape_
=
T
.
as_tensor_variable
(
WShape
)
WShape_
=
T
.
as_tensor_variable
(
WShape
)
dCdH_
=
as_cuda_ndarray_variable
(
dCdH
)
dCdH_
=
as_cuda_ndarray_variable
(
dCdH
)
broad
=
(
False
,)
*
5
broad
=
(
False
,)
*
5
return
theano
.
Apply
(
self
,
inputs
=
[
V_
,
d_
,
WShape_
,
dCdH_
],
return
theano
.
Apply
(
self
,
inputs
=
[
V_
,
d_
,
WShape_
,
dCdH_
],
outputs
=
[
CudaNdarrayType
(
dtype
=
V_
.
dtype
,
outputs
=
[
CudaNdarrayType
(
dtype
=
V_
.
dtype
,
broadcastable
=
broad
)()])
broadcastable
=
broad
)()])
...
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
...
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize
=
dCdH
.
shape
[
0
]
batchSize
=
dCdH
.
shape
[
0
]
outputFilters
=
dCdH
.
shape
[
1
]
outputHeight
=
dCdH
.
shape
[
2
]
outputHeight
=
dCdH
.
shape
[
2
]
outputWidth
=
dCdH
.
shape
[
3
]
outputWidth
=
dCdH
.
shape
[
3
]
outputDur
=
dCdH
.
shape
[
4
]
outputDur
=
dCdH
.
shape
[
4
]
assert
V
.
shape
[
0
]
==
batchSize
assert
V
.
shape
[
0
]
==
batchSize
inputFilters
=
V
.
shape
[
1
]
inputHeight
=
V
.
shape
[
2
]
inputWidth
=
V
.
shape
[
3
]
inputDur
=
V
.
shape
[
4
]
dr
,
dc
,
dt
=
d
dr
,
dc
,
dt
=
d
dCdW
=
numpy
.
zeros
(
WShape
,
dtype
=
V
.
dtype
)
dCdW
=
numpy
.
zeros
(
WShape
,
dtype
=
V
.
dtype
)
...
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
...
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
for
p
in
xrange
(
0
,
outputHeight
):
for
p
in
xrange
(
0
,
outputHeight
):
for
q
in
xrange
(
0
,
outputWidth
):
for
q
in
xrange
(
0
,
outputWidth
):
for
r
in
xrange
(
0
,
outputDur
):
for
r
in
xrange
(
0
,
outputDur
):
dCdW
[
j
,
z
,
k
,
l
,
m
]
+=
dCdH
[
i
,
j
,
p
,
q
,
r
]
*
V
[
i
,
z
,
dr
*
p
+
k
,
dc
*
q
+
l
,
dt
*
r
+
m
]
dCdW
[
j
,
z
,
k
,
l
,
m
]
+=
dCdH
[
i
,
j
,
p
,
q
,
r
]
*
\
V
[
i
,
z
,
dr
*
p
+
k
,
dc
*
q
+
l
,
dt
*
r
+
m
]
output_storage
[
0
][
0
]
=
dCdW
output_storage
[
0
][
0
]
=
dCdW
...
@@ -86,7 +85,7 @@ class GpuConvGrad3D(GpuOp):
...
@@ -86,7 +85,7 @@ class GpuConvGrad3D(GpuOp):
dCdW
=
outputs
[
0
]
dCdW
=
outputs
[
0
]
codeSource
=
"""
codeSource
=
"""
///////////// < code generated by GpuConvGrad3D >
///////////// < code generated by GpuConvGrad3D >
//printf("
\t\t\t\t
GpuConvGrad3DW c code
\\
n");
//printf("
\t\t\t\t
GpuConvGrad3DW c code
\\
n");
...
@@ -285,7 +284,7 @@ if(!work_complete){
...
@@ -285,7 +284,7 @@ if(!work_complete){
# This code is not sensitive to the ignore_border flag.
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
# input pixels that were downsampled to that z-position.
codeSource
=
"""
codeSource
=
"""
__global__ void
__global__ void
//thread block size = WShape[4]
//thread block size = WShape[4]
//grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])
//grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])
...
...
theano/sandbox/cuda/GpuConvTransp3D.py
浏览文件 @
b69ad54d
...
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
...
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
else
:
else
:
RShape_
=
T
.
as_tensor_variable
([
-
1
,
-
1
,
-
1
])
RShape_
=
T
.
as_tensor_variable
([
-
1
,
-
1
,
-
1
])
return
theano
.
Apply
(
self
,
inputs
=
[
W_
,
b_
,
d_
,
H_
,
RShape_
],
return
theano
.
Apply
(
outputs
=
[
CudaNdarrayType
(
dtype
=
H_
.
dtype
,
self
,
inputs
=
[
W_
,
b_
,
d_
,
H_
,
RShape_
],
broadcastable
=
(
False
,)
*
5
)()])
outputs
=
[
CudaNdarrayType
(
dtype
=
H_
.
dtype
,
broadcastable
=
(
False
,)
*
5
)()])
def
infer_shape
(
self
,
node
,
input_shapes
):
def
infer_shape
(
self
,
node
,
input_shapes
):
W
,
b
,
d
,
H
,
RShape
=
node
.
inputs
W
,
b
,
d
,
H
,
RShape
=
node
.
inputs
...
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
...
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
assert
dc
>
0
assert
dc
>
0
assert
dt
>
0
assert
dt
>
0
videoHeight
=
(
outputHeight
-
1
)
*
dr
+
filterHeight
videoHeight
=
(
outputHeight
-
1
)
*
dr
+
filterHeight
videoWidth
=
(
outputWidth
-
1
)
*
dc
+
filterWidth
videoWidth
=
(
outputWidth
-
1
)
*
dc
+
filterWidth
videoDur
=
(
outputDur
-
1
)
*
dt
+
filterDur
videoDur
=
(
outputDur
-
1
)
*
dt
+
filterDur
if
Rshape
is
not
None
and
Rshape
[
0
]
!=
-
1
:
if
Rshape
is
not
None
and
Rshape
[
0
]
!=
-
1
:
if
Rshape
[
0
]
<
videoHeight
:
if
Rshape
[
0
]
<
videoHeight
:
...
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
...
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
# else:
# else:
# print "No Rshape passed in"
# print "No Rshape passed in"
# print "video size: "
+
str((videoHeight, videoWidth, videoDur))
# print "video size: "
+
str((videoHeight, videoWidth, videoDur))
R
=
numpy
.
zeros
(
(
batchSize
,
inputChannels
,
videoHeight
,
R
=
numpy
.
zeros
((
batchSize
,
inputChannels
,
videoHeight
,
videoWidth
,
videoDur
)
,
dtype
=
H
.
dtype
)
videoWidth
,
videoDur
),
dtype
=
H
.
dtype
)
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
# sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for
i
in
xrange
(
0
,
batchSize
):
for
i
in
xrange
(
0
,
batchSize
):
# print '\texample '+str(i+1)+'/'+str(batchSize)
# print '\texample '+str(i+1)+'/'+str(batchSize)
for
j
in
xrange
(
0
,
inputChannels
):
for
j
in
xrange
(
0
,
inputChannels
):
# print '\t\tfeature map '
+str(j+1)+'/'+
str(inputChannels)
# print '\t\tfeature map '
+ str(j+1) + '/' +
str(inputChannels)
for
r
in
xrange
(
0
,
videoHeight
):
for
r
in
xrange
(
0
,
videoHeight
):
# print '\t\t\trow '
+str(r+1)+
'/'+str(videoHeight)
# print '\t\t\trow '
+ str(r+1) +
'/'+str(videoHeight)
for
c
in
xrange
(
0
,
videoWidth
):
for
c
in
xrange
(
0
,
videoWidth
):
for
t
in
xrange
(
0
,
videoDur
):
for
t
in
xrange
(
0
,
videoDur
):
R
[
i
,
j
,
r
,
c
,
t
]
=
b
[
j
]
R
[
i
,
j
,
r
,
c
,
t
]
=
b
[
j
]
ftc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
t
-
filterDur
+
1
)
/
float
(
dt
)))
])
ftc
=
max
(
fcc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
c
-
filterWidth
+
1
)
/
float
(
dc
)))
])
[
0
,
int
(
numpy
.
ceil
(
rc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
r
-
filterHeight
+
1
)
/
float
(
dr
)))
])
float
(
t
-
filterDur
+
1
)
/
float
(
dt
)
))
]
)
fcc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
c
-
filterWidth
+
1
)
/
float
(
dc
)
))
]
)
rc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
r
-
filterHeight
+
1
)
/
float
(
dr
)
))
]
)
while
rc
<
outputHeight
:
while
rc
<
outputHeight
:
rk
=
r
-
rc
*
dr
rk
=
r
-
rc
*
dr
if
rk
<
0
:
if
rk
<
0
:
...
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
...
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
if
tk
<
0
:
if
tk
<
0
:
break
break
R
[
i
,
j
,
r
,
c
,
t
]
+=
numpy
.
dot
(
W
[:,
j
,
rk
,
ck
,
tk
],
H
[
i
,
:,
rc
,
cc
,
tc
]
)
R
[
i
,
j
,
r
,
c
,
t
]
+=
numpy
.
dot
(
W
[:,
j
,
rk
,
ck
,
tk
],
H
[
i
,
:,
rc
,
cc
,
tc
])
tc
+=
1
tc
+=
1
""
# close loop over tc
""
# close loop over tc
...
...
theano/sandbox/cuda/basic_ops.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/blas.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/elemwise.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/fftconv.py
浏览文件 @
b69ad54d
...
@@ -5,9 +5,9 @@ import numpy as np
...
@@ -5,9 +5,9 @@ import numpy as np
import
theano
import
theano
import
theano.tensor
as
T
import
theano.tensor
as
T
from
theano.misc.pycuda_init
import
pycuda_available
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.ifelse
import
ifelse
from
theano.ifelse
import
ifelse
from
theano.misc.pycuda_init
import
pycuda_available
if
cuda_available
:
if
cuda_available
:
from
theano.sandbox.cuda
import
(
basic_ops
,
CudaNdarrayType
,
from
theano.sandbox.cuda
import
(
basic_ops
,
CudaNdarrayType
,
...
@@ -448,7 +448,7 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
...
@@ -448,7 +448,7 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
o1
=
i1
+
1
o1
=
i1
+
1
input_padded
=
T
.
zeros
((
b
,
ic
,
o0
,
o1
),
dtype
=
'float32'
)
input_padded
=
T
.
zeros
((
b
,
ic
,
o0
,
o1
),
dtype
=
'float32'
)
input_padded
=
T
.
set_subtensor
(
input_padded
[:,
:,
:
i0
,
:
i1
],
input_padded
=
T
.
set_subtensor
(
input_padded
[:,
:,
:
i0
,
:
i1
],
input
)
input
)
else
:
else
:
o1
=
i1
o1
=
i1
input_padded
=
input
input_padded
=
input
...
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
...
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# special way because we specify explicitly here
# how much values are expected.
# how much values are expected.
if
border_mode
==
'valid'
:
if
border_mode
==
'valid'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
)]
elif
border_mode
==
'full'
:
elif
border_mode
==
'full'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
)]
else
:
else
:
raise
ValueError
(
'invalid mode'
)
raise
ValueError
(
'invalid mode'
)
...
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
...
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
output_fft_s
=
mult_and_reduce
(
input_fft_v
,
filters_fft_v
,
output_fft_s
=
mult_and_reduce
(
input_fft_v
,
filters_fft_v
,
input_shape
=
input_fft_v_shape
,
input_shape
=
input_fft_v_shape
,
filter_shape
=
filters_fft_v_shape
)
filter_shape
=
filters_fft_v_shape
)
#output_fft_s = input_fft_v
#
output_fft_s = input_fft_v
# reshape for IFFT
# reshape for IFFT
output_fft_flat
=
output_fft_s
.
reshape
((
b
*
oc
,
o0
,
o1
,
o2
//
2
+
1
,
2
))
output_fft_flat
=
output_fft_s
.
reshape
((
b
*
oc
,
o0
,
o1
,
o2
//
2
+
1
,
2
))
...
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
...
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# special way because we specify explicitly here
# how much values are expected.
# how much values are expected.
if
border_mode
==
'valid'
:
if
border_mode
==
'valid'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
),
(
f2
-
1
):(
f2
-
1
+
i2
-
f2
+
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
),
(
f2
-
1
):(
f2
-
1
+
i2
-
f2
+
1
)]
elif
border_mode
==
'full'
:
elif
border_mode
==
'full'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
),
(
f2
-
1
):(
f2
-
1
+
i2
+
f2
-
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
),
(
f2
-
1
):(
f2
-
1
+
i2
+
f2
-
1
)]
else
:
else
:
raise
ValueError
(
'invalid mode'
)
raise
ValueError
(
'invalid mode'
)
#output = output_circ[:, :, :, :, :]
#
output = output_circ[:, :, :, :, :]
# Rescale manually. This is just a factor that comes in during the
# Rescale manually. This is just a factor that comes in during the
# trip through FFT and inverse FFT.
# trip through FFT and inverse FFT.
...
...
theano/sandbox/cuda/kernel_codegen.py
浏览文件 @
b69ad54d
...
@@ -76,7 +76,7 @@ def inline_reduce(N, buf, pos, count, manner_fn):
...
@@ -76,7 +76,7 @@ def inline_reduce(N, buf, pos, count, manner_fn):
rest of the buffer is trashed by this function.
rest of the buffer is trashed by this function.
Notes
Notes
-----
-----
buf should be in gpu shared memory, we access it many times.
buf should be in gpu shared memory, we access it many times.
"""
"""
...
@@ -167,29 +167,26 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
...
@@ -167,29 +167,26 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
We use __i as an int variable in a loop.
We use __i as an int variable in a loop.
"""
"""
return
[
return
[
# get max of buf (trashing all but buf[0])
# get max of buf (trashing all but buf[0])
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'__syncthreads()'
,
'float row_max = '
+
buf
+
'[0]'
,
'float row_max = '
+
buf
+
'[0]'
,
'__syncthreads()'
,
'__syncthreads()'
,
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
threadCount
+
'){'
,
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
'}'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
'__syncthreads()'
,
'}'
,
inline_reduce_sum
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'__syncthreads()'
,
inline_reduce_sum
(
N
,
buf
,
threadPos
,
threadCount
),
'float row_sum = '
+
buf
+
'[0]'
,
'__syncthreads()'
,
'__syncthreads()'
,
'float row_sum = '
+
buf
+
'[0]'
,
# divide each exp() result by the sum to complete the job.
'__syncthreads()'
,
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
# divide each exp() result by the sum to complete the job.
'; __i+='
+
threadCount
+
'){'
,
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
'}'
,
'; __i+='
+
threadCount
+
'){'
,
'__syncthreads()'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
]
'}'
,
'__syncthreads()'
,
]
@code_version
((
1
,))
@code_version
((
1
,))
...
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
...
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
init
=
manner_init
(
"
%(x)
s[
%(pos)
s *
%(stride_x)
s]"
%
locals
())
init
=
manner_init
(
"
%(x)
s[
%(pos)
s *
%(stride_x)
s]"
%
locals
())
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(x)
s[i *
%(stride_x)
s]"
%
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(x)
s[i *
%(stride_x)
s]"
%
locals
()))
locals
()))
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
"
%
s[i]"
%
buf
)
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
...
...
theano/sandbox/cuda/neighbours.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
# This is work in progress
# This is work in progress
from
theano
import
Op
,
Apply
,
tensor
from
theano
import
Apply
,
tensor
from
theano.gof
import
local_optimizer
from
theano.gof
import
local_optimizer
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
...
...
theano/sandbox/cuda/nnet.py
浏览文件 @
b69ad54d
...
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
...
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
"""
%
locals
()
"""
%
locals
()
def
c_support_code_apply
(
self
,
node
,
nodename
):
def
c_support_code_apply
(
self
,
node
,
nodename
):
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
ret1
=
nvcc_kernel
(
params
=
[
'int M'
,
'int N'
,
"kSoftmax_
%
s"
%
nodename
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
params
=
[
'int M'
,
'int N'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
'const float * x'
,
body
=
[
'const int sx0'
,
"extern __shared__ float buf[]"
,
'const int sx1'
,
"float * buf2 = buf + N"
,
'float * sm'
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
'const int sm_s0'
,
" blockIDX += gridDim.x){"
,
'const int sm_s1'
],
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
body
=
[
"extern __shared__ float buf[]"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"float * buf2 = buf + N"
,
"buf2[tx] = buf[tx]"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
"}"
,
" blockIDX += gridDim.x){"
,
"__syncthreads()"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
'threadIdx.x'
,
'blockDim.x'
),
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
inline_softmax
(
'N'
,
# This set all value correctly
'buf'
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
'buf2'
,
"}"
,
'threadIdx.x'
,
"__syncthreads()"
,
'blockDim.x'
),
"}"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
])
# This set all value correctly
ret2
=
nvcc_kernel
(
"kSoftmax_fixed_shared
%
s"
%
nodename
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
params
=
[
'int M'
,
'int N'
,
"__syncthreads()"
,
"}"
,
])
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
ret2
=
nvcc_kernel
(
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
"kSoftmax_fixed_shared
%
s"
%
nodename
,
body
=
[
params
=
[
'int M'
,
'int N'
,
"extern __shared__ float buf[]"
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
" blockIDX += gridDim.x){"
,
body
=
[
"extern __shared__ float buf[]"
,
"const float *x_ptr = &x[blockIDX * sx0]"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
" blockIDX += gridDim.x){"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
"const float *x_ptr = &x[blockIDX * sx0]"
,
'sm_ptr'
,
'sm_s1'
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
'threadIdx.x'
,
'blockDim.x'
),
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
"__syncthreads()"
,
'sm_ptr'
,
'sm_s1'
,
"}"
,
'threadIdx.x'
,
])
'blockDim.x'
),
"__syncthreads()"
,
"}"
,
])
return
ret1
+
"
\n
"
+
ret2
return
ret1
+
"
\n
"
+
ret2
gpu_softmax
=
GpuSoftmax
()
gpu_softmax
=
GpuSoftmax
()
...
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
...
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * b'
,
'const int sb0'
,
'const float * b'
,
'const int sb0'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
body
=
[
"extern __shared__ float buf[]"
,
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] += b[tx * sb0]"
,
"buf[tx] += b[tx * sb0]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"buf2[tx] = buf[tx]"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
"}"
,
'threadIdx.x'
,
"__syncthreads()"
,
'blockDim.x'
),
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
'threadIdx.x'
,
'blockDim.x'
),
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"__syncthreads()"
,
"}"
,
])
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
ret2
=
nvcc_kernel
(
"kSoftmaxWithBias_fixed_shared
%
s"
%
nodename
,
ret2
=
nvcc_kernel
(
"kSoftmaxWithBias_fixed_shared
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const float * x'
,
...
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
...
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
'x_ptr'
,
'sx1'
,
'sm_ptr'
,
'sm_s1'
,
'sm_ptr'
,
'sm_s1'
,
'threadIdx.x'
,
'threadIdx.x'
,
'blockDim.x'
,
'blockDim.x'
,
'b'
,
'sb0'
),
'b'
,
'sb0'
),
...
...
theano/sandbox/cuda/nvcc_compiler.py
浏览文件 @
b69ad54d
...
@@ -4,7 +4,6 @@ import logging
...
@@ -4,7 +4,6 @@ import logging
import
os
import
os
import
subprocess
import
subprocess
import
sys
import
sys
import
warnings
from
locale
import
getpreferredencoding
from
locale
import
getpreferredencoding
import
numpy
import
numpy
...
@@ -249,8 +248,9 @@ class NVCC_compiler(Compiler):
...
@@ -249,8 +248,9 @@ class NVCC_compiler(Compiler):
_logger
.
debug
(
'Writing module C++ code to
%
s'
,
cppfilename
)
_logger
.
debug
(
'Writing module C++ code to
%
s'
,
cppfilename
)
cppfile
.
write
(
src_code
)
cppfile
.
write
(
src_code
)
lib_filename
=
os
.
path
.
join
(
location
,
'
%
s.
%
s'
%
lib_filename
=
os
.
path
.
join
(
(
module_name
,
get_lib_extension
()))
location
,
'
%
s.
%
s'
%
(
module_name
,
get_lib_extension
()))
_logger
.
debug
(
'Generating shared lib
%
s'
,
lib_filename
)
_logger
.
debug
(
'Generating shared lib
%
s'
,
lib_filename
)
# TODO: Why do these args cause failure on gtx285 that has 1.3
# TODO: Why do these args cause failure on gtx285 that has 1.3
...
@@ -268,7 +268,7 @@ class NVCC_compiler(Compiler):
...
@@ -268,7 +268,7 @@ class NVCC_compiler(Compiler):
continue
continue
for
pattern
in
[
'-O'
,
'-arch='
,
'-ccbin='
,
'-G'
,
'-g'
,
'-I'
,
for
pattern
in
[
'-O'
,
'-arch='
,
'-ccbin='
,
'-G'
,
'-g'
,
'-I'
,
'-L'
,
'--fmad'
,
'--ftz'
,
'--maxrregcount'
,
'-L'
,
'--fmad'
,
'--ftz'
,
'--maxrregcount'
,
'--prec-div'
,
'--prec-sqrt'
,
'--use_fast_math'
,
'--prec-div'
,
'--prec-sqrt'
,
'--use_fast_math'
,
'-fmad'
,
'-ftz'
,
'-maxrregcount'
,
'-fmad'
,
'-ftz'
,
'-maxrregcount'
,
'-prec-div'
,
'-prec-sqrt'
,
'-use_fast_math'
,
'-prec-div'
,
'-prec-sqrt'
,
'-use_fast_math'
,
'--use-local-env'
,
'--cl-version='
]:
'--use-local-env'
,
'--cl-version='
]:
...
@@ -311,7 +311,7 @@ class NVCC_compiler(Compiler):
...
@@ -311,7 +311,7 @@ class NVCC_compiler(Compiler):
# https://wiki.debian.org/RpathIssue for details.
# https://wiki.debian.org/RpathIssue for details.
if
(
not
type
(
config
.
cuda
)
.
root
.
is_default
and
if
(
not
type
(
config
.
cuda
)
.
root
.
is_default
and
os
.
path
.
exists
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))):
os
.
path
.
exists
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))):
rpaths
.
append
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))
rpaths
.
append
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))
if
sys
.
platform
!=
'darwin'
:
if
sys
.
platform
!=
'darwin'
:
...
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
...
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
indexof
=
cmd
.
index
(
'-u'
)
indexof
=
cmd
.
index
(
'-u'
)
cmd
.
pop
(
indexof
)
# Remove -u
cmd
.
pop
(
indexof
)
# Remove -u
cmd
.
pop
(
indexof
)
# Remove argument to -u
cmd
.
pop
(
indexof
)
# Remove argument to -u
except
ValueError
as
e
:
except
ValueError
:
done
=
True
done
=
True
# CUDA Toolkit v4.1 Known Issues:
# CUDA Toolkit v4.1 Known Issues:
...
@@ -359,11 +359,13 @@ class NVCC_compiler(Compiler):
...
@@ -359,11 +359,13 @@ class NVCC_compiler(Compiler):
try
:
try
:
os
.
chdir
(
location
)
os
.
chdir
(
location
)
p
=
subprocess
.
Popen
(
p
=
subprocess
.
Popen
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
nvcc_stdout_raw
,
nvcc_stderr_raw
=
p
.
communicate
()[:
2
]
nvcc_stdout_raw
,
nvcc_stderr_raw
=
p
.
communicate
()[:
2
]
console_encoding
=
getpreferredencoding
()
console_encoding
=
getpreferredencoding
()
nvcc_stdout
=
decode_with
(
nvcc_stdout_raw
,
console_encoding
)
nvcc_stdout
=
decode_with
(
nvcc_stdout_raw
,
console_encoding
)
nvcc_stderr
=
decode_with
(
nvcc_stderr_raw
,
console_encoding
)
nvcc_stderr
=
decode_with
(
nvcc_stderr_raw
,
console_encoding
)
p
=
subprocess
.
Popen
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
finally
:
finally
:
os
.
chdir
(
orig_dir
)
os
.
chdir
(
orig_dir
)
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/rng_curand.py
浏览文件 @
b69ad54d
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
__authors__
=
"James Bergstra"
__copyright__
=
"(c) 2011, University of Montreal"
__license__
=
"3-clause BSD License"
__contact__
=
"theano-dev@googlegroups.com"
import
numpy
import
numpy
import
theano.gof
import
theano.gof
from
theano.compat
import
PY3
from
theano.compat
import
PY3
...
@@ -17,12 +7,21 @@ from theano.tensor import (get_vector_length, cast, opt)
...
@@ -17,12 +7,21 @@ from theano.tensor import (get_vector_length, cast, opt)
from
theano.compile
import
optdb
from
theano.compile
import
optdb
from
theano.gof
import
local_optimizer
,
Variable
from
theano.gof
import
local_optimizer
,
Variable
__authors__
=
"James Bergstra"
__copyright__
=
"(c) 2011, University of Montreal"
__license__
=
"3-clause BSD License"
__contact__
=
"theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
config
=
theano
.
config
config
=
theano
.
config
class
CURAND_Base
(
GpuOp
):
class
CURAND_Base
(
GpuOp
):
"""
"""
Base class for a random number generator implemented in CURAND.
Base class for a random number generator implemented in CURAND.
The random number generator itself is an opaque reference managed by
The random number generator itself is an opaque reference managed by
...
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
...
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
Return a tuple of attributes that define the Op.
Return a tuple of attributes that define the Op.
"""
"""
return
(
return
(
self
.
destructive
,
self
.
destructive
,
self
.
output_type
,
self
.
output_type
,
self
.
seed
,
self
.
seed
,
)
)
...
@@ -88,7 +86,7 @@ class CURAND_Base(GpuOp):
...
@@ -88,7 +86,7 @@ class CURAND_Base(GpuOp):
def
make_node
(
self
,
generator
,
size
):
def
make_node
(
self
,
generator
,
size
):
return
theano
.
gof
.
Apply
(
self
,
[
generator
,
size
],
return
theano
.
gof
.
Apply
(
self
,
[
generator
,
size
],
[
generator
.
type
(),
self
.
output_type
()])
[
generator
.
type
(),
self
.
output_type
()])
@classmethod
@classmethod
def
new_auto_update
(
cls
,
generator
,
ndim
,
dtype
,
size
,
seed
):
def
new_auto_update
(
cls
,
generator
,
ndim
,
dtype
,
size
,
seed
):
...
@@ -101,10 +99,9 @@ class CURAND_Base(GpuOp):
...
@@ -101,10 +99,9 @@ class CURAND_Base(GpuOp):
v_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
v_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
if
ndim
is
None
:
if
ndim
is
None
:
ndim
=
get_vector_length
(
v_size
)
ndim
=
get_vector_length
(
v_size
)
self
=
cls
(
self
=
cls
(
output_type
=
CudaNdarrayType
((
False
,)
*
ndim
),
output_type
=
CudaNdarrayType
((
False
,)
*
ndim
),
seed
=
seed
,
seed
=
seed
,
destructive
=
False
)
destructive
=
False
)
o_gen
,
sample
=
self
(
generator
,
cast
(
v_size
,
'int32'
))
o_gen
,
sample
=
self
(
generator
,
cast
(
v_size
,
'int32'
))
...
@@ -282,7 +279,7 @@ class CURAND_RandomStreams(object):
...
@@ -282,7 +279,7 @@ class CURAND_RandomStreams(object):
RandomStreams instance that creates CURAND-based random variables.
RandomStreams instance that creates CURAND-based random variables.
One caveat is that generators are not serializable.
One caveat is that generators are not serializable.
Parameters
Parameters
----------
----------
seed : int
seed : int
...
@@ -319,7 +316,7 @@ class CURAND_RandomStreams(object):
...
@@ -319,7 +316,7 @@ class CURAND_RandomStreams(object):
return
rval
return
rval
def
uniform
(
self
,
size
,
low
=
0.0
,
high
=
1.0
,
ndim
=
None
,
def
uniform
(
self
,
size
,
low
=
0.0
,
high
=
1.0
,
ndim
=
None
,
dtype
=
config
.
floatX
):
dtype
=
config
.
floatX
):
"""
"""
Return symbolic tensor of uniform numbers.
Return symbolic tensor of uniform numbers.
...
@@ -327,14 +324,14 @@ class CURAND_RandomStreams(object):
...
@@ -327,14 +324,14 @@ class CURAND_RandomStreams(object):
if
isinstance
(
size
,
tuple
):
if
isinstance
(
size
,
tuple
):
msg
=
"size must be a tuple of int or a Theano variable"
msg
=
"size must be a tuple of int or a Theano variable"
assert
all
([
isinstance
(
i
,
int
)
or
isinstance
(
i
,
Variable
)
assert
all
([
isinstance
(
i
,
int
)
or
isinstance
(
i
,
Variable
)
for
i
in
size
]),
msg
for
i
in
size
]),
msg
else
:
else
:
msg
=
"size must be a tuple of int or a Theano variable"
msg
=
"size must be a tuple of int or a Theano variable"
assert
isinstance
(
size
,
Variable
)
and
size
.
ndim
==
1
,
msg
assert
isinstance
(
size
,
Variable
)
and
size
.
ndim
==
1
,
msg
generator
=
theano
.
shared
(
False
)
# makes a generic
generator
=
theano
.
shared
(
False
)
# makes a generic
s_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
s_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
u
=
CURAND_Uniform
.
new_auto_update
(
generator
,
ndim
,
dtype
,
s_size
,
u
=
CURAND_Uniform
.
new_auto_update
(
generator
,
ndim
,
dtype
,
s_size
,
self
.
next_seed
())
self
.
next_seed
())
self
.
state_updates
.
append
(
u
.
update
)
self
.
state_updates
.
append
(
u
.
update
)
rval
=
u
*
(
high
-
low
)
+
low
rval
=
u
*
(
high
-
low
)
+
low
if
u
.
type
.
broadcastable
!=
rval
.
type
.
broadcastable
:
if
u
.
type
.
broadcastable
!=
rval
.
type
.
broadcastable
:
...
@@ -342,10 +339,10 @@ class CURAND_RandomStreams(object):
...
@@ -342,10 +339,10 @@ class CURAND_RandomStreams(object):
'Increase the size to match the broadcasting pattern of '
'Increase the size to match the broadcasting pattern of '
'low and `high` arguments'
'low and `high` arguments'
)
)
return
rval
return
rval
def
normal
(
self
,
size
=
None
,
avg
=
0.0
,
std
=
1.0
,
ndim
=
None
,
def
normal
(
self
,
size
=
None
,
avg
=
0.0
,
std
=
1.0
,
ndim
=
None
,
dtype
=
config
.
floatX
):
dtype
=
config
.
floatX
):
"""
"""
Return symbolic tensor of normally-distributed numbers.
Return symbolic tensor of normally-distributed numbers.
...
@@ -359,14 +356,14 @@ class CURAND_RandomStreams(object):
...
@@ -359,14 +356,14 @@ class CURAND_RandomStreams(object):
if
isinstance
(
size
,
tuple
):
if
isinstance
(
size
,
tuple
):
msg
=
"size must be a tuple of int or a Theano variable"
msg
=
"size must be a tuple of int or a Theano variable"
assert
all
([
isinstance
(
i
,
int
)
or
isinstance
(
i
,
Variable
)
assert
all
([
isinstance
(
i
,
int
)
or
isinstance
(
i
,
Variable
)
for
i
in
size
]),
msg
for
i
in
size
]),
msg
else
:
else
:
msg
=
"size must be a tuple of int or a Theano variable"
msg
=
"size must be a tuple of int or a Theano variable"
assert
isinstance
(
size
,
Variable
)
and
size
.
ndim
==
1
,
msg
assert
isinstance
(
size
,
Variable
)
and
size
.
ndim
==
1
,
msg
generator
=
theano
.
shared
(
False
)
# makes a generic
generator
=
theano
.
shared
(
False
)
# makes a generic
s_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
s_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
u
=
CURAND_Normal
.
new_auto_update
(
generator
,
ndim
,
dtype
,
s_size
,
u
=
CURAND_Normal
.
new_auto_update
(
generator
,
ndim
,
dtype
,
s_size
,
self
.
next_seed
())
self
.
next_seed
())
self
.
state_updates
.
append
(
u
.
update
)
self
.
state_updates
.
append
(
u
.
update
)
rval
=
u
*
std
+
avg
rval
=
u
*
std
+
avg
if
u
.
type
.
broadcastable
!=
rval
.
type
.
broadcastable
:
if
u
.
type
.
broadcastable
!=
rval
.
type
.
broadcastable
:
...
@@ -374,7 +371,7 @@ class CURAND_RandomStreams(object):
...
@@ -374,7 +371,7 @@ class CURAND_RandomStreams(object):
'Increase the size to match the broadcasting pattern of `low`'
'Increase the size to match the broadcasting pattern of `low`'
'and `high` arguments'
'and `high` arguments'
)
)
return
rval
return
rval
@local_optimizer
([
CURAND_Base
])
@local_optimizer
([
CURAND_Base
])
...
@@ -386,5 +383,5 @@ def local_destructive(node):
...
@@ -386,5 +383,5 @@ def local_destructive(node):
return
new_op
.
make_node
(
*
node
.
inputs
)
.
outputs
return
new_op
.
make_node
(
*
node
.
inputs
)
.
outputs
return
False
return
False
optdb
.
register
(
'CURAND_destructive'
,
optdb
.
register
(
'CURAND_destructive'
,
opt
.
in2out
(
local_destructive
,
ignore_newtrees
=
True
),
99
,
'fast_run'
,
opt
.
in2out
(
local_destructive
,
ignore_newtrees
=
True
)
,
'inplace'
)
99
,
'fast_run'
,
'inplace'
)
theano/sandbox/cuda/tests/test_basic_ops.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_bench_loopfusion.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_blas.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_cuda_ndarray.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_driver.py
浏览文件 @
b69ad54d
...
@@ -6,7 +6,7 @@ import theano
...
@@ -6,7 +6,7 @@ import theano
try
:
try
:
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
except
ImportError
:
except
ImportError
:
# To have the GPU back-end work without nose, we need this file to
# To have the GPU back-end work without nose, we need this file to
...
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
...
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
2
assert
len
(
topo
)
==
2
if
sum
(
isinstance
(
node
.
op
,
B
.
GpuCAReduce
)
for
node
in
topo
)
!=
1
:
if
sum
(
isinstance
(
node
.
op
,
B
.
GpuCAReduce
)
for
node
in
topo
)
!=
1
:
msg
=
'
\n\t
'
.
join
([
'Expected exactly one occurrence of GpuCAReduce '
+
msg
=
'
\n\t
'
.
join
(
'but got:'
]
+
[
str
(
app
)
for
app
in
topo
])
[
'Expected exactly one occurrence of GpuCAReduce '
+
'but got:'
]
+
[
str
(
app
)
for
app
in
topo
])
raise
AssertionError
(
msg
)
raise
AssertionError
(
msg
)
if
not
numpy
.
allclose
(
f
(),
a
.
sum
()):
if
not
numpy
.
allclose
(
f
(),
a
.
sum
()):
raise
Exception
(
"The nvidia driver version installed with this OS "
raise
Exception
(
"The nvidia driver version installed with this OS "
...
...
theano/sandbox/cuda/tests/test_extra_ops.py
浏览文件 @
b69ad54d
...
@@ -5,24 +5,22 @@ import itertools
...
@@ -5,24 +5,22 @@ import itertools
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
numpy
as
np
import
numpy
as
np
from
six.moves
import
xrange
from
six.moves
import
xrange
from
theano
import
tensor
as
T
import
theano
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests
import
unittest_tools
as
utt
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
is
False
:
if
cuda_ndarray
.
cuda_available
:
import
theano.tensor.tests.test_extra_ops
from
theano.sandbox.cuda.extra_ops
import
GpuCumsum
else
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor.tests.test_extra_ops
from
theano.sandbox.cuda.extra_ops
import
GpuCumsum
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
else
:
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
from
theano
import
tensor
as
T
import
theano
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests
import
unittest_tools
as
utt
class
TestGpuCumsum
(
theano
.
tensor
.
tests
.
test_extra_ops
.
TestCumsumOp
):
class
TestGpuCumsum
(
theano
.
tensor
.
tests
.
test_extra_ops
.
TestCumsumOp
):
mode
=
mode_with_gpu
mode
=
mode_with_gpu
...
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt
.
assert_allclose
(
np
.
cumsum
(
a
[:
i
]),
f
(
a
[:
i
]))
utt
.
assert_allclose
(
np
.
cumsum
(
a
[:
i
]),
f
(
a
[:
i
]))
# Use multiple GPU threadblocks
# Use multiple GPU threadblocks
a
=
np
.
random
.
random
((
block_max_size
+
2
,))
.
astype
(
"float32"
)
a
=
np
.
random
.
random
((
block_max_size
+
2
,))
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
# Use recursive cumsum
# Use recursive cumsum
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
dtype
=
"float32"
)
dtype
=
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
...
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks
# Use multiple GPU threadblocks
a_shape
=
[
5
,
5
]
a_shape
=
[
5
,
5
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks
# Use multiple GPU gridblocks
a_shape
=
[
4
,
4
]
a_shape
=
[
4
,
4
]
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
),
rtol
=
5e-5
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
),
rtol
=
5e-5
)
# Use recursive cumsum
# Use recursive cumsum
a_shape
=
[
3
,
3
]
a_shape
=
[
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum3D
(
self
):
def
test_GpuCumsum3D
(
self
):
...
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis)
# Use multiple GPU threadblocks (along accumulation axis)
a_shape
=
[
2
,
2
,
2
]
a_shape
=
[
2
,
2
,
2
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks (not along accumulation axis)
# Use multiple GPU gridblocks (not along accumulation axis)
a_shape
=
[
5
,
5
,
5
]
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
if
axis
is
None
:
# Avoid floating point error
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
a_shape
=
[
5
,
5
,
5
]
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
if
axis
is
None
:
# Avoid floating point error
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use recursive cumsum (along accumulation axis)
# Use recursive cumsum (along accumulation axis)
a_shape
=
[
3
,
3
,
3
]
a_shape
=
[
3
,
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum4D
(
self
):
def
test_GpuCumsum4D
(
self
):
...
...
theano/sandbox/cuda/tests/test_gemmcorr3d.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
unittest
import
unittest
import
numpy
import
numpy
import
copy
import
theano
import
theano
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
# Skip tests if cuda_ndarray is not available.
# Skip tests if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda not available'
)
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
from
theano.sandbox.cuda.blas
import
(
from
theano.sandbox.cuda.blas
import
(
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
)
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
)
from
theano.sandbox.cuda.basic_ops
import
gpu_contiguous
from
theano.sandbox.cuda.basic_ops
import
gpu_contiguous
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda not available'
)
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
...
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
inputs
=
shared
(
inputs_val
)
inputs
=
shared
(
inputs_val
)
filters
=
shared
(
filters_val
)
filters
=
shared
(
filters_val
)
bias
=
shared
(
numpy
.
zeros
(
filters_shape
[
4
])
.
astype
(
'float32'
))
bias
=
shared
(
numpy
.
zeros
(
filters_shape
[
4
])
.
astype
(
'float32'
))
conv
=
theano
.
tensor
.
nnet
.
convTransp3D
(
W
=
filters
,
b
=
bias
,
d
=
subsample
,
conv
=
theano
.
tensor
.
nnet
.
convTransp3D
(
W
=
filters
,
b
=
bias
,
d
=
subsample
,
H
=
inputs
)
H
=
inputs
)
f_ref
=
theano
.
function
([],
conv
)
f_ref
=
theano
.
function
([],
conv
)
res_ref
=
f_ref
()
res_ref
=
f_ref
()
...
...
theano/sandbox/cuda/tests/test_gradient.py
浏览文件 @
b69ad54d
...
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
...
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
theano/sandbox/cuda/tests/test_memory.py
浏览文件 @
b69ad54d
...
@@ -11,7 +11,7 @@ from theano import ifelse
...
@@ -11,7 +11,7 @@ from theano import ifelse
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
...
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
theano_alloc
=
cuda
.
cuda_ndarray
.
cuda_ndarray
.
theano_allocated
()
theano_alloc
=
cuda
.
cuda_ndarray
.
cuda_ndarray
.
theano_allocated
()
return
(
"(n malloc/theano mem allocated in KB)"
,
return
(
"(n malloc/theano mem allocated in KB)"
,
n_mallocs
+
extra_alloc
,
n_mallocs
+
extra_alloc
,
int
(
theano_alloc
/
1024
)
+
extra_size
)
int
(
theano_alloc
/
1024
))
return
(
"n malloc on the gpu"
,
n_mallocs
+
extra_alloc
)
return
(
"n malloc on the gpu"
,
n_mallocs
+
extra_alloc
)
# I don't use the following by default as if there is other stuff running
# I don't use the following by default as if there is other stuff running
...
@@ -83,9 +83,12 @@ def test_memory():
...
@@ -83,9 +83,12 @@ def test_memory():
variables
=
cuda
.
shared_constructor
(
np
.
ones
((
shapes
[
1
],),
variables
=
cuda
.
shared_constructor
(
np
.
ones
((
shapes
[
1
],),
dtype
=
'float32'
))
dtype
=
'float32'
))
derp
=
tensor
.
sum
(
tensor
.
dot
(
some_matrix
[:
shapes
[
0
]],
variables
))
derp
=
tensor
.
sum
(
tensor
.
dot
(
some_matrix
[:
shapes
[
0
]],
variables
))
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
print
(
"Shared took "
,
borrow
=
True
,
np
.
prod
(
variables
.
get_value
(
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
mem2
=
freemem
()
mem2
=
freemem
()
print
(
"Before compilation"
,
mem2
)
print
(
"Before compilation"
,
mem2
)
...
@@ -112,7 +115,7 @@ def test_memory():
...
@@ -112,7 +115,7 @@ def test_memory():
del
obj
del
obj
# print "After deleting function 1", freemem()
# print "After deleting function 1", freemem()
#assert mem2 == freemem(), (mem2, freemem())
#
assert mem2 == freemem(), (mem2, freemem())
del
grad
del
grad
print
(
"After deleting function 2"
,
freemem
())
print
(
"After deleting function 2"
,
freemem
())
...
@@ -155,16 +158,19 @@ def test_memory_lazy():
...
@@ -155,16 +158,19 @@ def test_memory_lazy():
derp
=
ifelse
.
IfElse
(
1
)(
branch_select
,
derp
=
ifelse
.
IfElse
(
1
)(
branch_select
,
derp
,
some_matrix
[:
shapes
[
0
]]
.
sum
())
derp
,
some_matrix
[:
shapes
[
0
]]
.
sum
())
derp
+=
1
derp
+=
1
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
print
(
"Shared took "
,
borrow
=
True
,
np
.
prod
(
variables
.
get_value
(
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
mem2
=
freemem
()
mem2
=
freemem
()
print
(
"Before compilation"
,
mem2
)
print
(
"Before compilation"
,
mem2
)
mem2_1
=
freemem
(
extra_alloc
=
more_alloc1
)
mem2_1
=
freemem
(
extra_alloc
=
more_alloc1
)
obj
=
theano
.
function
([
some_vector
,
branch_select
],
derp
,
obj
=
theano
.
function
([
some_vector
,
branch_select
],
derp
,
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
#theano.printing.debugprint(obj, print_type=True)
#
theano.printing.debugprint(obj, print_type=True)
mem3
=
freemem
()
mem3
=
freemem
()
print
(
"After function compilation 1"
,
mem3
)
print
(
"After function compilation 1"
,
mem3
)
assert
mem2_1
==
mem3
,
(
mem2_1
,
mem3
)
assert
mem2_1
==
mem3
,
(
mem2_1
,
mem3
)
...
...
theano/sandbox/cuda/tests/test_mlp.py
浏览文件 @
b69ad54d
...
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
...
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
'otherwise it is too slow!'
)
'otherwise it is too slow!'
)
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
if
tcn
.
cuda_available
==
False
:
if
tcn
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
@@ -68,7 +68,7 @@ def print_mode(mode):
...
@@ -68,7 +68,7 @@ def print_mode(mode):
def
print_diff_mode
(
a
,
b
):
def
print_diff_mode
(
a
,
b
):
if
(
a
is
not
None
and
if
(
a
is
not
None
and
isinstance
(
a
,
(
theano
.
compile
.
ProfileMode
,))
and
isinstance
(
a
,
(
theano
.
compile
.
ProfileMode
,))
and
isinstance
(
b
,
(
theano
.
compile
.
ProfileMode
,))):
isinstance
(
b
,
(
theano
.
compile
.
ProfileMode
,))):
a
.
print_diff_summary
(
b
)
a
.
print_diff_summary
(
b
)
...
@@ -138,8 +138,8 @@ def test_run_nnet():
...
@@ -138,8 +138,8 @@ def test_run_nnet():
# print "cpu:", rval_cpu
# print "cpu:", rval_cpu
# print "gpu:", rval_gpu
# print "gpu:", rval_gpu
abs_diff
,
rel_diff
=
\
abs_diff
,
rel_diff
=
\
theano
.
gradient
.
numeric_grad
.
abs_rel_err
(
rval_gpu
,
theano
.
gradient
.
numeric_grad
.
abs_rel_err
(
rval_gpu
,
rval_cpu
)
rval_cpu
)
max_abs_diff
=
abs_diff
.
max
()
max_abs_diff
=
abs_diff
.
max
()
# print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d" % (
# print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d" % (
# max_abs_diff, rel_diff.max(), n_in, n_hid)
# max_abs_diff, rel_diff.max(), n_in, n_hid)
...
@@ -147,19 +147,20 @@ def test_run_nnet():
...
@@ -147,19 +147,20 @@ def test_run_nnet():
rtol
=
1e-4
rtol
=
1e-4
if
n_in
*
n_hid
>=
2048
*
4096
:
if
n_in
*
n_hid
>=
2048
*
4096
:
rtol
=
7e-4
rtol
=
7e-4
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
rtol
,
atol
=
1e-6
),
\
assert
numpy
.
allclose
(
(
"max_abs_diff, max_rel_diff, n_in, n_hid"
,
max_abs_diff
,
rval_cpu
,
rval_gpu
,
rtol
=
rtol
,
atol
=
1e-6
),
\
rel_diff
.
max
(),
n_in
,
n_hid
)
(
"max_abs_diff, max_rel_diff, n_in, n_hid"
,
max_abs_diff
,
rel_diff
.
max
(),
n_in
,
n_hid
)
def
test_run_nnet_med
():
def
test_run_nnet_med
():
utt
.
seed_rng
()
utt
.
seed_rng
()
r
val_cpu
=
r
un_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
run_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
def
test_run_nnet_small
():
def
test_run_nnet_small
():
utt
.
seed_rng
()
utt
.
seed_rng
()
r
val_cpu
=
r
un_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
run_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
def
run_conv_nnet1
(
use_gpu
):
def
run_conv_nnet1
(
use_gpu
):
...
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
...
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
mode
=
get_mode
(
use_gpu
)
mode
=
get_mode
(
use_gpu
)
# print 'building pfunc ...'
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
train
=
pfunc
(
g
in
zip
(
params
,
gparams
)])
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
# print i, n
...
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
...
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op1
=
conv
.
ConvOp
((
n_kern
,
logical_hid_shape
[
0
]
//
2
,
conv_op1
=
conv
.
ConvOp
((
n_kern
,
logical_hid_shape
[
0
]
//
2
,
logical_hid_shape
[
1
]
//
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
logical_hid_shape
[
1
]
//
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
[:,
:,
::
2
,
::
2
],
w1
)
+
b1
.
dimshuffle
((
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
[:,
:,
::
2
,
::
2
],
w1
)
+
b1
.
dimshuffle
((
...
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
...
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
mode
=
get_mode
(
use_gpu
)
mode
=
get_mode
(
use_gpu
)
# print 'building pfunc ...'
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
train
=
pfunc
(
g
in
zip
(
params
,
gparams
)])
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
# print i, n
...
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
...
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
if
downsample_ops
:
if
downsample_ops
:
hid
=
tensor
.
tanh
(
ds_op
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
))))
hid
=
tensor
.
tanh
(
ds_op
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
))))
else
:
else
:
hid
=
tensor
.
tanh
((
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)
hid
=
tensor
.
tanh
(
))[:,
:,
::
2
,
::
2
])
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
(
(
0
,
'x'
,
'x'
)))[:,
:,
::
2
,
::
2
])
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
,
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
,
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
out
=
tensor
.
nnet
.
softmax
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
out
=
tensor
.
nnet
.
softmax
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
out
,
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
out
,
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
# print 'loss type', loss.type
# print 'loss type', loss.type
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
...
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
...
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
mode
=
get_mode
(
use_gpu
,
check_isfinite
)
mode
=
get_mode
(
use_gpu
,
check_isfinite
)
# print 'building pfunc ...'
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
train
=
pfunc
(
g
in
zip
(
params
,
gparams
)])
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
if
verbose
:
if
verbose
:
theano
.
printing
.
debugprint
(
train
)
theano
.
printing
.
debugprint
(
train
)
...
@@ -422,13 +435,13 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
...
@@ -422,13 +435,13 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
utt
.
seed_rng
(
seed
)
# Seeds numpy.random with seed
utt
.
seed_rng
(
seed
)
# Seeds numpy.random with seed
train
,
params
,
x_shape
,
y_shape
,
mode
=
build_conv_nnet2_classif
(
train
,
params
,
x_shape
,
y_shape
,
mode
=
build_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
use_gpu
=
use_gpu
,
isize
=
isize
,
isize
=
isize
,
ksize
=
ksize
,
ksize
=
ksize
,
n_batch
=
bsize
,
n_batch
=
bsize
,
verbose
=
verbose
,
verbose
=
verbose
,
version
=
version
,
version
=
version
,
check_isfinite
=
check_isfinite
)
check_isfinite
=
check_isfinite
)
if
use_gpu
:
if
use_gpu
:
device
=
'GPU'
device
=
'GPU'
...
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
...
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
rvals
=
my_zeros
(
n_train
)
rvals
=
my_zeros
(
n_train
)
t0
=
time
.
time
()
for
i
in
xrange
(
n_train
):
for
i
in
xrange
(
n_train
):
rvals
[
i
]
=
train
(
xval
,
yval
,
lr
)[
0
]
rvals
[
i
]
=
train
(
xval
,
yval
,
lr
)[
0
]
t1
=
time
.
time
()
print_mode
(
mode
)
print_mode
(
mode
)
if
pickle
and
isinstance
(
mode
,
theano
.
compile
.
ProfileMode
):
if
pickle
and
isinstance
(
mode
,
theano
.
compile
.
ProfileMode
):
...
@@ -495,35 +506,36 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
...
@@ -495,35 +506,36 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
compare
=
True
compare
=
True
if
not
compare
:
if
not
compare
:
return
run_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
return
run_conv_nnet2_classif
(
seed
=
seed
,
isize
=
isize
,
ksize
=
ksize
,
bsize
=
bsize
,
use_gpu
=
use_gpu
,
n_train
=
n_train
,
seed
=
seed
,
isize
=
isize
,
ksize
=
ksize
,
bsize
=
bsize
,
check_isfinite
=
check_isfinite
,
n_train
=
n_train
,
pickle
=
pickle
,
check_isfinite
=
check_isfinite
,
verbose
=
verbose
,
pickle
=
pickle
,
version
=
version
)
verbose
=
verbose
,
version
=
version
)
utt
.
seed_rng
(
seed
)
# Seeds numpy.random with seed
utt
.
seed_rng
(
seed
)
# Seeds numpy.random with seed
train_cpu
,
params_cpu
,
x_shape
,
y_shape
,
mode_cpu
=
\
train_cpu
,
params_cpu
,
x_shape
,
y_shape
,
mode_cpu
=
\
build_conv_nnet2_classif
(
build_conv_nnet2_classif
(
use_gpu
=
False
,
use_gpu
=
False
,
isize
=
isize
,
isize
=
isize
,
ksize
=
ksize
,
ksize
=
ksize
,
n_batch
=
bsize
,
n_batch
=
bsize
,
verbose
=
verbose
,
verbose
=
verbose
,
version
=
version
,
version
=
version
,
check_isfinite
=
check_isfinite
)
check_isfinite
=
check_isfinite
)
utt
.
seed_rng
(
seed
)
# Seeds numpy.random with seed
utt
.
seed_rng
(
seed
)
# Seeds numpy.random with seed
train_gpu
,
params_gpu
,
x_shape_gpu
,
y_shape_gpu
,
mode_gpu
=
\
train_gpu
,
params_gpu
,
x_shape_gpu
,
y_shape_gpu
,
mode_gpu
=
\
build_conv_nnet2_classif
(
build_conv_nnet2_classif
(
use_gpu
=
True
,
use_gpu
=
True
,
isize
=
isize
,
isize
=
isize
,
ksize
=
ksize
,
ksize
=
ksize
,
n_batch
=
bsize
,
n_batch
=
bsize
,
verbose
=
verbose
,
verbose
=
verbose
,
version
=
version
,
version
=
version
,
check_isfinite
=
check_isfinite
)
check_isfinite
=
check_isfinite
)
assert
x_shape
==
x_shape_gpu
assert
x_shape
==
x_shape_gpu
assert
y_shape
==
y_shape_gpu
assert
y_shape
==
y_shape_gpu
...
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
...
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
finally
:
finally
:
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
if
pickle
:
if
isinstance
(
cpu_mode
,
theano
.
compile
.
ProfileMode
):
import
pickle
print
(
"BEGIN CPU profile mode dump"
)
print
(
pickle
.
dumps
(
cpu_mode
))
print
(
"END CPU profile mode dump"
)
if
isinstance
(
gpu_mode
,
theano
.
compile
.
ProfileMode
):
import
pickle
print
(
"BEGIN GPU profile mode dump"
)
print
(
pickle
.
dumps
(
gpu_mode
))
print
(
"END GPU profile mode dump"
)
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# (time_cpu, time_gpu, time_cpu/time_gpu))
# (time_cpu, time_gpu, time_cpu/time_gpu))
# print "Estimated time for one pass through MNIST with CPU: %f" % (
# print "Estimated time for one pass through MNIST with CPU: %f" % (
...
...
theano/sandbox/cuda/tests/test_neighbours.py
浏览文件 @
b69ad54d
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
unittest
import
theano.tensor.nnet.tests.test_neighbours
from
theano.sandbox.cuda.neighbours
import
GpuImages2Neibs
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor.nnet.tests.test_neighbours
from
theano.sandbox.cuda.neighbours
import
GpuImages2Neibs
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
...
theano/sandbox/cuda/tests/test_opt.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_rng_curand.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_tensor_op.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_var.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_viewop.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/walltime.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/type.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/var.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/tests/test_flake8.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论