Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
b69ad54d
提交
b69ad54d
authored
5月 05, 2016
作者:
Xavier Bouthillier
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #4244 from ChihebTrabelsi/ccw2.0
flake8 sandbox/cuda/*.py
上级
200babca
58267dc2
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
33 个修改的文件
包含
295 行增加
和
266 行删除
+295
-266
GpuConvGrad3D.py
theano/sandbox/cuda/GpuConvGrad3D.py
+8
-9
GpuConvTransp3D.py
theano/sandbox/cuda/GpuConvTransp3D.py
+40
-17
basic_ops.py
theano/sandbox/cuda/basic_ops.py
+0
-0
blas.py
theano/sandbox/cuda/blas.py
+0
-0
elemwise.py
theano/sandbox/cuda/elemwise.py
+0
-0
fftconv.py
theano/sandbox/cuda/fftconv.py
+14
-8
kernel_codegen.py
theano/sandbox/cuda/kernel_codegen.py
+22
-26
neighbours.py
theano/sandbox/cuda/neighbours.py
+1
-1
nnet.py
theano/sandbox/cuda/nnet.py
+56
-59
nvcc_compiler.py
theano/sandbox/cuda/nvcc_compiler.py
+9
-7
opt.py
theano/sandbox/cuda/opt.py
+0
-0
rng_curand.py
theano/sandbox/cuda/rng_curand.py
+26
-29
test_basic_ops.py
theano/sandbox/cuda/tests/test_basic_ops.py
+0
-0
test_bench_loopfusion.py
theano/sandbox/cuda/tests/test_bench_loopfusion.py
+0
-0
test_blas.py
theano/sandbox/cuda/tests/test_blas.py
+0
-0
test_conv_cuda_ndarray.py
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+0
-0
test_cuda_ndarray.py
theano/sandbox/cuda/tests/test_cuda_ndarray.py
+0
-0
test_driver.py
theano/sandbox/cuda/tests/test_driver.py
+4
-3
test_extra_ops.py
theano/sandbox/cuda/tests/test_extra_ops.py
+24
-23
test_gemmcorr3d.py
theano/sandbox/cuda/tests/test_gemmcorr3d.py
+6
-5
test_gradient.py
theano/sandbox/cuda/tests/test_gradient.py
+1
-1
test_memory.py
theano/sandbox/cuda/tests/test_memory.py
+16
-10
test_mlp.py
theano/sandbox/cuda/tests/test_mlp.py
+64
-64
test_neighbours.py
theano/sandbox/cuda/tests/test_neighbours.py
+4
-4
test_opt.py
theano/sandbox/cuda/tests/test_opt.py
+0
-0
test_rng_curand.py
theano/sandbox/cuda/tests/test_rng_curand.py
+0
-0
test_tensor_op.py
theano/sandbox/cuda/tests/test_tensor_op.py
+0
-0
test_var.py
theano/sandbox/cuda/tests/test_var.py
+0
-0
test_viewop.py
theano/sandbox/cuda/tests/test_viewop.py
+0
-0
walltime.py
theano/sandbox/cuda/tests/walltime.py
+0
-0
type.py
theano/sandbox/cuda/type.py
+0
-0
var.py
theano/sandbox/cuda/var.py
+0
-0
test_flake8.py
theano/tests/test_flake8.py
+0
-0
没有找到文件。
theano/sandbox/cuda/GpuConvGrad3D.py
浏览文件 @
b69ad54d
...
...
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
d_
=
T
.
as_tensor_variable
(
d
)
WShape_
=
T
.
as_tensor_variable
(
WShape
)
dCdH_
=
as_cuda_ndarray_variable
(
dCdH
)
broad
=
(
False
,)
*
5
broad
=
(
False
,)
*
5
return
theano
.
Apply
(
self
,
inputs
=
[
V_
,
d_
,
WShape_
,
dCdH_
],
outputs
=
[
CudaNdarrayType
(
dtype
=
V_
.
dtype
,
broadcastable
=
broad
)()])
...
...
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize
=
dCdH
.
shape
[
0
]
outputFilters
=
dCdH
.
shape
[
1
]
outputHeight
=
dCdH
.
shape
[
2
]
outputWidth
=
dCdH
.
shape
[
3
]
outputDur
=
dCdH
.
shape
[
4
]
assert
V
.
shape
[
0
]
==
batchSize
inputFilters
=
V
.
shape
[
1
]
inputHeight
=
V
.
shape
[
2
]
inputWidth
=
V
.
shape
[
3
]
inputDur
=
V
.
shape
[
4
]
dr
,
dc
,
dt
=
d
dCdW
=
numpy
.
zeros
(
WShape
,
dtype
=
V
.
dtype
)
...
...
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
for
p
in
xrange
(
0
,
outputHeight
):
for
q
in
xrange
(
0
,
outputWidth
):
for
r
in
xrange
(
0
,
outputDur
):
dCdW
[
j
,
z
,
k
,
l
,
m
]
+=
dCdH
[
i
,
j
,
p
,
q
,
r
]
*
V
[
i
,
z
,
dr
*
p
+
k
,
dc
*
q
+
l
,
dt
*
r
+
m
]
dCdW
[
j
,
z
,
k
,
l
,
m
]
+=
dCdH
[
i
,
j
,
p
,
q
,
r
]
*
\
V
[
i
,
z
,
dr
*
p
+
k
,
dc
*
q
+
l
,
dt
*
r
+
m
]
output_storage
[
0
][
0
]
=
dCdW
...
...
@@ -86,7 +85,7 @@ class GpuConvGrad3D(GpuOp):
dCdW
=
outputs
[
0
]
codeSource
=
"""
codeSource
=
"""
///////////// < code generated by GpuConvGrad3D >
//printf("
\t\t\t\t
GpuConvGrad3DW c code
\\
n");
...
...
@@ -285,7 +284,7 @@ if(!work_complete){
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
codeSource
=
"""
codeSource
=
"""
__global__ void
//thread block size = WShape[4]
//grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])
...
...
theano/sandbox/cuda/GpuConvTransp3D.py
浏览文件 @
b69ad54d
...
...
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
else
:
RShape_
=
T
.
as_tensor_variable
([
-
1
,
-
1
,
-
1
])
return
theano
.
Apply
(
self
,
inputs
=
[
W_
,
b_
,
d_
,
H_
,
RShape_
],
outputs
=
[
CudaNdarrayType
(
dtype
=
H_
.
dtype
,
broadcastable
=
(
False
,)
*
5
)()])
return
theano
.
Apply
(
self
,
inputs
=
[
W_
,
b_
,
d_
,
H_
,
RShape_
],
outputs
=
[
CudaNdarrayType
(
dtype
=
H_
.
dtype
,
broadcastable
=
(
False
,)
*
5
)()])
def
infer_shape
(
self
,
node
,
input_shapes
):
W
,
b
,
d
,
H
,
RShape
=
node
.
inputs
...
...
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
assert
dc
>
0
assert
dt
>
0
videoHeight
=
(
outputHeight
-
1
)
*
dr
+
filterHeight
videoWidth
=
(
outputWidth
-
1
)
*
dc
+
filterWidth
videoDur
=
(
outputDur
-
1
)
*
dt
+
filterDur
videoHeight
=
(
outputHeight
-
1
)
*
dr
+
filterHeight
videoWidth
=
(
outputWidth
-
1
)
*
dc
+
filterWidth
videoDur
=
(
outputDur
-
1
)
*
dt
+
filterDur
if
Rshape
is
not
None
and
Rshape
[
0
]
!=
-
1
:
if
Rshape
[
0
]
<
videoHeight
:
...
...
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
# else:
# print "No Rshape passed in"
# print "video size: "
+
str((videoHeight, videoWidth, videoDur))
# print "video size: "
+
str((videoHeight, videoWidth, videoDur))
R
=
numpy
.
zeros
(
(
batchSize
,
inputChannels
,
videoHeight
,
videoWidth
,
videoDur
)
,
dtype
=
H
.
dtype
)
R
=
numpy
.
zeros
((
batchSize
,
inputChannels
,
videoHeight
,
videoWidth
,
videoDur
),
dtype
=
H
.
dtype
)
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
# sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for
i
in
xrange
(
0
,
batchSize
):
# print '\texample '+str(i+1)+'/'+str(batchSize)
for
j
in
xrange
(
0
,
inputChannels
):
# print '\t\tfeature map '
+str(j+1)+'/'+
str(inputChannels)
# print '\t\tfeature map '
+ str(j+1) + '/' +
str(inputChannels)
for
r
in
xrange
(
0
,
videoHeight
):
# print '\t\t\trow '
+str(r+1)+
'/'+str(videoHeight)
# print '\t\t\trow '
+ str(r+1) +
'/'+str(videoHeight)
for
c
in
xrange
(
0
,
videoWidth
):
for
t
in
xrange
(
0
,
videoDur
):
R
[
i
,
j
,
r
,
c
,
t
]
=
b
[
j
]
ftc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
t
-
filterDur
+
1
)
/
float
(
dt
)))
])
fcc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
c
-
filterWidth
+
1
)
/
float
(
dc
)))
])
rc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
r
-
filterHeight
+
1
)
/
float
(
dr
)))
])
ftc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
t
-
filterDur
+
1
)
/
float
(
dt
)
))
]
)
fcc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
c
-
filterWidth
+
1
)
/
float
(
dc
)
))
]
)
rc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
r
-
filterHeight
+
1
)
/
float
(
dr
)
))
]
)
while
rc
<
outputHeight
:
rk
=
r
-
rc
*
dr
if
rk
<
0
:
...
...
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
if
tk
<
0
:
break
R
[
i
,
j
,
r
,
c
,
t
]
+=
numpy
.
dot
(
W
[:,
j
,
rk
,
ck
,
tk
],
H
[
i
,
:,
rc
,
cc
,
tc
]
)
R
[
i
,
j
,
r
,
c
,
t
]
+=
numpy
.
dot
(
W
[:,
j
,
rk
,
ck
,
tk
],
H
[
i
,
:,
rc
,
cc
,
tc
])
tc
+=
1
""
# close loop over tc
...
...
theano/sandbox/cuda/basic_ops.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/blas.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/elemwise.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/fftconv.py
浏览文件 @
b69ad54d
...
...
@@ -5,9 +5,9 @@ import numpy as np
import
theano
import
theano.tensor
as
T
from
theano.misc.pycuda_init
import
pycuda_available
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.ifelse
import
ifelse
from
theano.misc.pycuda_init
import
pycuda_available
if
cuda_available
:
from
theano.sandbox.cuda
import
(
basic_ops
,
CudaNdarrayType
,
...
...
@@ -448,7 +448,7 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
o1
=
i1
+
1
input_padded
=
T
.
zeros
((
b
,
ic
,
o0
,
o1
),
dtype
=
'float32'
)
input_padded
=
T
.
set_subtensor
(
input_padded
[:,
:,
:
i0
,
:
i1
],
input
)
input
)
else
:
o1
=
i1
input_padded
=
input
...
...
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# how much values are expected.
if
border_mode
==
'valid'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
)]
elif
border_mode
==
'full'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
)]
else
:
raise
ValueError
(
'invalid mode'
)
...
...
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
output_fft_s
=
mult_and_reduce
(
input_fft_v
,
filters_fft_v
,
input_shape
=
input_fft_v_shape
,
filter_shape
=
filters_fft_v_shape
)
#output_fft_s = input_fft_v
#
output_fft_s = input_fft_v
# reshape for IFFT
output_fft_flat
=
output_fft_s
.
reshape
((
b
*
oc
,
o0
,
o1
,
o2
//
2
+
1
,
2
))
...
...
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# how much values are expected.
if
border_mode
==
'valid'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
),
(
f2
-
1
):(
f2
-
1
+
i2
-
f2
+
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
),
(
f2
-
1
):(
f2
-
1
+
i2
-
f2
+
1
)]
elif
border_mode
==
'full'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
),
(
f2
-
1
):(
f2
-
1
+
i2
+
f2
-
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
),
(
f2
-
1
):(
f2
-
1
+
i2
+
f2
-
1
)]
else
:
raise
ValueError
(
'invalid mode'
)
#output = output_circ[:, :, :, :, :]
#
output = output_circ[:, :, :, :, :]
# Rescale manually. This is just a factor that comes in during the
# trip through FFT and inverse FFT.
...
...
theano/sandbox/cuda/kernel_codegen.py
浏览文件 @
b69ad54d
...
...
@@ -76,7 +76,7 @@ def inline_reduce(N, buf, pos, count, manner_fn):
rest of the buffer is trashed by this function.
Notes
-----
-----
buf should be in gpu shared memory, we access it many times.
"""
...
...
@@ -167,29 +167,26 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
We use __i as an int variable in a loop.
"""
return
[
# get max of buf (trashing all but buf[0])
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'float row_max = '
+
buf
+
'[0]'
,
'__syncthreads()'
,
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
'}'
,
'__syncthreads()'
,
inline_reduce_sum
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'float row_sum = '
+
buf
+
'[0]'
,
'__syncthreads()'
,
# divide each exp() result by the sum to complete the job.
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
'}'
,
'__syncthreads()'
,
]
return
[
# get max of buf (trashing all but buf[0])
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'float row_max = '
+
buf
+
'[0]'
,
'__syncthreads()'
,
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
'}'
,
'__syncthreads()'
,
inline_reduce_sum
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'float row_sum = '
+
buf
+
'[0]'
,
'__syncthreads()'
,
# divide each exp() result by the sum to complete the job.
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
'}'
,
'__syncthreads()'
,
]
@code_version
((
1
,))
...
...
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
init
=
manner_init
(
"
%(x)
s[
%(pos)
s *
%(stride_x)
s]"
%
locals
())
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(x)
s[i *
%(stride_x)
s]"
%
locals
()))
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
...
...
theano/sandbox/cuda/neighbours.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
# This is work in progress
from
theano
import
Op
,
Apply
,
tensor
from
theano
import
Apply
,
tensor
from
theano.gof
import
local_optimizer
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
...
...
theano/sandbox/cuda/nnet.py
浏览文件 @
b69ad54d
...
...
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
"""
%
locals
()
def
c_support_code_apply
(
self
,
node
,
nodename
):
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
ret2
=
nvcc_kernel
(
"kSoftmax_fixed_shared
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"const float *x_ptr = &x[blockIDX * sx0]"
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
'sm_ptr'
,
'sm_s1'
,
'threadIdx.x'
,
'blockDim.x'
),
"__syncthreads()"
,
"}"
,
])
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
ret2
=
nvcc_kernel
(
"kSoftmax_fixed_shared
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"const float *x_ptr = &x[blockIDX * sx0]"
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
'sm_ptr'
,
'sm_s1'
,
'threadIdx.x'
,
'blockDim.x'
),
"__syncthreads()"
,
"}"
,
])
return
ret1
+
"
\n
"
+
ret2
gpu_softmax
=
GpuSoftmax
()
...
...
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * b'
,
'const int sb0'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] += b[tx * sb0]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
body
=
[
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] += b[tx * sb0]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
ret2
=
nvcc_kernel
(
"kSoftmaxWithBias_fixed_shared
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
...
...
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
'sm_ptr'
,
'sm_s1'
,
'sm_ptr'
,
'sm_s1'
,
'threadIdx.x'
,
'blockDim.x'
,
'b'
,
'sb0'
),
...
...
theano/sandbox/cuda/nvcc_compiler.py
浏览文件 @
b69ad54d
...
...
@@ -4,7 +4,6 @@ import logging
import
os
import
subprocess
import
sys
import
warnings
from
locale
import
getpreferredencoding
import
numpy
...
...
@@ -249,8 +248,9 @@ class NVCC_compiler(Compiler):
_logger
.
debug
(
'Writing module C++ code to
%
s'
,
cppfilename
)
cppfile
.
write
(
src_code
)
lib_filename
=
os
.
path
.
join
(
location
,
'
%
s.
%
s'
%
(
module_name
,
get_lib_extension
()))
lib_filename
=
os
.
path
.
join
(
location
,
'
%
s.
%
s'
%
(
module_name
,
get_lib_extension
()))
_logger
.
debug
(
'Generating shared lib
%
s'
,
lib_filename
)
# TODO: Why do these args cause failure on gtx285 that has 1.3
...
...
@@ -268,7 +268,7 @@ class NVCC_compiler(Compiler):
continue
for
pattern
in
[
'-O'
,
'-arch='
,
'-ccbin='
,
'-G'
,
'-g'
,
'-I'
,
'-L'
,
'--fmad'
,
'--ftz'
,
'--maxrregcount'
,
'--prec-div'
,
'--prec-sqrt'
,
'--use_fast_math'
,
'--prec-div'
,
'--prec-sqrt'
,
'--use_fast_math'
,
'-fmad'
,
'-ftz'
,
'-maxrregcount'
,
'-prec-div'
,
'-prec-sqrt'
,
'-use_fast_math'
,
'--use-local-env'
,
'--cl-version='
]:
...
...
@@ -311,7 +311,7 @@ class NVCC_compiler(Compiler):
# https://wiki.debian.org/RpathIssue for details.
if
(
not
type
(
config
.
cuda
)
.
root
.
is_default
and
os
.
path
.
exists
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))):
os
.
path
.
exists
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))):
rpaths
.
append
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))
if
sys
.
platform
!=
'darwin'
:
...
...
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
indexof
=
cmd
.
index
(
'-u'
)
cmd
.
pop
(
indexof
)
# Remove -u
cmd
.
pop
(
indexof
)
# Remove argument to -u
except
ValueError
as
e
:
except
ValueError
:
done
=
True
# CUDA Toolkit v4.1 Known Issues:
...
...
@@ -359,11 +359,13 @@ class NVCC_compiler(Compiler):
try
:
os
.
chdir
(
location
)
p
=
subprocess
.
Popen
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
nvcc_stdout_raw
,
nvcc_stderr_raw
=
p
.
communicate
()[:
2
]
console_encoding
=
getpreferredencoding
()
nvcc_stdout
=
decode_with
(
nvcc_stdout_raw
,
console_encoding
)
nvcc_stderr
=
decode_with
(
nvcc_stderr_raw
,
console_encoding
)
p
=
subprocess
.
Popen
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
finally
:
os
.
chdir
(
orig_dir
)
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/rng_curand.py
浏览文件 @
b69ad54d
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
from
__future__
import
absolute_import
,
print_function
,
division
__authors__
=
"James Bergstra"
__copyright__
=
"(c) 2011, University of Montreal"
__license__
=
"3-clause BSD License"
__contact__
=
"theano-dev@googlegroups.com"
import
numpy
import
theano.gof
from
theano.compat
import
PY3
...
...
@@ -17,12 +7,21 @@ from theano.tensor import (get_vector_length, cast, opt)
from
theano.compile
import
optdb
from
theano.gof
import
local_optimizer
,
Variable
__authors__
=
"James Bergstra"
__copyright__
=
"(c) 2011, University of Montreal"
__license__
=
"3-clause BSD License"
__contact__
=
"theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
config
=
theano
.
config
class
CURAND_Base
(
GpuOp
):
"""
"""
Base class for a random number generator implemented in CURAND.
The random number generator itself is an opaque reference managed by
...
...
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
Return a tuple of attributes that define the Op.
"""
return
(
self
.
destructive
,
return
(
self
.
destructive
,
self
.
output_type
,
self
.
seed
,
)
...
...
@@ -88,7 +86,7 @@ class CURAND_Base(GpuOp):
def
make_node
(
self
,
generator
,
size
):
return
theano
.
gof
.
Apply
(
self
,
[
generator
,
size
],
[
generator
.
type
(),
self
.
output_type
()])
[
generator
.
type
(),
self
.
output_type
()])
@classmethod
def
new_auto_update
(
cls
,
generator
,
ndim
,
dtype
,
size
,
seed
):
...
...
@@ -101,10 +99,9 @@ class CURAND_Base(GpuOp):
v_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
if
ndim
is
None
:
ndim
=
get_vector_length
(
v_size
)
self
=
cls
(
output_type
=
CudaNdarrayType
((
False
,)
*
ndim
),
seed
=
seed
,
destructive
=
False
)
self
=
cls
(
output_type
=
CudaNdarrayType
((
False
,)
*
ndim
),
seed
=
seed
,
destructive
=
False
)
o_gen
,
sample
=
self
(
generator
,
cast
(
v_size
,
'int32'
))
...
...
@@ -282,7 +279,7 @@ class CURAND_RandomStreams(object):
RandomStreams instance that creates CURAND-based random variables.
One caveat is that generators are not serializable.
Parameters
----------
seed : int
...
...
@@ -319,7 +316,7 @@ class CURAND_RandomStreams(object):
return
rval
def
uniform
(
self
,
size
,
low
=
0.0
,
high
=
1.0
,
ndim
=
None
,
dtype
=
config
.
floatX
):
dtype
=
config
.
floatX
):
"""
Return symbolic tensor of uniform numbers.
...
...
@@ -327,14 +324,14 @@ class CURAND_RandomStreams(object):
if
isinstance
(
size
,
tuple
):
msg
=
"size must be a tuple of int or a Theano variable"
assert
all
([
isinstance
(
i
,
int
)
or
isinstance
(
i
,
Variable
)
for
i
in
size
]),
msg
for
i
in
size
]),
msg
else
:
msg
=
"size must be a tuple of int or a Theano variable"
assert
isinstance
(
size
,
Variable
)
and
size
.
ndim
==
1
,
msg
generator
=
theano
.
shared
(
False
)
# makes a generic
s_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
u
=
CURAND_Uniform
.
new_auto_update
(
generator
,
ndim
,
dtype
,
s_size
,
self
.
next_seed
())
self
.
next_seed
())
self
.
state_updates
.
append
(
u
.
update
)
rval
=
u
*
(
high
-
low
)
+
low
if
u
.
type
.
broadcastable
!=
rval
.
type
.
broadcastable
:
...
...
@@ -342,10 +339,10 @@ class CURAND_RandomStreams(object):
'Increase the size to match the broadcasting pattern of '
'low and `high` arguments'
)
return
rval
return
rval
def
normal
(
self
,
size
=
None
,
avg
=
0.0
,
std
=
1.0
,
ndim
=
None
,
dtype
=
config
.
floatX
):
dtype
=
config
.
floatX
):
"""
Return symbolic tensor of normally-distributed numbers.
...
...
@@ -359,14 +356,14 @@ class CURAND_RandomStreams(object):
if
isinstance
(
size
,
tuple
):
msg
=
"size must be a tuple of int or a Theano variable"
assert
all
([
isinstance
(
i
,
int
)
or
isinstance
(
i
,
Variable
)
for
i
in
size
]),
msg
for
i
in
size
]),
msg
else
:
msg
=
"size must be a tuple of int or a Theano variable"
assert
isinstance
(
size
,
Variable
)
and
size
.
ndim
==
1
,
msg
generator
=
theano
.
shared
(
False
)
# makes a generic
s_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
u
=
CURAND_Normal
.
new_auto_update
(
generator
,
ndim
,
dtype
,
s_size
,
self
.
next_seed
())
self
.
next_seed
())
self
.
state_updates
.
append
(
u
.
update
)
rval
=
u
*
std
+
avg
if
u
.
type
.
broadcastable
!=
rval
.
type
.
broadcastable
:
...
...
@@ -374,7 +371,7 @@ class CURAND_RandomStreams(object):
'Increase the size to match the broadcasting pattern of `low`'
'and `high` arguments'
)
return
rval
return
rval
@local_optimizer
([
CURAND_Base
])
...
...
@@ -386,5 +383,5 @@ def local_destructive(node):
return
new_op
.
make_node
(
*
node
.
inputs
)
.
outputs
return
False
optdb
.
register
(
'CURAND_destructive'
,
opt
.
in2out
(
local_destructive
,
ignore_newtrees
=
True
),
99
,
'fast_run'
,
'inplace'
)
opt
.
in2out
(
local_destructive
,
ignore_newtrees
=
True
)
,
99
,
'fast_run'
,
'inplace'
)
theano/sandbox/cuda/tests/test_basic_ops.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_bench_loopfusion.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_blas.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_cuda_ndarray.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_driver.py
浏览文件 @
b69ad54d
...
...
@@ -6,7 +6,7 @@ import theano
try
:
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
except
ImportError
:
# To have the GPU back-end work without nose, we need this file to
...
...
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
2
if
sum
(
isinstance
(
node
.
op
,
B
.
GpuCAReduce
)
for
node
in
topo
)
!=
1
:
msg
=
'
\n\t
'
.
join
([
'Expected exactly one occurrence of GpuCAReduce '
+
'but got:'
]
+
[
str
(
app
)
for
app
in
topo
])
msg
=
'
\n\t
'
.
join
(
[
'Expected exactly one occurrence of GpuCAReduce '
+
'but got:'
]
+
[
str
(
app
)
for
app
in
topo
])
raise
AssertionError
(
msg
)
if
not
numpy
.
allclose
(
f
(),
a
.
sum
()):
raise
Exception
(
"The nvidia driver version installed with this OS "
...
...
theano/sandbox/cuda/tests/test_extra_ops.py
浏览文件 @
b69ad54d
...
...
@@ -5,24 +5,22 @@ import itertools
from
nose.plugins.skip
import
SkipTest
import
numpy
as
np
from
six.moves
import
xrange
from
theano
import
tensor
as
T
import
theano
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests
import
unittest_tools
as
utt
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
is
False
:
if
cuda_ndarray
.
cuda_available
:
import
theano.tensor.tests.test_extra_ops
from
theano.sandbox.cuda.extra_ops
import
GpuCumsum
else
:
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor.tests.test_extra_ops
from
theano.sandbox.cuda.extra_ops
import
GpuCumsum
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
from
theano
import
tensor
as
T
import
theano
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests
import
unittest_tools
as
utt
class
TestGpuCumsum
(
theano
.
tensor
.
tests
.
test_extra_ops
.
TestCumsumOp
):
mode
=
mode_with_gpu
...
...
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt
.
assert_allclose
(
np
.
cumsum
(
a
[:
i
]),
f
(
a
[:
i
]))
# Use multiple GPU threadblocks
a
=
np
.
random
.
random
((
block_max_size
+
2
,))
.
astype
(
"float32"
)
a
=
np
.
random
.
random
((
block_max_size
+
2
,))
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
# Use recursive cumsum
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
dtype
=
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
...
...
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks
a_shape
=
[
5
,
5
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks
a_shape
=
[
4
,
4
]
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
),
rtol
=
5e-5
)
# Use recursive cumsum
a_shape
=
[
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum3D
(
self
):
...
...
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis)
a_shape
=
[
2
,
2
,
2
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks (not along accumulation axis)
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use recursive cumsum (along accumulation axis)
a_shape
=
[
3
,
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum4D
(
self
):
...
...
theano/sandbox/cuda/tests/test_gemmcorr3d.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
import
unittest
import
numpy
import
copy
import
theano
from
theano.tests
import
unittest_tools
as
utt
# Skip tests if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda not available'
)
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
from
theano.sandbox.cuda.blas
import
(
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
)
from
theano.sandbox.cuda.basic_ops
import
gpu_contiguous
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda not available'
)
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
...
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
inputs
=
shared
(
inputs_val
)
filters
=
shared
(
filters_val
)
bias
=
shared
(
numpy
.
zeros
(
filters_shape
[
4
])
.
astype
(
'float32'
))
conv
=
theano
.
tensor
.
nnet
.
convTransp3D
(
W
=
filters
,
b
=
bias
,
d
=
subsample
,
conv
=
theano
.
tensor
.
nnet
.
convTransp3D
(
W
=
filters
,
b
=
bias
,
d
=
subsample
,
H
=
inputs
)
f_ref
=
theano
.
function
([],
conv
)
res_ref
=
f_ref
()
...
...
theano/sandbox/cuda/tests/test_gradient.py
浏览文件 @
b69ad54d
...
...
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
theano/sandbox/cuda/tests/test_memory.py
浏览文件 @
b69ad54d
...
...
@@ -11,7 +11,7 @@ from theano import ifelse
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
theano_alloc
=
cuda
.
cuda_ndarray
.
cuda_ndarray
.
theano_allocated
()
return
(
"(n malloc/theano mem allocated in KB)"
,
n_mallocs
+
extra_alloc
,
int
(
theano_alloc
/
1024
)
+
extra_size
)
int
(
theano_alloc
/
1024
))
return
(
"n malloc on the gpu"
,
n_mallocs
+
extra_alloc
)
# I don't use the following by default as if there is other stuff running
...
...
@@ -83,9 +83,12 @@ def test_memory():
variables
=
cuda
.
shared_constructor
(
np
.
ones
((
shapes
[
1
],),
dtype
=
'float32'
))
derp
=
tensor
.
sum
(
tensor
.
dot
(
some_matrix
[:
shapes
[
0
]],
variables
))
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
mem2
=
freemem
()
print
(
"Before compilation"
,
mem2
)
...
...
@@ -112,7 +115,7 @@ def test_memory():
del
obj
# print "After deleting function 1", freemem()
#assert mem2 == freemem(), (mem2, freemem())
#
assert mem2 == freemem(), (mem2, freemem())
del
grad
print
(
"After deleting function 2"
,
freemem
())
...
...
@@ -155,16 +158,19 @@ def test_memory_lazy():
derp
=
ifelse
.
IfElse
(
1
)(
branch_select
,
derp
,
some_matrix
[:
shapes
[
0
]]
.
sum
())
derp
+=
1
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
mem2
=
freemem
()
print
(
"Before compilation"
,
mem2
)
mem2_1
=
freemem
(
extra_alloc
=
more_alloc1
)
obj
=
theano
.
function
([
some_vector
,
branch_select
],
derp
,
mode
=
mode_with_gpu
)
#theano.printing.debugprint(obj, print_type=True)
#
theano.printing.debugprint(obj, print_type=True)
mem3
=
freemem
()
print
(
"After function compilation 1"
,
mem3
)
assert
mem2_1
==
mem3
,
(
mem2_1
,
mem3
)
...
...
theano/sandbox/cuda/tests/test_mlp.py
浏览文件 @
b69ad54d
...
...
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
'otherwise it is too slow!'
)
# Skip test if cuda_ndarray is not available.
if
tcn
.
cuda_available
==
False
:
if
tcn
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
@@ -68,7 +68,7 @@ def print_mode(mode):
def
print_diff_mode
(
a
,
b
):
if
(
a
is
not
None
and
isinstance
(
a
,
(
theano
.
compile
.
ProfileMode
,))
and
isinstance
(
b
,
(
theano
.
compile
.
ProfileMode
,))):
isinstance
(
b
,
(
theano
.
compile
.
ProfileMode
,))):
a
.
print_diff_summary
(
b
)
...
...
@@ -138,8 +138,8 @@ def test_run_nnet():
# print "cpu:", rval_cpu
# print "gpu:", rval_gpu
abs_diff
,
rel_diff
=
\
theano
.
gradient
.
numeric_grad
.
abs_rel_err
(
rval_gpu
,
rval_cpu
)
theano
.
gradient
.
numeric_grad
.
abs_rel_err
(
rval_gpu
,
rval_cpu
)
max_abs_diff
=
abs_diff
.
max
()
# print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d" % (
# max_abs_diff, rel_diff.max(), n_in, n_hid)
...
...
@@ -147,19 +147,20 @@ def test_run_nnet():
rtol
=
1e-4
if
n_in
*
n_hid
>=
2048
*
4096
:
rtol
=
7e-4
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
rtol
,
atol
=
1e-6
),
\
(
"max_abs_diff, max_rel_diff, n_in, n_hid"
,
max_abs_diff
,
rel_diff
.
max
(),
n_in
,
n_hid
)
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
rtol
,
atol
=
1e-6
),
\
(
"max_abs_diff, max_rel_diff, n_in, n_hid"
,
max_abs_diff
,
rel_diff
.
max
(),
n_in
,
n_hid
)
def
test_run_nnet_med
():
utt
.
seed_rng
()
r
val_cpu
=
r
un_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
run_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
def
test_run_nnet_small
():
utt
.
seed_rng
()
r
val_cpu
=
r
un_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
run_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
def
run_conv_nnet1
(
use_gpu
):
...
...
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
mode
=
get_mode
(
use_gpu
)
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
train
=
pfunc
(
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
...
...
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op1
=
conv
.
ConvOp
((
n_kern
,
logical_hid_shape
[
0
]
//
2
,
logical_hid_shape
[
1
]
//
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
logical_hid_shape
[
1
]
//
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
[:,
:,
::
2
,
::
2
],
w1
)
+
b1
.
dimshuffle
((
...
...
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
mode
=
get_mode
(
use_gpu
)
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
train
=
pfunc
(
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
...
...
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
if
downsample_ops
:
hid
=
tensor
.
tanh
(
ds_op
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
))))
else
:
hid
=
tensor
.
tanh
((
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)
))[:,
:,
::
2
,
::
2
])
hid
=
tensor
.
tanh
(
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
(
(
0
,
'x'
,
'x'
)))[:,
:,
::
2
,
::
2
])
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
,
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
out
=
tensor
.
nnet
.
softmax
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
out
,
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
out
,
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
# print 'loss type', loss.type
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
...
...
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
mode
=
get_mode
(
use_gpu
,
check_isfinite
)
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
train
=
pfunc
(
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
if
verbose
:
theano
.
printing
.
debugprint
(
train
)
...
...
@@ -422,13 +435,13 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
utt
.
seed_rng
(
seed
)
# Seeds numpy.random with seed
train
,
params
,
x_shape
,
y_shape
,
mode
=
build_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
isize
=
isize
,
ksize
=
ksize
,
n_batch
=
bsize
,
verbose
=
verbose
,
version
=
version
,
check_isfinite
=
check_isfinite
)
use_gpu
=
use_gpu
,
isize
=
isize
,
ksize
=
ksize
,
n_batch
=
bsize
,
verbose
=
verbose
,
version
=
version
,
check_isfinite
=
check_isfinite
)
if
use_gpu
:
device
=
'GPU'
...
...
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
rvals
=
my_zeros
(
n_train
)
t0
=
time
.
time
()
for
i
in
xrange
(
n_train
):
rvals
[
i
]
=
train
(
xval
,
yval
,
lr
)[
0
]
t1
=
time
.
time
()
print_mode
(
mode
)
if
pickle
and
isinstance
(
mode
,
theano
.
compile
.
ProfileMode
):
...
...
@@ -495,35 +506,36 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
compare
=
True
if
not
compare
:
return
run_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
seed
=
seed
,
isize
=
isize
,
ksize
=
ksize
,
bsize
=
bsize
,
n_train
=
n_train
,
check_isfinite
=
check_isfinite
,
pickle
=
pickle
,
verbose
=
verbose
,
version
=
version
)
return
run_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
seed
=
seed
,
isize
=
isize
,
ksize
=
ksize
,
bsize
=
bsize
,
n_train
=
n_train
,
check_isfinite
=
check_isfinite
,
pickle
=
pickle
,
verbose
=
verbose
,
version
=
version
)
utt
.
seed_rng
(
seed
)
# Seeds numpy.random with seed
train_cpu
,
params_cpu
,
x_shape
,
y_shape
,
mode_cpu
=
\
build_conv_nnet2_classif
(
use_gpu
=
False
,
isize
=
isize
,
ksize
=
ksize
,
n_batch
=
bsize
,
verbose
=
verbose
,
version
=
version
,
check_isfinite
=
check_isfinite
)
build_conv_nnet2_classif
(
use_gpu
=
False
,
isize
=
isize
,
ksize
=
ksize
,
n_batch
=
bsize
,
verbose
=
verbose
,
version
=
version
,
check_isfinite
=
check_isfinite
)
utt
.
seed_rng
(
seed
)
# Seeds numpy.random with seed
train_gpu
,
params_gpu
,
x_shape_gpu
,
y_shape_gpu
,
mode_gpu
=
\
build_conv_nnet2_classif
(
use_gpu
=
True
,
isize
=
isize
,
ksize
=
ksize
,
n_batch
=
bsize
,
verbose
=
verbose
,
version
=
version
,
check_isfinite
=
check_isfinite
)
build_conv_nnet2_classif
(
use_gpu
=
True
,
isize
=
isize
,
ksize
=
ksize
,
n_batch
=
bsize
,
verbose
=
verbose
,
version
=
version
,
check_isfinite
=
check_isfinite
)
assert
x_shape
==
x_shape_gpu
assert
y_shape
==
y_shape_gpu
...
...
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
finally
:
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
if
pickle
:
if
isinstance
(
cpu_mode
,
theano
.
compile
.
ProfileMode
):
import
pickle
print
(
"BEGIN CPU profile mode dump"
)
print
(
pickle
.
dumps
(
cpu_mode
))
print
(
"END CPU profile mode dump"
)
if
isinstance
(
gpu_mode
,
theano
.
compile
.
ProfileMode
):
import
pickle
print
(
"BEGIN GPU profile mode dump"
)
print
(
pickle
.
dumps
(
gpu_mode
))
print
(
"END GPU profile mode dump"
)
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# (time_cpu, time_gpu, time_cpu/time_gpu))
# print "Estimated time for one pass through MNIST with CPU: %f" % (
...
...
theano/sandbox/cuda/tests/test_neighbours.py
浏览文件 @
b69ad54d
# Skip test if cuda_ndarray is not available.
from
__future__
import
absolute_import
,
print_function
,
division
from
nose.plugins.skip
import
SkipTest
import
unittest
import
theano.tensor.nnet.tests.test_neighbours
from
theano.sandbox.cuda.neighbours
import
GpuImages2Neibs
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor.nnet.tests.test_neighbours
from
theano.sandbox.cuda.neighbours
import
GpuImages2Neibs
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
...
theano/sandbox/cuda/tests/test_opt.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_rng_curand.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_tensor_op.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_var.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_viewop.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/walltime.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/type.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/var.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/tests/test_flake8.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论