Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
85447abe
提交
85447abe
authored
6月 28, 2010
作者:
James Bergstra
浏览文件
操作
浏览文件
下载
差异文件
merge
上级
13b8fb68
7ae6897c
显示空白字符变更
内嵌
并排
正在显示
8 个修改的文件
包含
635 行增加
和
487 行删除
+635
-487
index.txt
doc/index.txt
+6
-0
nnet.py
theano/sandbox/cuda/nnet.py
+17
-10
test_mlp.py
theano/sandbox/cuda/tests/test_mlp.py
+491
-0
test_nnet.py
theano/sandbox/cuda/tests/test_nnet.py
+82
-456
type.py
theano/sandbox/cuda/type.py
+3
-1
multinomial.py
theano/sandbox/multinomial.py
+28
-12
rng_mrg.py
theano/sandbox/rng_mrg.py
+3
-4
test_rng_mrg.py
theano/sandbox/test_rng_mrg.py
+5
-4
没有找到文件。
doc/index.txt
浏览文件 @
85447abe
...
@@ -52,6 +52,10 @@ Community
...
@@ -52,6 +52,10 @@ Community
* Register and post to `theano-dev`_ if you want to talk to the developers.
* Register and post to `theano-dev`_ if you want to talk to the developers.
* Register and post to `theano-announce`_ if you want to be keep informed on important change on theano(low volume).
* Register and post to `theano-buildbot`_ if you want to receive our daily buildbot email.
* We try to stay organized with `Theano's Trac <http://trac-hg.assembla.com/theano/report/1>`__
* We try to stay organized with `Theano's Trac <http://trac-hg.assembla.com/theano/report/1>`__
* Come visit us in Montreal! Most of the developers are students in the LISA_ group at the `University of Montreal`_.
* Come visit us in Montreal! Most of the developers are students in the LISA_ group at the `University of Montreal`_.
...
@@ -77,6 +81,8 @@ Community
...
@@ -77,6 +81,8 @@ Community
.. _theano-dev: http://groups.google.com/group/theano-dev
.. _theano-dev: http://groups.google.com/group/theano-dev
.. _theano-users: http://groups.google.com/group/theano-users
.. _theano-users: http://groups.google.com/group/theano-users
.. _theano-announce: http://groups.google.com/group/theano-announce
.. _theano-buildbot: http://groups.google.com/group/theano-buildbot
.. _tickets: http://pylearn.org/theano/trac/query?status=accepted&status=assigned&status=new&status=reopened&group=milestone&max=200&col=id&col=summary&col=status&col=owner&col=type&col=priority&col=component&col=time&report=9&order=priority
.. _tickets: http://pylearn.org/theano/trac/query?status=accepted&status=assigned&status=new&status=reopened&group=milestone&max=200&col=id&col=summary&col=status&col=owner&col=type&col=priority&col=component&col=time&report=9&order=priority
.. _LISA: http://www.iro.umontreal.ca/~lisa
.. _LISA: http://www.iro.umontreal.ca/~lisa
...
...
theano/sandbox/cuda/nnet.py
浏览文件 @
85447abe
...
@@ -188,7 +188,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
...
@@ -188,7 +188,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
def
make_node
(
self
,
dy
,
sm
,
y_idx
):
def
make_node
(
self
,
dy
,
sm
,
y_idx
):
return
Apply
(
self
,
[
dy
,
sm
,
y_idx
],[
sm
.
type
()])
return
Apply
(
self
,
[
dy
,
sm
,
y_idx
],[
sm
.
type
()])
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
2
,)
return
(
3
,)
#return ()
#return ()
def
c_code
(
self
,
node
,
nodename
,
(
dnll
,
sm
,
y_idx
),
(
dx
,),
sub
):
def
c_code
(
self
,
node
,
nodename
,
(
dnll
,
sm
,
y_idx
),
(
dx
,),
sub
):
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
...
@@ -229,7 +229,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
...
@@ -229,7 +229,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
kCrossEntropySoftmax1HotWithBiasDx_
%(nodename)
s
kCrossEntropySoftmax1HotWithBiasDx_
%(nodename)
s
<<<
<<<
CudaNdarray_HOST_DIMS(
%(dx)
s)[0],
CudaNdarray_HOST_DIMS(
%(dx)
s)[0],
CudaNdarray_HOST_DIMS(
%(dx)
s)[1]
std::min(CudaNdarray_HOST_DIMS(
%(dx)
s)[1],256)
>>>(
>>>(
CudaNdarray_HOST_DIMS(
%(dx)
s)[0],
CudaNdarray_HOST_DIMS(
%(dx)
s)[0],
CudaNdarray_HOST_DIMS(
%(dx)
s)[1],
CudaNdarray_HOST_DIMS(
%(dx)
s)[1],
...
@@ -303,7 +303,7 @@ class GpuSoftmax (Op):
...
@@ -303,7 +303,7 @@ class GpuSoftmax (Op):
return
shape
return
shape
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
#return ()
#return ()
return
(
1
,)
+
inline_softmax
.
code_version
return
(
2
,)
+
inline_softmax
.
code_version
def
c_code
(
self
,
node
,
nodename
,
(
x
,),
(
z
,),
sub
):
def
c_code
(
self
,
node
,
nodename
,
(
x
,),
(
z
,),
sub
):
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
return
"""
return
"""
...
@@ -330,7 +330,7 @@ class GpuSoftmax (Op):
...
@@ -330,7 +330,7 @@ class GpuSoftmax (Op):
kSoftmax_
%(nodename)
s
kSoftmax_
%(nodename)
s
<<<
<<<
// todo: cap these at the card limits, implement loops in kernel
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(
%(x)
s)[0]
,
std::min(CudaNdarray_HOST_DIMS(
%(x)
s)[0],32*1024)
,
CudaNdarray_HOST_DIMS(
%(x)
s)[1],
CudaNdarray_HOST_DIMS(
%(x)
s)[1],
CudaNdarray_HOST_DIMS(
%(x)
s)[1] * 2 * sizeof(float)
CudaNdarray_HOST_DIMS(
%(x)
s)[1] * 2 * sizeof(float)
>>>(
>>>(
...
@@ -362,11 +362,14 @@ class GpuSoftmax (Op):
...
@@ -362,11 +362,14 @@ class GpuSoftmax (Op):
body
=
[
body
=
[
"extern __shared__ float buf[]"
,
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"float * buf2 = buf + N"
,
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]"
,
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){"
,
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]"
,
"buf2[threadIdx.x] = buf[threadIdx.x]"
,
"buf2[threadIdx.x] = buf[threadIdx.x]"
,
"__syncthreads()"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
"sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]"
,
"__syncthreads()"
,
"}"
,
])
])
...
@@ -386,7 +389,7 @@ class GpuSoftmaxWithBias (Op):
...
@@ -386,7 +389,7 @@ class GpuSoftmaxWithBias (Op):
return
[
shape
[
0
]]
return
[
shape
[
0
]]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
#return ()
#return ()
return
(
1
,)
+
inline_softmax
.
code_version
return
(
2
,)
+
inline_softmax
.
code_version
def
c_code
(
self
,
node
,
nodename
,
(
x
,
b
),
(
z
,),
sub
):
def
c_code
(
self
,
node
,
nodename
,
(
x
,
b
),
(
z
,),
sub
):
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
...
@@ -425,7 +428,7 @@ class GpuSoftmaxWithBias (Op):
...
@@ -425,7 +428,7 @@ class GpuSoftmaxWithBias (Op):
kSoftmaxWithBias_
%(nodename)
s
kSoftmaxWithBias_
%(nodename)
s
<<<
<<<
// todo: cap these at the card limits, implement loops in kernel
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(
%(x)
s)[0]
,
std::min(CudaNdarray_HOST_DIMS(
%(x)
s)[0],32*1024)
,
CudaNdarray_HOST_DIMS(
%(x)
s)[1],
CudaNdarray_HOST_DIMS(
%(x)
s)[1],
CudaNdarray_HOST_DIMS(
%(x)
s)[1] * 2 * sizeof(float)
CudaNdarray_HOST_DIMS(
%(x)
s)[1] * 2 * sizeof(float)
>>>(
>>>(
...
@@ -461,10 +464,14 @@ class GpuSoftmaxWithBias (Op):
...
@@ -461,10 +464,14 @@ class GpuSoftmaxWithBias (Op):
body
=
[
body
=
[
"extern __shared__ float buf[]"
,
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"float * buf2 = buf + N"
,
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]"
,
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){"
,
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]"
,
"buf[threadIdx.x] += b[threadIdx.x * sb0]"
,
"buf[threadIdx.x] += b[threadIdx.x * sb0]"
,
"buf2[threadIdx.x] = buf[threadIdx.x]"
,
"buf2[threadIdx.x] = buf[threadIdx.x]"
,
"__syncthreads()"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
"sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]"
,
"__syncthreads()"
,
"}"
,
])
])
#for (int i = blockIdx.x; i < N; i += gridDim.x)
theano/sandbox/cuda/tests/test_mlp.py
0 → 100644
浏览文件 @
85447abe
import
sys
,
time
import
theano
from
theano.compile.sharedvalue
import
shared
from
theano.compile.pfunc
import
pfunc
from
theano
import
tensor
import
theano.tensor.nnet
from
theano
import
config
import
theano.tensor.nnet.conv
as
conv
import
theano.tensor.signal.downsample
as
downsample
import
numpy
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
'SKIP TO PREVENT THE BUILDBOT FROM CRASHING. THERE IS A DIFFICULT BUG TO FIX WITH MEMORY LEAK AND/OR WHEN Cuda_Ndarray alloc fail!'
)
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.sandbox.cuda
as
tcn
import
logging
logging
.
getLogger
(
'theano.sandbox.cuda.tests.test_nnet'
)
.
setLevel
(
logging
.
INFO
)
def
my_rand
(
*
shape
):
return
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
def
my_randn
(
*
shape
):
return
theano
.
_asarray
(
numpy
.
random
.
randn
(
*
shape
),
dtype
=
'float32'
)
def
my_zeros
(
*
shape
):
return
theano
.
_asarray
(
numpy
.
zeros
(
*
shape
),
dtype
=
'float32'
)
def
get_mode
(
use_gpu
):
ret
=
theano
.
compile
.
get_default_mode
()
if
isinstance
(
ret
,
theano
.
compile
.
ProfileMode
):
ret
=
theano
.
compile
.
ProfileMode
()
if
use_gpu
:
ret
=
ret
.
including
(
'gpu'
)
else
:
ret
=
ret
.
excluding
(
'gpu'
)
return
ret
def
print_mode
(
mode
):
if
mode
!=
None
and
isinstance
(
mode
,(
theano
.
compile
.
ProfileMode
,)):
mode
.
print_summary
()
def
print_diff_mode
(
a
,
b
):
if
a
!=
None
and
isinstance
(
a
,(
theano
.
compile
.
ProfileMode
,))
and
isinstance
(
b
,(
theano
.
compile
.
ProfileMode
,)):
a
.
print_diff_summary
(
b
)
def
run_nnet
(
use_gpu
,
n_batch
=
60
,
n_in
=
1024
,
n_hid
=
2048
,
n_out
=
10
,
n_train
=
100
):
if
config
.
mode
==
'DEBUG_MODE'
:
n_train
=
1
if
use_gpu
:
w
=
tcn
.
shared_constructor
(
0.01
*
(
my_rand
(
n_in
,
n_hid
)
-
0.5
),
'w'
)
b
=
tcn
.
shared_constructor
(
my_zeros
(
n_hid
),
'b'
)
v
=
tcn
.
shared_constructor
(
my_zeros
((
n_hid
,
n_out
)),
'c'
)
c
=
tcn
.
shared_constructor
(
my_zeros
(
n_out
),
'c'
)
else
:
w
=
shared
(
0.01
*
(
my_rand
(
n_in
,
n_hid
)
-
0.5
),
'w'
)
b
=
shared
(
my_zeros
(
n_hid
),
'b'
)
v
=
shared
(
my_zeros
((
n_hid
,
n_out
)),
'c'
)
c
=
shared
(
my_zeros
(
n_out
),
'c'
)
x
=
tensor
.
fmatrix
(
'x'
)
y
=
tensor
.
fmatrix
(
'y'
)
lr
=
tensor
.
fscalar
(
'lr'
)
hid
=
tensor
.
tanh
(
tensor
.
dot
(
x
,
w
)
+
b
)
out
=
tensor
.
tanh
(
tensor
.
dot
(
hid
,
v
)
+
c
)
loss
=
tensor
.
sum
(
0.5
*
(
out
-
y
)
**
2
*
lr
)
if
0
:
print
'loss type'
,
loss
.
type
params
=
[
w
,
b
,
v
,
c
]
gparams
=
tensor
.
grad
(
loss
,
params
)
mode
=
get_mode
(
use_gpu
)
print
'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
if
0
:
for
i
,
n
in
enumerate
(
train
.
maker
.
env
.
toposort
()):
print
i
,
n
xval
=
my_rand
(
n_batch
,
n_in
)
yval
=
my_rand
(
n_batch
,
n_out
)
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
t0
=
time
.
time
()
rval
=
[]
for
i
in
xrange
(
n_train
):
rval
.
append
(
train
(
xval
,
yval
,
lr
))
dt
=
time
.
time
()
-
t0
print_mode
(
mode
)
return
numpy
.
asarray
(
rval
),
dt
def
test_run_nnet
():
for
n_in
in
1024
,
2048
,
4096
:
for
n_hid
in
1024
,
2048
,
4096
:
numpy
.
random
.
seed
(
23456
)
rval_cpu
,
tc
=
run_nnet
(
False
,
n_in
=
n_in
,
n_hid
=
n_hid
)
numpy
.
random
.
seed
(
23456
)
rval_gpu
,
tg
=
run_nnet
(
True
,
n_in
=
n_in
,
n_hid
=
n_hid
)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
print
"max abs diff:"
,
numpy
.
max
(
numpy
.
absolute
(
rval_gpu
-
rval_cpu
))
print
"time cpu:
%
f, time gpu:
%
f, speed up
%
f"
%
(
tc
,
tg
,
tc
/
tg
)
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
1e-4
,
atol
=
1e-6
)
def
test_run_nnet_med
():
numpy
.
random
.
seed
(
23456
)
rval_cpu
=
run_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
def
test_run_nnet_small
():
numpy
.
random
.
seed
(
23456
)
rval_cpu
=
run_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
def
run_conv_nnet1
(
use_gpu
):
if
use_gpu
:
shared_fn
=
tcn
.
shared_constructor
else
:
shared_fn
=
shared
n_batch
=
16
n_kern
=
20
shape_img
=
(
n_batch
,
1
,
32
,
32
)
shape_kern
=
(
n_kern
,
1
,
5
,
5
)
n_train
=
10
if
config
.
mode
==
'DEBUG_MODE'
:
n_train
=
1
logical_hid_shape
=
tcn
.
blas
.
GpuConv
.
logical_output_shape_2d
(
shape_img
[
2
:],
shape_kern
[
2
:],
'valid'
)
n_hid
=
n_kern
*
logical_hid_shape
[
0
]
*
logical_hid_shape
[
1
]
n_out
=
10
w
=
shared_fn
(
0.01
*
(
my_rand
(
*
shape_kern
)
-
0.5
),
'w'
)
b
=
shared_fn
(
my_zeros
((
n_kern
,)),
'b'
)
v
=
shared_fn
(
my_zeros
((
n_hid
,
n_out
)),
'c'
)
c
=
shared_fn
(
my_zeros
(
n_out
),
'c'
)
x
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,
1
,
0
,
0
))(
'x'
)
y
=
tensor
.
fmatrix
(
'y'
)
lr
=
tensor
.
fscalar
(
'lr'
)
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op
.
set_flops
()
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w
)
+
b
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid_flat
=
hid
.
reshape
((
n_batch
,
n_hid
))
out
=
tensor
.
tanh
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
loss
=
tensor
.
sum
(
0.5
*
(
out
-
y
)
**
2
*
lr
)
print
'loss type'
,
loss
.
type
params
=
[
w
,
b
,
v
,
c
]
gparams
=
tensor
.
grad
(
loss
,
params
)
mode
=
get_mode
(
use_gpu
)
print
'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval
=
my_rand
(
*
shape_img
)
yval
=
my_rand
(
n_batch
,
n_out
)
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
for
i
in
xrange
(
n_train
):
rval
=
train
(
xval
,
yval
,
lr
)
print
'training done'
print_mode
(
mode
)
return
rval
def
test_conv_nnet1
():
numpy
.
random
.
seed
(
23456
)
rval_cpu
=
run_conv_nnet1
(
False
)
numpy
.
random
.
seed
(
23456
)
rval_gpu
=
run_conv_nnet1
(
True
)
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
1e-4
,
atol
=
1e-6
)
def
run_conv_nnet2
(
use_gpu
):
# pretend we are training LeNet for MNIST
if
use_gpu
:
shared_fn
=
tcn
.
shared_constructor
else
:
shared_fn
=
shared
#cumulativ rounding error affect this comparaison of result. So we lower the tolerance.
#TODO: why the last two example see the error lower? We are converging?
#n_train=10, n_batch=3, n_kern=1, n_kern1=1, error see of 1e-9
#n_train=10, n_batch=3, n_kern=10, n_kern1=1, error see of -1.27777e-06
#n_train=10, n_batch=3, n_kern=10, n_kern1=10, error see of -6.91377e-05
#n_train=10, n_batch=30, n_kern=10, n_kern1=10, error see of -0.00185963
#n_train=10, n_batch=60, n_kern=10, n_kern1=10, error see of -5.26905e-05
#n_train=30, n_batch=60, n_kern=10, n_kern1=10, error see of -3.8147e-06
#n_train=30, n_batch=60, n_kern=20, n_kern1=10, error see of 6.82771e-05
#n_train=30, n_batch=60, n_kern=20, n_kern1=30, error see of 0.000231534
n_batch
=
60
shape_img
=
(
n_batch
,
1
,
32
,
32
)
n_kern
=
20
shape_kern
=
(
n_kern
,
1
,
5
,
5
)
n_kern1
=
10
shape_kern1
=
(
n_kern1
,
n_kern
,
5
,
5
)
n_train
=
30
if
config
.
mode
==
'DEBUG_MODE'
:
n_train
=
1
logical_hid_shape
=
tcn
.
blas
.
GpuConv
.
logical_output_shape_2d
(
tuple
(
shape_img
[
2
:]),
tuple
(
shape_kern
[
2
:]),
'valid'
)
logical_hid_shape1
=
tcn
.
blas
.
GpuConv
.
logical_output_shape_2d
((
logical_hid_shape
[
0
]
/
2
,
logical_hid_shape
[
1
]
/
2
),
tuple
(
shape_kern1
[
2
:]),
'valid'
)
n_hid
=
n_kern1
*
logical_hid_shape1
[
0
]
*
logical_hid_shape1
[
1
]
n_out
=
10
w0
=
shared_fn
(
0.01
*
(
my_rand
(
*
shape_kern
)
-
0.5
),
'w0'
)
b0
=
shared_fn
(
my_zeros
((
n_kern
,)),
'b0'
)
w1
=
shared_fn
(
0.01
*
(
my_rand
(
*
shape_kern1
)
-
0.5
),
'w1'
)
b1
=
shared_fn
(
my_zeros
((
n_kern1
,)),
'b1'
)
v
=
shared_fn
(
my_zeros
((
n_hid
,
n_out
)),
'c'
)
c
=
shared_fn
(
my_zeros
(
n_out
),
'c'
)
x
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,
1
,
0
,
0
))(
'x'
)
y
=
tensor
.
fmatrix
(
'y'
)
lr
=
tensor
.
fscalar
(
'lr'
)
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op1
=
conv
.
ConvOp
((
n_kern
,
logical_hid_shape
[
0
]
/
2
,
logical_hid_shape
[
1
]
/
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
conv_op
.
set_flops
()
conv_op1
.
set_flops
()
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
[:,:,::
2
,::
2
],
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
out
=
tensor
.
tanh
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
loss
=
tensor
.
sum
(
0.5
*
(
out
-
y
)
**
2
*
lr
)
print
'loss type'
,
loss
.
type
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
gparams
=
tensor
.
grad
(
loss
,
params
)
mode
=
get_mode
(
use_gpu
)
print
'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval
=
my_rand
(
*
shape_img
)
yval
=
my_rand
(
n_batch
,
n_out
)
#int32 make all 0...
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
for
i
in
xrange
(
n_train
):
rval
=
train
(
xval
,
yval
,
lr
)
print_mode
(
mode
)
return
rval
def
test_conv_nnet2
():
numpy
.
random
.
seed
(
23456
)
rval_gpu
=
run_conv_nnet2
(
True
)
if
True
:
numpy
.
random
.
seed
(
23456
)
rval_cpu
=
run_conv_nnet2
(
False
)
print
rval_cpu
[
0
],
rval_gpu
[
0
],
rval_cpu
[
0
]
-
rval_gpu
[
0
]
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
1e-4
,
atol
=
1e-4
)
def
run_conv_nnet2_classif
(
use_gpu
,
isize
,
ksize
,
n_batch
,
n_train
,
downsample_ops
=
True
,
verbose
=
0
,
version
=-
1
):
if
use_gpu
:
shared_fn
=
tcn
.
shared_constructor
else
:
shared_fn
=
shared
isize1
=
isize
isize2
=
isize
if
isinstance
(
isize
,(
tuple
,)):
isize1
=
isize
[
0
]
isize2
=
isize
[
1
]
shape_img
=
(
n_batch
,
1
,
isize1
,
isize2
)
n_kern
=
20
# 6 were used in LeNet5
shape_kern
=
(
n_kern
,
1
,
ksize
,
ksize
)
n_kern1
=
30
# 16 were used in LeNet5
shape_kern1
=
(
n_kern1
,
n_kern
,
ksize
,
ksize
)
logical_hid_shape
=
tcn
.
blas
.
GpuConv
.
logical_output_shape_2d
((
isize1
,
isize2
),
(
ksize
,
ksize
),
'valid'
)
logical_hid_shape1
=
tcn
.
blas
.
GpuConv
.
logical_output_shape_2d
((
logical_hid_shape
[
0
]
/
2
,
logical_hid_shape
[
1
]
/
2
),
(
ksize
,
ksize
),
'valid'
)
n_hid
=
n_kern1
*
logical_hid_shape1
[
0
]
*
logical_hid_shape1
[
1
]
n_out
=
10
w0
=
shared_fn
(
0.01
*
(
my_rand
(
*
shape_kern
)
-
0.5
),
'w0'
)
b0
=
shared_fn
(
my_zeros
((
n_kern
,)),
'b0'
)
w1
=
shared_fn
(
0.01
*
(
my_rand
(
*
shape_kern1
)
-
0.5
),
'w1'
)
b1
=
shared_fn
(
my_zeros
((
n_kern1
,)),
'b1'
)
v
=
shared_fn
(
0.01
*
my_randn
(
n_hid
,
n_out
),
'v'
)
c
=
shared_fn
(
my_zeros
(
n_out
),
'c'
)
print
'ALLOCATING ARCH: w0 shape'
,
w0
.
value
.
shape
print
'ALLOCATING ARCH: w1 shape'
,
w1
.
value
.
shape
print
'ALLOCATING ARCH: v shape'
,
v
.
value
.
shape
x
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,
1
,
0
,
0
))(
'x'
)
y
=
tensor
.
fmatrix
(
'y'
)
lr
=
tensor
.
fscalar
(
'lr'
)
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
,
verbose
=
verbose
,
version
=
version
)
conv_op1
=
conv
.
ConvOp
(
(
n_kern
,
logical_hid_shape
[
0
]
/
2
,
logical_hid_shape
[
1
]
/
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
,
verbose
=
verbose
,
version
=
version
)
conv_op
.
set_flops
()
conv_op1
.
set_flops
()
ds_op
=
downsample
.
DownsampleFactorMax
((
2
,
2
),
ignore_border
=
False
)
if
downsample_ops
:
hid
=
tensor
.
tanh
(
ds_op
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
))))
else
:
hid
=
tensor
.
tanh
((
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))[:,:,::
2
,::
2
])
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
,
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
out
=
tensor
.
nnet
.
softmax
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
out
,
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
print
'loss type'
,
loss
.
type
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
gparams
=
tensor
.
grad
(
loss
,
params
,
warn_type
=
True
)
mode
=
get_mode
(
use_gpu
)
print
'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
if
False
:
for
i
,
n
in
enumerate
(
train
.
maker
.
env
.
toposort
()):
print
i
,
n
xval
=
my_rand
(
*
shape_img
)
yval
=
my_rand
(
n_batch
,
n_out
)
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
rvals
=
my_zeros
(
n_train
)
t0
=
time
.
time
()
for
i
in
xrange
(
n_train
):
rvals
[
i
]
=
train
(
xval
,
yval
,
lr
)[
0
]
t1
=
time
.
time
()
print_mode
(
mode
)
return
rvals
,
t1
-
t0
,
mode
def
cmp_run_conv_nnet2_classif
(
seed
,
isize
,
ksize
,
bsize
,
ignore_error
=
False
,
n_train
=
10
,
gpu_only
=
False
,
cpu_only
=
False
,
float_atol
=
1e-06
,
check_isfinite
=
True
,
pickle
=
False
,
verbose
=
0
,
version
=-
1
):
"""
float_atol: None mean use the default value.
check_isfinite: the debug mode option. We forward this value to debug mode.
For some parameter CrossentropyCategorical1Hot op generate inf when not optimized.
"""
if
config
.
mode
==
'DEBUG_MODE'
:
n_train
=
1
numpy
.
random
.
seed
(
seed
)
import
theano.tensor.basic
import
theano.compile.debugmode
from
theano.compile.mode
import
predefined_modes
orig_float32_atol
=
theano
.
tensor
.
basic
.
float32_atol
orig_check_isfinite
=
predefined_modes
[
"DEBUG_MODE"
]
.
check_isfinite
try
:
predefined_modes
[
"DEBUG_MODE"
]
.
check_isfinite
=
check_isfinite
if
gpu_only
:
tcn
.
use
()
if
float_atol
:
print
"float_atol"
,
float_atol
theano
.
tensor
.
basic
.
float32_atol
=
float_atol
if
not
cpu_only
:
rval_gpu
,
tg
,
gpu_mode
=
run_conv_nnet2_classif
(
True
,
isize
,
ksize
,
bsize
,
n_train
,
verbose
=
verbose
,
version
=
version
)
finally
:
predefined_modes
[
"DEBUG_MODE"
]
.
check_isfinite
=
orig_check_isfinite
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
if
gpu_only
:
print
"time gpu:
%.3
f"
%
(
tg
)
return
try
:
predefined_modes
[
"DEBUG_MODE"
]
.
check_isfinite
=
check_isfinite
numpy
.
random
.
seed
(
seed
)
rval_cpu
,
tc
,
cpu_mode
=
run_conv_nnet2_classif
(
False
,
isize
,
ksize
,
bsize
,
n_train
,
verbose
=
verbose
,
version
=
version
)
if
pickle
and
isinstance
(
cpu_mode
,(
theano
.
compile
.
ProfileMode
,)):
import
pickle
print
"BEGIN GPU profile mode dump"
#print pickle.dumps(gpu_mode)
print
"END GPU profile mode dump"
print
"BEGIN CPU profile mode dump"
print
pickle
.
dumps
(
cpu_mode
)
print
"END CPU profile mode dump"
finally
:
predefined_modes
[
"DEBUG_MODE"
]
.
check_isfinite
=
orig_check_isfinite
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
if
not
cpu_only
:
if
verbose
or
not
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
1e-3
,
atol
=
float_atol
):
print
"cpu:"
,
rval_cpu
print
"gpu:"
,
rval_gpu
print
"abs diff:"
,
numpy
.
absolute
(
rval_gpu
-
rval_cpu
)
print
"time cpu:
%.3
f, time gpu:
%.3
f, speed up
%
f"
%
(
tc
,
tg
,
tc
/
tg
)
print
"estimated time for one pass through MNIST with cpu:
%
f"
%
(
tc
*
(
60000.0
/
(
n_train
*
bsize
)))
print
"estimated time for one pass through MNIST with gpu:
%
f"
%
(
tg
*
(
60000.0
/
(
n_train
*
bsize
)))
else
:
print
"time cpu:
%.3
f"
%
(
tc
)
print
"estimated time for one pass through MNIST with cpu:
%
f"
%
(
tc
*
(
60000.0
/
(
n_train
*
bsize
)))
if
not
ignore_error
and
not
cpu_only
and
not
gpu_only
:
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
1e-3
,
atol
=
float_atol
)
gpu_only
=
False
cpu_only
=
False
ignore_error
=
False
verbose
=
0
version
=-
1
def
test_lenet_28
():
#MNIST
cmp_run_conv_nnet2_classif
(
23485
,
28
,
5
,
60
,
n_train
=
10
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
version
=
version
)
def
test_lenet_32
():
#CIFAR10 / Shapeset
cmp_run_conv_nnet2_classif
(
23485
,
32
,
5
,
60
,
n_train
=
10
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
verbose
=
verbose
,
version
=
version
)
def
test_lenet_32_long
():
#CIFAR10 / Shapeset
# this tests the gradient of downsample on the GPU,
# which does not recieve specific testing
cmp_run_conv_nnet2_classif
(
23485
,
32
,
5
,
30
,
n_train
=
50
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
version
=
version
)
def
test_lenet_64
():
# ???
#float_atol need to pass in debug mode
#needed as cpu use extended precision and gpu don't
cmp_run_conv_nnet2_classif
(
23485
,
64
,
7
,
10
,
n_train
=
10
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
float_atol
=
5e-4
,
check_isfinite
=
True
,
version
=
version
)
def
test_lenet_108
():
# NORB
cmp_run_conv_nnet2_classif
(
23485
,
108
,
7
,
5
,
n_train
=
4
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
check_isfinite
=
True
,
version
=
version
,
float_atol
=
7e-2
)
def
test_lenet_256
():
# ImageNet
cmp_run_conv_nnet2_classif
(
23485
,
256
,
9
,
2
,
n_train
=
5
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
check_isfinite
=
True
,
version
=
version
)
#I did a wanted error in the name as we don't want it to execute automatically for now as it don't work
def
tes_lenet_hd
():
#HD 720p: 1280(wid)x720(len)
cmp_run_conv_nnet2_classif
(
23485
,
(
720
,
1280
),
9
,
2
,
n_train
=
3
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
check_isfinite
=
True
,
version
=
version
)
#I did a wanted error in the name as we don't want it to execute automatically for now as it don't work
def
tes_lenet_full_hd
():
#HD 1080p: 1920(wid)x1080(len)
cmp_run_conv_nnet2_classif
(
23485
,
(
1080
,
1920
),
9
,
2
,
n_train
=
3
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
check_isfinite
=
True
,
version
=
version
)
theano/sandbox/cuda/tests/test_nnet.py
浏览文件 @
85447abe
import
sys
,
time
import
theano
,
numpy
import
theano
import
theano.tensor
as
T
from
theano.compile.sharedvalue
import
shared
from
theano.compile.pfunc
import
pfunc
from
theano
import
tensor
import
theano.tensor.nnet
from
theano
import
config
import
theano.tensor.nnet.conv
as
conv
import
theano.tensor.signal.downsample
as
downsample
import
numpy
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
'SKIP TO PREVENT THE BUILDBOT FROM CRASHING. THERE IS A DIFFICULT BUG TO FIX WITH MEMORY LEAK AND/OR WHEN Cuda_Ndarray alloc fail!'
)
import
theano.sandbox.cuda
as
cuda
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
==
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.sandbox.cuda
as
tcn
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
import
logging
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpu'
)
logging
.
getLogger
(
'theano.sandbox.cuda.tests.test_nnet'
)
.
setLevel
(
logging
.
INFO
)
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpu'
)
def
my_rand
(
*
shape
):
return
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
def
my_randn
(
*
shape
):
return
theano
.
_asarray
(
numpy
.
random
.
randn
(
*
shape
),
dtype
=
'float32'
)
def
my_zeros
(
*
shape
):
return
theano
.
_asarray
(
numpy
.
zeros
(
*
shape
),
dtype
=
'float32'
)
def
get_mode
(
use_gpu
):
ret
=
theano
.
compile
.
get_default_mode
()
if
isinstance
(
ret
,
theano
.
compile
.
ProfileMode
):
ret
=
theano
.
compile
.
ProfileMode
()
if
use_gpu
:
ret
=
ret
.
including
(
'gpu'
)
else
:
ret
=
ret
.
excluding
(
'gpu'
)
return
ret
def
print_mode
(
mode
):
if
mode
!=
None
and
isinstance
(
mode
,(
theano
.
compile
.
ProfileMode
,)):
mode
.
print_summary
()
def
print_diff_mode
(
a
,
b
):
if
a
!=
None
and
isinstance
(
a
,(
theano
.
compile
.
ProfileMode
,))
and
isinstance
(
b
,(
theano
.
compile
.
ProfileMode
,)):
a
.
print_diff_summary
(
b
)
def
run_nnet
(
use_gpu
,
n_batch
=
60
,
n_in
=
1024
,
n_hid
=
2048
,
n_out
=
10
,
n_train
=
100
):
if
config
.
mode
==
'DEBUG_MODE'
:
n_train
=
1
if
use_gpu
:
w
=
tcn
.
shared_constructor
(
0.01
*
(
my_rand
(
n_in
,
n_hid
)
-
0.5
),
'w'
)
b
=
tcn
.
shared_constructor
(
my_zeros
(
n_hid
),
'b'
)
v
=
tcn
.
shared_constructor
(
my_zeros
((
n_hid
,
n_out
)),
'c'
)
c
=
tcn
.
shared_constructor
(
my_zeros
(
n_out
),
'c'
)
else
:
w
=
shared
(
0.01
*
(
my_rand
(
n_in
,
n_hid
)
-
0.5
),
'w'
)
b
=
shared
(
my_zeros
(
n_hid
),
'b'
)
v
=
shared
(
my_zeros
((
n_hid
,
n_out
)),
'c'
)
c
=
shared
(
my_zeros
(
n_out
),
'c'
)
x
=
tensor
.
fmatrix
(
'x'
)
y
=
tensor
.
fmatrix
(
'y'
)
lr
=
tensor
.
fscalar
(
'lr'
)
hid
=
tensor
.
tanh
(
tensor
.
dot
(
x
,
w
)
+
b
)
out
=
tensor
.
tanh
(
tensor
.
dot
(
hid
,
v
)
+
c
)
loss
=
tensor
.
sum
(
0.5
*
(
out
-
y
)
**
2
*
lr
)
if
0
:
print
'loss type'
,
loss
.
type
params
=
[
w
,
b
,
v
,
c
]
gparams
=
tensor
.
grad
(
loss
,
params
)
mode
=
get_mode
(
use_gpu
)
print
'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
if
0
:
for
i
,
n
in
enumerate
(
train
.
maker
.
env
.
toposort
()):
print
i
,
n
xval
=
my_rand
(
n_batch
,
n_in
)
yval
=
my_rand
(
n_batch
,
n_out
)
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
t0
=
time
.
time
()
rval
=
[]
for
i
in
xrange
(
n_train
):
rval
.
append
(
train
(
xval
,
yval
,
lr
))
dt
=
time
.
time
()
-
t0
print_mode
(
mode
)
return
numpy
.
asarray
(
rval
),
dt
def
test_run_nnet
():
for
n_in
in
1024
,
2048
,
4096
:
for
n_hid
in
1024
,
2048
,
4096
:
numpy
.
random
.
seed
(
23456
)
rval_cpu
,
tc
=
run_nnet
(
False
,
n_in
=
n_in
,
n_hid
=
n_hid
)
numpy
.
random
.
seed
(
23456
)
rval_gpu
,
tg
=
run_nnet
(
True
,
n_in
=
n_in
,
n_hid
=
n_hid
)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
print
"max abs diff:"
,
numpy
.
max
(
numpy
.
absolute
(
rval_gpu
-
rval_cpu
))
print
"time cpu:
%
f, time gpu:
%
f, speed up
%
f"
%
(
tc
,
tg
,
tc
/
tg
)
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
1e-4
,
atol
=
1e-6
)
def
test_run_nnet_med
():
numpy
.
random
.
seed
(
23456
)
rval_cpu
=
run_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
def
test_run_nnet_small
():
numpy
.
random
.
seed
(
23456
)
rval_cpu
=
run_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
def
run_conv_nnet1
(
use_gpu
):
if
use_gpu
:
shared_fn
=
tcn
.
shared_constructor
else
:
shared_fn
=
shared
n_batch
=
16
n_kern
=
20
shape_img
=
(
n_batch
,
1
,
32
,
32
)
shape_kern
=
(
n_kern
,
1
,
5
,
5
)
n_train
=
10
if
config
.
mode
==
'DEBUG_MODE'
:
n_train
=
1
logical_hid_shape
=
tcn
.
blas
.
GpuConv
.
logical_output_shape_2d
(
shape_img
[
2
:],
shape_kern
[
2
:],
'valid'
)
n_hid
=
n_kern
*
logical_hid_shape
[
0
]
*
logical_hid_shape
[
1
]
n_out
=
10
w
=
shared_fn
(
0.01
*
(
my_rand
(
*
shape_kern
)
-
0.5
),
'w'
)
b
=
shared_fn
(
my_zeros
((
n_kern
,)),
'b'
)
v
=
shared_fn
(
my_zeros
((
n_hid
,
n_out
)),
'c'
)
c
=
shared_fn
(
my_zeros
(
n_out
),
'c'
)
x
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,
1
,
0
,
0
))(
'x'
)
y
=
tensor
.
fmatrix
(
'y'
)
lr
=
tensor
.
fscalar
(
'lr'
)
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op
.
set_flops
()
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w
)
+
b
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid_flat
=
hid
.
reshape
((
n_batch
,
n_hid
))
out
=
tensor
.
tanh
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
loss
=
tensor
.
sum
(
0.5
*
(
out
-
y
)
**
2
*
lr
)
print
'loss type'
,
loss
.
type
params
=
[
w
,
b
,
v
,
c
]
gparams
=
tensor
.
grad
(
loss
,
params
)
mode
=
get_mode
(
use_gpu
)
print
'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval
=
my_rand
(
*
shape_img
)
yval
=
my_rand
(
n_batch
,
n_out
)
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
for
i
in
xrange
(
n_train
):
def
test_GpuCrossentropySoftmax1HotWithBiasDx
():
rval
=
train
(
xval
,
yval
,
lr
)
"""
print
'training done'
This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias and GpuCrossentropySoftmax1HotWithBiasDx
print_mode
(
mode
)
return
rval
def
test_conv_nnet1
():
numpy
.
random
.
seed
(
23456
)
rval_cpu
=
run_conv_nnet1
(
False
)
numpy
.
random
.
seed
(
23456
)
rval_gpu
=
run_conv_nnet1
(
True
)
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
1e-4
,
atol
=
1e-6
)
def
run_conv_nnet2
(
use_gpu
):
# pretend we are training LeNet for MNIST
if
use_gpu
:
shared_fn
=
tcn
.
shared_constructor
else
:
shared_fn
=
shared
#cumulativ rounding error affect this comparaison of result. So we lower the tolerance.
#TODO: why the last two example see the error lower? We are converging?
#n_train=10, n_batch=3, n_kern=1, n_kern1=1, error see of 1e-9
#n_train=10, n_batch=3, n_kern=10, n_kern1=1, error see of -1.27777e-06
#n_train=10, n_batch=3, n_kern=10, n_kern1=10, error see of -6.91377e-05
#n_train=10, n_batch=30, n_kern=10, n_kern1=10, error see of -0.00185963
#n_train=10, n_batch=60, n_kern=10, n_kern1=10, error see of -5.26905e-05
#n_train=30, n_batch=60, n_kern=10, n_kern1=10, error see of -3.8147e-06
#n_train=30, n_batch=60, n_kern=20, n_kern1=10, error see of 6.82771e-05
#n_train=30, n_batch=60, n_kern=20, n_kern1=30, error see of 0.000231534
n_batch
=
60
shape_img
=
(
n_batch
,
1
,
32
,
32
)
n_kern
=
20
shape_kern
=
(
n_kern
,
1
,
5
,
5
)
n_kern1
=
10
shape_kern1
=
(
n_kern1
,
n_kern
,
5
,
5
)
n_train
=
30
if
config
.
mode
==
'DEBUG_MODE'
:
n_train
=
1
logical_hid_shape
=
tcn
.
blas
.
GpuConv
.
logical_output_shape_2d
(
tuple
(
shape_img
[
2
:]),
tuple
(
shape_kern
[
2
:]),
'valid'
)
logical_hid_shape1
=
tcn
.
blas
.
GpuConv
.
logical_output_shape_2d
((
logical_hid_shape
[
0
]
/
2
,
logical_hid_shape
[
1
]
/
2
),
tuple
(
shape_kern1
[
2
:]),
'valid'
)
n_hid
=
n_kern1
*
logical_hid_shape1
[
0
]
*
logical_hid_shape1
[
1
]
n_out
=
10
w0
=
shared_fn
(
0.01
*
(
my_rand
(
*
shape_kern
)
-
0.5
),
'w0'
)
b0
=
shared_fn
(
my_zeros
((
n_kern
,)),
'b0'
)
w1
=
shared_fn
(
0.01
*
(
my_rand
(
*
shape_kern1
)
-
0.5
),
'w1'
)
b1
=
shared_fn
(
my_zeros
((
n_kern1
,)),
'b1'
)
v
=
shared_fn
(
my_zeros
((
n_hid
,
n_out
)),
'c'
)
c
=
shared_fn
(
my_zeros
(
n_out
),
'c'
)
x
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,
1
,
0
,
0
))(
'x'
)
y
=
tensor
.
fmatrix
(
'y'
)
lr
=
tensor
.
fscalar
(
'lr'
)
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op1
=
conv
.
ConvOp
((
n_kern
,
logical_hid_shape
[
0
]
/
2
,
logical_hid_shape
[
1
]
/
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
conv_op
.
set_flops
()
conv_op1
.
set_flops
()
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
[:,:,::
2
,::
2
],
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
out
=
tensor
.
tanh
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
loss
=
tensor
.
sum
(
0.5
*
(
out
-
y
)
**
2
*
lr
)
print
'loss type'
,
loss
.
type
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
gparams
=
tensor
.
grad
(
loss
,
params
)
mode
=
get_mode
(
use_gpu
)
print
'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval
=
my_rand
(
*
shape_img
)
yval
=
my_rand
(
n_batch
,
n_out
)
#int32 make all 0...
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
for
i
in
xrange
(
n_train
):
rval
=
train
(
xval
,
yval
,
lr
)
print_mode
(
mode
)
return
rval
def
test_conv_nnet2
():
numpy
.
random
.
seed
(
23456
)
rval_gpu
=
run_conv_nnet2
(
True
)
if
True
:
numpy
.
random
.
seed
(
23456
)
rval_cpu
=
run_conv_nnet2
(
False
)
print
rval_cpu
[
0
],
rval_gpu
[
0
],
rval_cpu
[
0
]
-
rval_gpu
[
0
]
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
1e-4
,
atol
=
1e-4
)
def
run_conv_nnet2_classif
(
use_gpu
,
isize
,
ksize
,
n_batch
,
n_train
,
downsample_ops
=
True
,
verbose
=
0
,
version
=-
1
):
if
use_gpu
:
shared_fn
=
tcn
.
shared_constructor
else
:
shared_fn
=
shared
isize1
=
isize
isize2
=
isize
if
isinstance
(
isize
,(
tuple
,)):
isize1
=
isize
[
0
]
isize2
=
isize
[
1
]
shape_img
=
(
n_batch
,
1
,
isize1
,
isize2
)
n_kern
=
20
# 6 were used in LeNet5
shape_kern
=
(
n_kern
,
1
,
ksize
,
ksize
)
n_kern1
=
30
# 16 were used in LeNet5
shape_kern1
=
(
n_kern1
,
n_kern
,
ksize
,
ksize
)
logical_hid_shape
=
tcn
.
blas
.
GpuConv
.
logical_output_shape_2d
((
isize1
,
isize2
),
(
ksize
,
ksize
),
'valid'
)
We check that we loop when their is too much threads
logical_hid_shape1
=
tcn
.
blas
.
GpuConv
.
logical_output_shape_2d
((
logical_hid_shape
[
0
]
/
2
,
TODO: check that we loop when their is too much block(>32*1024)
logical_hid_shape
[
1
]
/
2
),
(
ksize
,
ksize
),
'valid'
)
"""
n_hid
=
n_kern1
*
logical_hid_shape1
[
0
]
*
logical_hid_shape1
[
1
]
n_out
=
10
n_in
=
1000
batch_size
=
4097
n_out
=
1250
w0
=
shared_fn
(
0.01
*
(
my_rand
(
*
shape_kern
)
-
0.5
),
'w0'
)
if
theano
.
config
.
mode
!=
"DEBUG_MODE"
:
b0
=
shared_fn
(
my_zeros
((
n_kern
,)),
'b0'
)
n_in
=
4098
w1
=
shared_fn
(
0.01
*
(
my_rand
(
*
shape_kern1
)
-
0.5
),
'w1'
)
n_out
=
4099
b1
=
shared_fn
(
my_zeros
((
n_kern1
,)),
'b1'
)
v
=
shared_fn
(
0.01
*
my_randn
(
n_hid
,
n_out
),
'v'
)
c
=
shared_fn
(
my_zeros
(
n_out
),
'c'
)
print
'ALLOCATING ARCH: w0 shape'
,
w0
.
value
.
shape
x
=
T
.
fmatrix
(
'x'
)
print
'ALLOCATING ARCH: w1 shape'
,
w1
.
value
.
shape
y
=
T
.
lvector
(
'y'
)
print
'ALLOCATING ARCH: v shape'
,
v
.
value
.
shape
x
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,
1
,
0
,
0
))(
'x'
)
y
=
tensor
.
fmatrix
(
'y'
)
lr
=
tensor
.
fscalar
(
'lr'
)
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
b
=
T
.
fvector
()
n_batch
,
1
,
1
,
verbose
=
verbose
,
version
=
version
)
W
=
T
.
fmatrix
()
conv_op1
=
conv
.
ConvOp
(
(
n_kern
,
logical_hid_shape
[
0
]
/
2
,
logical_hid_shape
[
1
]
/
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
,
verbose
=
verbose
,
version
=
version
)
conv_op
.
set_flops
()
conv_op1
.
set_flops
()
ds_op
=
downsample
.
DownsampleFactorMax
((
2
,
2
),
ignore_border
=
False
)
p_y_given_x
=
T
.
nnet
.
softmax
(
T
.
dot
(
x
,
W
)
+
b
)
if
downsample_ops
:
y_pred
=
T
.
argmax
(
p_y_given_x
)
hid
=
tensor
.
tanh
(
ds_op
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
))))
loss
=
-
T
.
mean
(
T
.
log
(
p_y_given_x
)[
T
.
arange
(
y
.
shape
[
0
]),
y
])
else
:
dW
=
T
.
grad
(
loss
,
W
)
hid
=
tensor
.
tanh
((
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))[:,:,::
2
,::
2
])
classify
=
theano
.
function
(
inputs
=
[
x
,
y
,
b
,
W
],
outputs
=
[
loss
,
y_pred
,
dW
],
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
,
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
mode
=
mode_without_gpu
)
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
classify_gpu
=
theano
.
function
(
inputs
=
[
x
,
y
,
b
,
W
],
outputs
=
[
loss
,
y_pred
,
dW
],
out
=
tensor
.
nnet
.
softmax
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
mode
=
mode_with_gpu
)
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
out
,
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
print
'loss type'
,
loss
.
type
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
xx
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
batch_size
,
n_in
),
dtype
=
numpy
.
float32
)
gparams
=
tensor
.
grad
(
loss
,
params
,
warn_type
=
True
)
yy
=
numpy
.
ones
((
batch_size
,),
dtype
=
'float32'
)
b_values
=
numpy
.
zeros
((
n_out
,),
dtype
=
'float32'
)
W_values
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
n_in
,
n_out
),
dtype
=
'float32'
)
mode
=
get_mode
(
use_gpu
)
print
'building pfunc ...'
assert
any
([
isinstance
(
node
.
op
,
T
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
)
for
node
in
classify
.
maker
.
env
.
toposort
()])
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
assert
any
([
isinstance
(
node
.
op
,
T
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
)
for
node
in
classify
.
maker
.
env
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
cuda
.
nnet
.
GpuCrossentropySoftmaxArgmax1HotWithBias
)
for
node
in
classify_gpu
.
maker
.
env
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
cuda
.
nnet
.
GpuCrossentropySoftmax1HotWithBiasDx
)
for
node
in
classify_gpu
.
maker
.
env
.
toposort
()])
if
False
:
out
=
classify
(
xx
,
yy
,
b_values
,
W_values
)
for
i
,
n
in
enumerate
(
train
.
maker
.
env
.
toposort
()):
gout
=
classify_gpu
(
xx
,
yy
,
b_values
,
W_values
)
print
i
,
n
xval
=
my_rand
(
*
shape_img
)
assert
numpy
.
allclose
(
out
[
0
],
gout
[
0
]
)
yval
=
my_rand
(
n_batch
,
n_out
)
assert
numpy
.
allclose
(
out
[
1
],
gout
[
1
]
)
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
assert
numpy
.
allclose
(
out
[
2
],
gout
[
2
],
atol
=
2e-6
)
rvals
=
my_zeros
(
n_train
)
t0
=
time
.
time
()
for
i
in
xrange
(
n_train
):
rvals
[
i
]
=
train
(
xval
,
yval
,
lr
)[
0
]
t1
=
time
.
time
()
print_mode
(
mode
)
return
rvals
,
t1
-
t0
,
mode
def
cmp_run_conv_nnet2_classif
(
seed
,
isize
,
ksize
,
bsize
,
def
test_softmax_with_bias
():
ignore_error
=
False
,
n_train
=
10
,
gpu_only
=
False
,
cpu_only
=
False
,
float_atol
=
1e-06
,
check_isfinite
=
True
,
pickle
=
False
,
verbose
=
0
,
version
=-
1
):
"""
"""
float_atol: None mean use the default value.
This is basic test for GpuSoftmaxWithBias
check_isfinite: the debug mode option. We forward this value to debug mode.
For some parameter CrossentropyCategorical1Hot op generate inf when not optimized.
"""
if
config
.
mode
==
'DEBUG_MODE'
:
n_train
=
1
numpy
.
random
.
seed
(
seed
)
import
theano.tensor.basic
import
theano.compile.debugmode
from
theano.compile.mode
import
predefined_modes
orig_float32_atol
=
theano
.
tensor
.
basic
.
float32_atol
orig_check_isfinite
=
predefined_modes
[
"DEBUG_MODE"
]
.
check_isfinite
try
:
predefined_modes
[
"DEBUG_MODE"
]
.
check_isfinite
=
check_isfinite
if
gpu_only
:
tcn
.
use
()
if
float_atol
:
print
"float_atol"
,
float_atol
theano
.
tensor
.
basic
.
float32_atol
=
float_atol
if
not
cpu_only
:
rval_gpu
,
tg
,
gpu_mode
=
run_conv_nnet2_classif
(
True
,
isize
,
ksize
,
bsize
,
n_train
,
verbose
=
verbose
,
version
=
version
)
finally
:
predefined_modes
[
"DEBUG_MODE"
]
.
check_isfinite
=
orig_check_isfinite
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
if
gpu_only
:
We check that we loop when their is too much block
print
"time gpu:
%.3
f"
%
(
tg
)
TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
return
"""
x
=
T
.
fmatrix
(
'x'
)
try
:
predefined_modes
[
"DEBUG_MODE"
]
.
check_isfinite
=
check_isfinite
numpy
.
random
.
seed
(
seed
)
rval_cpu
,
tc
,
cpu_mode
=
run_conv_nnet2_classif
(
False
,
isize
,
ksize
,
bsize
,
n_train
,
verbose
=
verbose
,
version
=
version
)
if
pickle
and
isinstance
(
cpu_mode
,(
theano
.
compile
.
ProfileMode
,)):
import
pickle
print
"BEGIN GPU profile mode dump"
#print pickle.dumps(gpu_mode)
print
"END GPU profile mode dump"
print
"BEGIN CPU profile mode dump"
print
pickle
.
dumps
(
cpu_mode
)
print
"END CPU profile mode dump"
finally
:
#we need to test n>32*1024 to check that we make the block loop.
predefined_modes
[
"DEBUG_MODE"
]
.
check_isfinite
=
orig_check_isfinite
n
,
m
=
2
<<
15
,
5
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
if
not
cpu_only
:
data
=
numpy
.
arange
(
n
*
m
,
dtype
=
'float32'
)
.
reshape
(
n
,
m
)
if
verbose
or
not
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
1e-3
,
atol
=
float_atol
):
print
"cpu:"
,
rval_cpu
print
"gpu:"
,
rval_gpu
print
"abs diff:"
,
numpy
.
absolute
(
rval_gpu
-
rval_cpu
)
print
"time cpu:
%.3
f, time gpu:
%.3
f, speed up
%
f"
%
(
tc
,
tg
,
tc
/
tg
)
print
"estimated time for one pass through MNIST with cpu:
%
f"
%
(
tc
*
(
60000.0
/
(
n_train
*
bsize
)))
print
"estimated time for one pass through MNIST with gpu:
%
f"
%
(
tg
*
(
60000.0
/
(
n_train
*
bsize
)))
else
:
print
"time cpu:
%.3
f"
%
(
tc
)
print
"estimated time for one pass through MNIST with cpu:
%
f"
%
(
tc
*
(
60000.0
/
(
n_train
*
bsize
)))
if
not
ignore_error
and
not
cpu_only
and
not
gpu_only
:
z
=
T
.
nnet
.
softmax_with_bias
(
x
,
T
.
zeros_like
(
x
[
0
,:]))
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
1e-3
,
atol
=
float_atol
)
gpu_only
=
False
f
=
theano
.
function
([
x
],
z
,
mode
=
mode_without_gpu
)
cpu_only
=
False
f_gpu
=
theano
.
function
([
x
],
z
,
mode
=
mode_with_gpu
)
ignore_error
=
False
assert
f
.
maker
.
env
.
toposort
()[
-
1
]
.
op
==
T
.
nnet
.
softmax_with_bias
verbose
=
0
assert
isinstance
(
f_gpu
.
maker
.
env
.
toposort
()[
-
2
]
.
op
,
cuda
.
nnet
.
GpuSoftmaxWithBias
)
version
=-
1
def
test_lenet_28
():
#MNIST
out
=
f
(
data
)
cmp_run_conv_nnet2_classif
(
23485
,
28
,
5
,
60
,
n_train
=
10
,
gout
=
f_gpu
(
data
)
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
assert
numpy
.
allclose
(
out
,
gout
),
numpy
.
absolute
(
out
-
gout
)
cpu_only
=
cpu_only
,
verbose
=
verbose
,
version
=
version
)
def
test_lenet_32
():
#CIFAR10 / Shapeset
def
test_softmax
():
cmp_run_conv_nnet2_classif
(
23485
,
32
,
5
,
60
,
n_train
=
10
,
"""
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
This is basic test for GpuSoftmax
verbose
=
verbose
,
version
=
version
)
def
test_lenet_32_long
():
#CIFAR10 / Shapeset
We check that we loop when their is too much block
# this tests the gradient of downsample on the GPU,
TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
# which does not recieve specific testing
"""
cmp_run_conv_nnet2_classif
(
23485
,
32
,
5
,
30
,
n_train
=
50
,
x
=
T
.
fmatrix
(
'x'
)
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
version
=
version
)
def
test_lenet_64
():
# ???
#we need to test n>32*1024 to check that we make the block loop.
#float_atol need to pass in debug mode
n
,
m
=
2
<<
15
,
5
#needed as cpu use extended precision and gpu don't
cmp_run_conv_nnet2_classif
(
23485
,
64
,
7
,
10
,
n_train
=
10
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
float_atol
=
5e-4
,
check_isfinite
=
True
,
version
=
version
)
def
test_lenet_108
():
# NORB
data
=
numpy
.
arange
(
n
*
m
,
dtype
=
'float32'
)
.
reshape
(
n
,
m
)
cmp_run_conv_nnet2_classif
(
23485
,
108
,
7
,
5
,
n_train
=
4
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
check_isfinite
=
True
,
version
=
version
,
float_atol
=
7e-2
)
def
test_lenet_256
():
# ImageNet
z
=
T
.
nnet
.
softmax
(
x
)
cmp_run_conv_nnet2_classif
(
23485
,
256
,
9
,
2
,
n_train
=
5
,
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
check_isfinite
=
True
,
version
=
version
)
#I did a wanted error in the name as we don't want it to execute automatically for now as it don't work
f
=
theano
.
function
([
x
],
z
,
mode
=
mode_without_gpu
)
def
tes_lenet_hd
():
#HD 720p: 1280(wid)x720(len)
f_gpu
=
theano
.
function
([
x
],
z
,
mode
=
mode_with_gpu
)
cmp_run_conv_nnet2_classif
(
23485
,
(
720
,
1280
),
9
,
2
,
n_train
=
3
,
assert
f
.
maker
.
env
.
toposort
()[
-
1
]
.
op
==
T
.
nnet
.
softmax
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
assert
isinstance
(
f_gpu
.
maker
.
env
.
toposort
()[
-
2
]
.
op
,
cuda
.
nnet
.
GpuSoftmax
)
cpu_only
=
cpu_only
,
verbose
=
verbose
,
check_isfinite
=
True
,
version
=
version
)
#I did a wanted error in the name as we don't want it to execute automatically for now as it don't work
out
=
f
(
data
)
def
tes_lenet_full_hd
():
#HD 1080p: 1920(wid)x1080(len)
gout
=
f_gpu
(
data
)
cmp_run_conv_nnet2_classif
(
23485
,
(
1080
,
1920
),
9
,
2
,
n_train
=
3
,
assert
numpy
.
allclose
(
out
,
gout
),
numpy
.
absolute
(
out
-
gout
)
ignore_error
=
ignore_error
,
gpu_only
=
gpu_only
,
cpu_only
=
cpu_only
,
verbose
=
verbose
,
check_isfinite
=
True
,
version
=
version
)
theano/sandbox/cuda/type.py
浏览文件 @
85447abe
...
@@ -254,7 +254,9 @@ class CudaNdarrayType(Type):
...
@@ -254,7 +254,9 @@ class CudaNdarrayType(Type):
return
ret
return
ret
def
c_libraries
(
self
):
def
c_libraries
(
self
):
return
[
'cudart'
]
# returning cublas because the cuda_ndarray.cuh header includes calls to SetVector and
# cublasGetError
return
[
'cudart'
,
'cublas'
]
def
c_support_code
(
cls
):
def
c_support_code
(
cls
):
return
""
return
""
...
...
theano/sandbox/multinomial.py
浏览文件 @
85447abe
...
@@ -4,7 +4,7 @@ import theano.tensor as T
...
@@ -4,7 +4,7 @@ import theano.tensor as T
from
theano.tensor.opt
import
register_specialize
from
theano.tensor.opt
import
register_specialize
from
theano.gof
import
local_optimizer
from
theano.gof
import
local_optimizer
from
theano.sandbox.cuda
import
cuda_available
from
theano.sandbox.cuda
import
cuda_available
,
cuda_enabled
if
cuda_available
:
if
cuda_available
:
from
theano.sandbox.cuda
import
CudaNdarrayType
from
theano.sandbox.cuda
import
CudaNdarrayType
from
theano.sandbox.cuda.basic_ops
import
host_from_gpu
,
gpu_from_host
from
theano.sandbox.cuda.basic_ops
import
host_from_gpu
,
gpu_from_host
...
@@ -109,12 +109,11 @@ class GpuMultinomial(Multinomial):
...
@@ -109,12 +109,11 @@ class GpuMultinomial(Multinomial):
raise
TypeError
(
'pvals must be cudandarray'
,
pvals
)
raise
TypeError
(
'pvals must be cudandarray'
,
pvals
)
if
not
isinstance
(
unis
.
type
,
CudaNdarrayType
):
if
not
isinstance
(
unis
.
type
,
CudaNdarrayType
):
raise
TypeError
(
'unis must be cudandarray'
,
unis
)
raise
TypeError
(
'unis must be cudandarray'
,
unis
)
return
Apply
(
self
,
[
pvals
,
unis
],
[
pvals
.
type
()])
return
Apply
(
self
,
[
pvals
,
unis
],
[
pvals
.
type
()])
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
#
return ()
return
()
return
(
super
(
GpuMultinomial
,
self
)
.
c_code_cache_version
(),
1
)
#
return (super(GpuMultinomial,self).c_code_cache_version(),1)
def
c_support_code_apply
(
self
,
node
,
nodename
):
def
c_support_code_apply
(
self
,
node
,
nodename
):
return
"""
return
"""
...
@@ -128,7 +127,7 @@ class GpuMultinomial(Multinomial):
...
@@ -128,7 +127,7 @@ class GpuMultinomial(Multinomial):
float * global_outs
float * global_outs
)
)
{
{
int n =
32
*blockIdx.x + threadIdx.x;
int n =
blockDim.x
*blockIdx.x + threadIdx.x;
if (n < nb_multi)
if (n < nb_multi)
{
{
...
@@ -201,14 +200,31 @@ class GpuMultinomial(Multinomial):
...
@@ -201,14 +200,31 @@ class GpuMultinomial(Multinomial):
int nb_outcomes = CudaNdarray_HOST_DIMS(
%(z)
s)[0];
int nb_outcomes = CudaNdarray_HOST_DIMS(
%(z)
s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(
%(z)
s)[1];
int nb_multi = CudaNdarray_HOST_DIMS(
%(z)
s)[1];
int nb_block;
//TODO : change this for a beautiful constant
if (nb_multi
%% 32
== 0)
int max_nb_blocks = 2<<15 - 1;
nb_block = nb_multi/32;
int nb_blocks = max_nb_blocks + 1;
int nb_threads=16; // so it really starts at 32, because of the *2
do
{
nb_threads*=2;
if (nb_multi
%%
nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
else
nb_block = (int)((float)nb_multi/32. + 1.);
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
//printf("
\\
nN=
%%
i b=
%%
i t=
%%
i t*b=
%%
i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (
%%
i)", nb_multi);
%(fail)
s;
}
dim3 n_blocks(nb_block,1,1);
dim3 n_blocks(nb_block
s
,1,1);
dim3 n_threads(
32
,1,1);
dim3 n_threads(
nb_threads
,1,1);
int n_shared = 0;
int n_shared = 0;
k_multi_warp_
%(name)
s<<<n_blocks, n_threads, n_shared>>>(
k_multi_warp_
%(name)
s<<<n_blocks, n_threads, n_shared>>>(
...
@@ -244,6 +260,6 @@ gpu_multinomial = GpuMultinomial()
...
@@ -244,6 +260,6 @@ gpu_multinomial = GpuMultinomial()
def
use_gpu_multinomial
(
node
):
def
use_gpu_multinomial
(
node
):
if
node
.
op
==
multinomial
:
if
node
.
op
==
multinomial
:
return
[
host_from_gpu
(
gpu_multinomial
(
*
[
gpu_from_host
(
i
)
for
i
in
node
.
inputs
]))]
return
[
host_from_gpu
(
gpu_multinomial
(
*
[
gpu_from_host
(
i
)
for
i
in
node
.
inputs
]))]
if
theano
.
config
.
device
.
startswith
(
'gpu'
):
if
cuda_enabled
:
#
theano.config.device.startswith('gpu'):
register_specialize
(
use_gpu_multinomial
)
register_specialize
(
use_gpu_multinomial
)
theano/sandbox/rng_mrg.py
浏览文件 @
85447abe
...
@@ -685,7 +685,7 @@ class MRG_RandomStreams(object):
...
@@ -685,7 +685,7 @@ class MRG_RandomStreams(object):
else
:
else
:
raise
NotImplementedError
(
"MRG_RandomStreams.binomial with n > 1"
)
raise
NotImplementedError
(
"MRG_RandomStreams.binomial with n > 1"
)
def
multinomial
(
self
,
size
=
None
,
n
=
1
,
pvals
=
[[
.
5
,
.
5
]]
,
ndim
=
None
,
dtype
=
'int64'
):
def
multinomial
(
self
,
size
=
None
,
n
=
1
,
pvals
=
None
,
ndim
=
None
,
dtype
=
'int64'
):
"""
"""
Sample `n` (currently `n` needs to be 1) times from a multinomial distribution defined by
Sample `n` (currently `n` needs to be 1) times from a multinomial distribution defined by
probabilities pvals.
probabilities pvals.
...
@@ -696,13 +696,12 @@ class MRG_RandomStreams(object):
...
@@ -696,13 +696,12 @@ class MRG_RandomStreams(object):
`size` and `ndim` are only there keep the same signature as other uniform, binomial, normal, etc.
`size` and `ndim` are only there keep the same signature as other uniform, binomial, normal, etc.
todo : adapt multinomial to take that into account
todo : adapt multinomial to take that into account
"""
"""
if
pvals
is
None
:
raise
TypeError
(
"You have to specify pvals"
)
pvals
=
as_tensor_variable
(
pvals
)
pvals
=
as_tensor_variable
(
pvals
)
if
n
==
1
and
pvals
.
ndim
==
2
:
if
n
==
1
and
pvals
.
ndim
==
2
:
pvals
=
as_tensor_variable
(
pvals
)
unis
=
self
.
uniform
(
size
=
pvals
.
shape
[
0
:
1
],
ndim
=
1
)
unis
=
self
.
uniform
(
size
=
pvals
.
shape
[
0
:
1
],
ndim
=
1
)
return
cast
(
multinomial
(
pvals
.
T
,
unis
)
.
T
,
dtype
)
return
cast
(
multinomial
(
pvals
.
T
,
unis
)
.
T
,
dtype
)
else
:
else
:
raise
NotImplementedError
(
"MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2"
)
raise
NotImplementedError
(
"MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2"
)
...
...
theano/sandbox/test_rng_mrg.py
浏览文件 @
85447abe
...
@@ -345,7 +345,7 @@ def test_uniform():
...
@@ -345,7 +345,7 @@ def test_uniform():
#print 'random?[-1,-10:]\n', out[-1,-10:]
#print 'random?[-1,-10:]\n', out[-1,-10:]
basictest
(
f
,
steps
,
sample_size
,
prefix
=
'mrg cpu'
,
inputs
=
input
)
basictest
(
f
,
steps
,
sample_size
,
prefix
=
'mrg cpu'
,
inputs
=
input
)
if
mode
!=
'FAST_COMPILE'
:
if
mode
!=
'FAST_COMPILE'
and
cuda_available
:
print
''
print
''
print
'ON GPU with size=(
%
s):'
%
str
(
size
)
print
'ON GPU with size=(
%
s):'
%
str
(
size
)
R
=
MRG_RandomStreams
(
234
,
use_cuda
=
True
)
R
=
MRG_RandomStreams
(
234
,
use_cuda
=
True
)
...
@@ -403,7 +403,7 @@ def test_binomial():
...
@@ -403,7 +403,7 @@ def test_binomial():
print
'random?[-1,-10:]
\n
'
,
out
[
-
1
,
-
10
:]
print
'random?[-1,-10:]
\n
'
,
out
[
-
1
,
-
10
:]
basictest
(
f
,
steps
,
sample_size
,
prefix
=
'mrg cpu'
,
inputs
=
input
,
allow_01
=
True
,
target_avg
=
mean
)
basictest
(
f
,
steps
,
sample_size
,
prefix
=
'mrg cpu'
,
inputs
=
input
,
allow_01
=
True
,
target_avg
=
mean
)
if
mode
!=
'FAST_COMPILE'
:
if
mode
!=
'FAST_COMPILE'
and
cuda_available
:
print
''
print
''
print
'ON GPU with size=(
%
s) and mean(
%
d):'
%
(
str
(
size
),
mean
)
print
'ON GPU with size=(
%
s) and mean(
%
d):'
%
(
str
(
size
),
mean
)
R
=
MRG_RandomStreams
(
234
,
use_cuda
=
True
)
R
=
MRG_RandomStreams
(
234
,
use_cuda
=
True
)
...
@@ -450,7 +450,7 @@ def test_normal0():
...
@@ -450,7 +450,7 @@ def test_normal0():
# now with odd number of samples
# now with odd number of samples
sample_size
=
(
sample_size
[
0
],
sample_size
[
1
]
-
1
)
sample_size
=
(
sample_size
[
0
],
sample_size
[
1
]
-
1
)
if
mode
!=
'FAST_COMPILE'
:
if
mode
!=
'FAST_COMPILE'
and
cuda_available
:
print
''
print
''
print
'ON GPU:'
print
'ON GPU:'
R
=
MRG_RandomStreams
(
234
,
use_cuda
=
True
)
R
=
MRG_RandomStreams
(
234
,
use_cuda
=
True
)
...
@@ -465,7 +465,7 @@ def test_normal0():
...
@@ -465,7 +465,7 @@ def test_normal0():
print
'random?[:10]
\n
'
,
numpy
.
asarray
(
f
())[
0
,
0
:
10
]
print
'random?[:10]
\n
'
,
numpy
.
asarray
(
f
())[
0
,
0
:
10
]
print
'----'
print
'----'
sys
.
stdout
.
flush
()
sys
.
stdout
.
flush
()
basictest
(
f
,
steps
,
sample_size
_odd
,
target_avg
=-
5.0
,
target_std
=
2.0
,
prefix
=
'gpu mrg '
,
allow_01
=
True
)
basictest
(
f
,
steps
,
sample_size
,
target_avg
=-
5.0
,
target_std
=
2.0
,
prefix
=
'gpu mrg '
,
allow_01
=
True
)
print
''
print
''
...
@@ -528,6 +528,7 @@ def test_multinomial():
...
@@ -528,6 +528,7 @@ def test_multinomial():
print
''
print
''
print
'ON GPU:'
print
'ON GPU:'
R
=
MRG_RandomStreams
(
234
,
use_cuda
=
True
)
R
=
MRG_RandomStreams
(
234
,
use_cuda
=
True
)
pvals
=
numpy
.
asarray
(
pvals
,
dtype
=
'float32'
)
n
=
R
.
multinomial
(
pvals
=
pvals
,
dtype
=
'float32'
)
n
=
R
.
multinomial
(
pvals
=
pvals
,
dtype
=
'float32'
)
assert
n
.
dtype
==
'float32'
#well, it's really that this test w GPU doesn't make sense otw
assert
n
.
dtype
==
'float32'
#well, it's really that this test w GPU doesn't make sense otw
f
=
theano
.
function
([],
theano
.
Out
(
f
=
theano
.
function
([],
theano
.
Out
(
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论