Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
b69ad54d
提交
b69ad54d
authored
5月 05, 2016
作者:
Xavier Bouthillier
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #4244 from ChihebTrabelsi/ccw2.0
flake8 sandbox/cuda/*.py
上级
200babca
58267dc2
全部展开
显示空白字符变更
内嵌
并排
正在显示
33 个修改的文件
包含
311 行增加
和
294 行删除
+311
-294
GpuConvGrad3D.py
theano/sandbox/cuda/GpuConvGrad3D.py
+6
-7
GpuConvTransp3D.py
theano/sandbox/cuda/GpuConvTransp3D.py
+40
-17
basic_ops.py
theano/sandbox/cuda/basic_ops.py
+0
-0
blas.py
theano/sandbox/cuda/blas.py
+45
-34
elemwise.py
theano/sandbox/cuda/elemwise.py
+0
-0
fftconv.py
theano/sandbox/cuda/fftconv.py
+13
-7
kernel_codegen.py
theano/sandbox/cuda/kernel_codegen.py
+6
-10
neighbours.py
theano/sandbox/cuda/neighbours.py
+1
-1
nnet.py
theano/sandbox/cuda/nnet.py
+32
-35
nvcc_compiler.py
theano/sandbox/cuda/nvcc_compiler.py
+5
-3
opt.py
theano/sandbox/cuda/opt.py
+0
-0
rng_curand.py
theano/sandbox/cuda/rng_curand.py
+13
-16
test_basic_ops.py
theano/sandbox/cuda/tests/test_basic_ops.py
+0
-0
test_bench_loopfusion.py
theano/sandbox/cuda/tests/test_bench_loopfusion.py
+0
-0
test_blas.py
theano/sandbox/cuda/tests/test_blas.py
+0
-0
test_conv_cuda_ndarray.py
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+0
-0
test_cuda_ndarray.py
theano/sandbox/cuda/tests/test_cuda_ndarray.py
+0
-0
test_driver.py
theano/sandbox/cuda/tests/test_driver.py
+4
-3
test_extra_ops.py
theano/sandbox/cuda/tests/test_extra_ops.py
+24
-23
test_gemmcorr3d.py
theano/sandbox/cuda/tests/test_gemmcorr3d.py
+6
-5
test_gradient.py
theano/sandbox/cuda/tests/test_gradient.py
+1
-1
test_memory.py
theano/sandbox/cuda/tests/test_memory.py
+14
-8
test_mlp.py
theano/sandbox/cuda/tests/test_mlp.py
+30
-30
test_neighbours.py
theano/sandbox/cuda/tests/test_neighbours.py
+4
-4
test_opt.py
theano/sandbox/cuda/tests/test_opt.py
+0
-0
test_rng_curand.py
theano/sandbox/cuda/tests/test_rng_curand.py
+1
-1
test_tensor_op.py
theano/sandbox/cuda/tests/test_tensor_op.py
+6
-8
test_var.py
theano/sandbox/cuda/tests/test_var.py
+10
-10
test_viewop.py
theano/sandbox/cuda/tests/test_viewop.py
+1
-2
walltime.py
theano/sandbox/cuda/tests/walltime.py
+34
-21
type.py
theano/sandbox/cuda/type.py
+11
-12
var.py
theano/sandbox/cuda/var.py
+4
-2
test_flake8.py
theano/tests/test_flake8.py
+0
-34
没有找到文件。
theano/sandbox/cuda/GpuConvGrad3D.py
浏览文件 @
b69ad54d
...
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
...
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
d_
=
T
.
as_tensor_variable
(
d
)
d_
=
T
.
as_tensor_variable
(
d
)
WShape_
=
T
.
as_tensor_variable
(
WShape
)
WShape_
=
T
.
as_tensor_variable
(
WShape
)
dCdH_
=
as_cuda_ndarray_variable
(
dCdH
)
dCdH_
=
as_cuda_ndarray_variable
(
dCdH
)
broad
=
(
False
,)
*
5
broad
=
(
False
,)
*
5
return
theano
.
Apply
(
self
,
inputs
=
[
V_
,
d_
,
WShape_
,
dCdH_
],
return
theano
.
Apply
(
self
,
inputs
=
[
V_
,
d_
,
WShape_
,
dCdH_
],
outputs
=
[
CudaNdarrayType
(
dtype
=
V_
.
dtype
,
outputs
=
[
CudaNdarrayType
(
dtype
=
V_
.
dtype
,
broadcastable
=
broad
)()])
broadcastable
=
broad
)()])
...
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
...
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize
=
dCdH
.
shape
[
0
]
batchSize
=
dCdH
.
shape
[
0
]
outputFilters
=
dCdH
.
shape
[
1
]
outputHeight
=
dCdH
.
shape
[
2
]
outputHeight
=
dCdH
.
shape
[
2
]
outputWidth
=
dCdH
.
shape
[
3
]
outputWidth
=
dCdH
.
shape
[
3
]
outputDur
=
dCdH
.
shape
[
4
]
outputDur
=
dCdH
.
shape
[
4
]
assert
V
.
shape
[
0
]
==
batchSize
assert
V
.
shape
[
0
]
==
batchSize
inputFilters
=
V
.
shape
[
1
]
inputHeight
=
V
.
shape
[
2
]
inputWidth
=
V
.
shape
[
3
]
inputDur
=
V
.
shape
[
4
]
dr
,
dc
,
dt
=
d
dr
,
dc
,
dt
=
d
dCdW
=
numpy
.
zeros
(
WShape
,
dtype
=
V
.
dtype
)
dCdW
=
numpy
.
zeros
(
WShape
,
dtype
=
V
.
dtype
)
...
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
...
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
for
p
in
xrange
(
0
,
outputHeight
):
for
p
in
xrange
(
0
,
outputHeight
):
for
q
in
xrange
(
0
,
outputWidth
):
for
q
in
xrange
(
0
,
outputWidth
):
for
r
in
xrange
(
0
,
outputDur
):
for
r
in
xrange
(
0
,
outputDur
):
dCdW
[
j
,
z
,
k
,
l
,
m
]
+=
dCdH
[
i
,
j
,
p
,
q
,
r
]
*
V
[
i
,
z
,
dr
*
p
+
k
,
dc
*
q
+
l
,
dt
*
r
+
m
]
dCdW
[
j
,
z
,
k
,
l
,
m
]
+=
dCdH
[
i
,
j
,
p
,
q
,
r
]
*
\
V
[
i
,
z
,
dr
*
p
+
k
,
dc
*
q
+
l
,
dt
*
r
+
m
]
output_storage
[
0
][
0
]
=
dCdW
output_storage
[
0
][
0
]
=
dCdW
...
...
theano/sandbox/cuda/GpuConvTransp3D.py
浏览文件 @
b69ad54d
...
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
...
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
else
:
else
:
RShape_
=
T
.
as_tensor_variable
([
-
1
,
-
1
,
-
1
])
RShape_
=
T
.
as_tensor_variable
([
-
1
,
-
1
,
-
1
])
return
theano
.
Apply
(
self
,
inputs
=
[
W_
,
b_
,
d_
,
H_
,
RShape_
],
return
theano
.
Apply
(
outputs
=
[
CudaNdarrayType
(
dtype
=
H_
.
dtype
,
self
,
inputs
=
[
W_
,
b_
,
d_
,
H_
,
RShape_
],
broadcastable
=
(
False
,)
*
5
)()])
outputs
=
[
CudaNdarrayType
(
dtype
=
H_
.
dtype
,
broadcastable
=
(
False
,)
*
5
)()])
def
infer_shape
(
self
,
node
,
input_shapes
):
def
infer_shape
(
self
,
node
,
input_shapes
):
W
,
b
,
d
,
H
,
RShape
=
node
.
inputs
W
,
b
,
d
,
H
,
RShape
=
node
.
inputs
...
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
...
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
assert
dc
>
0
assert
dc
>
0
assert
dt
>
0
assert
dt
>
0
videoHeight
=
(
outputHeight
-
1
)
*
dr
+
filterHeight
videoHeight
=
(
outputHeight
-
1
)
*
dr
+
filterHeight
videoWidth
=
(
outputWidth
-
1
)
*
dc
+
filterWidth
videoWidth
=
(
outputWidth
-
1
)
*
dc
+
filterWidth
videoDur
=
(
outputDur
-
1
)
*
dt
+
filterDur
videoDur
=
(
outputDur
-
1
)
*
dt
+
filterDur
if
Rshape
is
not
None
and
Rshape
[
0
]
!=
-
1
:
if
Rshape
is
not
None
and
Rshape
[
0
]
!=
-
1
:
if
Rshape
[
0
]
<
videoHeight
:
if
Rshape
[
0
]
<
videoHeight
:
...
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
...
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
# else:
# else:
# print "No Rshape passed in"
# print "No Rshape passed in"
# print "video size: "
+
str((videoHeight, videoWidth, videoDur))
# print "video size: "
+
str((videoHeight, videoWidth, videoDur))
R
=
numpy
.
zeros
(
(
batchSize
,
inputChannels
,
videoHeight
,
R
=
numpy
.
zeros
((
batchSize
,
inputChannels
,
videoHeight
,
videoWidth
,
videoDur
)
,
dtype
=
H
.
dtype
)
videoWidth
,
videoDur
),
dtype
=
H
.
dtype
)
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
# sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for
i
in
xrange
(
0
,
batchSize
):
for
i
in
xrange
(
0
,
batchSize
):
# print '\texample '+str(i+1)+'/'+str(batchSize)
# print '\texample '+str(i+1)+'/'+str(batchSize)
for
j
in
xrange
(
0
,
inputChannels
):
for
j
in
xrange
(
0
,
inputChannels
):
# print '\t\tfeature map '
+str(j+1)+'/'+
str(inputChannels)
# print '\t\tfeature map '
+ str(j+1) + '/' +
str(inputChannels)
for
r
in
xrange
(
0
,
videoHeight
):
for
r
in
xrange
(
0
,
videoHeight
):
# print '\t\t\trow '
+str(r+1)+
'/'+str(videoHeight)
# print '\t\t\trow '
+ str(r+1) +
'/'+str(videoHeight)
for
c
in
xrange
(
0
,
videoWidth
):
for
c
in
xrange
(
0
,
videoWidth
):
for
t
in
xrange
(
0
,
videoDur
):
for
t
in
xrange
(
0
,
videoDur
):
R
[
i
,
j
,
r
,
c
,
t
]
=
b
[
j
]
R
[
i
,
j
,
r
,
c
,
t
]
=
b
[
j
]
ftc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
t
-
filterDur
+
1
)
/
float
(
dt
)))
])
ftc
=
max
(
fcc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
c
-
filterWidth
+
1
)
/
float
(
dc
)))
])
[
0
,
int
(
numpy
.
ceil
(
rc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
r
-
filterHeight
+
1
)
/
float
(
dr
)))
])
float
(
t
-
filterDur
+
1
)
/
float
(
dt
)
))
]
)
fcc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
c
-
filterWidth
+
1
)
/
float
(
dc
)
))
]
)
rc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
r
-
filterHeight
+
1
)
/
float
(
dr
)
))
]
)
while
rc
<
outputHeight
:
while
rc
<
outputHeight
:
rk
=
r
-
rc
*
dr
rk
=
r
-
rc
*
dr
if
rk
<
0
:
if
rk
<
0
:
...
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
...
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
if
tk
<
0
:
if
tk
<
0
:
break
break
R
[
i
,
j
,
r
,
c
,
t
]
+=
numpy
.
dot
(
W
[:,
j
,
rk
,
ck
,
tk
],
H
[
i
,
:,
rc
,
cc
,
tc
]
)
R
[
i
,
j
,
r
,
c
,
t
]
+=
numpy
.
dot
(
W
[:,
j
,
rk
,
ck
,
tk
],
H
[
i
,
:,
rc
,
cc
,
tc
])
tc
+=
1
tc
+=
1
""
# close loop over tc
""
# close loop over tc
...
...
theano/sandbox/cuda/basic_ops.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/blas.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
copy
import
os
import
os
import
logging
import
logging
_logger
=
logging
.
getLogger
(
__name__
)
from
six
import
integer_types
from
six
import
integer_types
from
six.moves
import
StringIO
,
reduce
from
six.moves
import
StringIO
,
reduce
import
theano
import
theano
from
theano
import
Apply
from
theano
import
Apply
from
theano
import
tensor
from
theano
import
tensor
...
@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp
...
@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp
from
theano.sandbox.cuda.basic_ops
import
(
as_cuda_ndarray_variable
,
from
theano.sandbox.cuda.basic_ops
import
(
as_cuda_ndarray_variable
,
gpu_contiguous
)
gpu_contiguous
)
from
theano.tensor
import
as_tensor_variable
from
theano.tensor
import
as_tensor_variable
_logger
=
logging
.
getLogger
(
__name__
)
class
GpuBatchedDot
(
GpuOp
):
class
GpuBatchedDot
(
GpuOp
):
...
@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp):
...
@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp):
}
}
} else {
} else {
// copy inputs if not contiguous
// copy inputs if not contiguous
"""
+
"""
+
(
"
\n
"
.
join
(
"""
(
"
\n
"
.
join
(
"""
if (( CudaNdarray_HOST_DIMS(
%(var)
s)[0] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[0] != 1
if (( CudaNdarray_HOST_DIMS(
%(var)
s)[0] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[0] != 1
&& CudaNdarray_HOST_DIMS(
%(var)
s)[1] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[1] != 1
&& CudaNdarray_HOST_DIMS(
%(var)
s)[1] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[1] != 1
&& CudaNdarray_HOST_DIMS(
%(var)
s)[2] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[2] != 1)
&& CudaNdarray_HOST_DIMS(
%(var)
s)[2] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[2] != 1)
...
@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp):
...
@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp):
Py_XDECREF(
%(var)
s);
Py_XDECREF(
%(var)
s);
%(var)
s = _copy;
%(var)
s = _copy;
}
}
"""
%
dict
(
var
=
var
,
fail
=
fail
)
for
var
in
(
bx
,
by
)))
"""
%
dict
(
var
=
var
,
fail
=
fail
)
for
var
in
(
bx
,
by
)))
+
"""
+
"""
// fail if the output is not contiguous; we can't copy it because we
// fail if the output is not contiguous; we can't copy it because we
// need to write to the original memory
// need to write to the original memory
...
@@ -537,8 +532,8 @@ class GpuGemm(GpuOp):
...
@@ -537,8 +532,8 @@ class GpuGemm(GpuOp):
return
'GpuGemm{no_inplace}'
return
'GpuGemm{no_inplace}'
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
\
return
(
type
(
self
)
==
type
(
other
)
and
and
self
.
inplace
==
other
.
inplace
)
self
.
inplace
==
other
.
inplace
)
def
__hash__
(
self
):
def
__hash__
(
self
):
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
...
@@ -562,7 +557,7 @@ class GpuGemm(GpuOp):
...
@@ -562,7 +557,7 @@ class GpuGemm(GpuOp):
return
(
4
,)
return
(
4
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
#
z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in
# inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out.
# not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
,
b
=
inputs
z_in
,
a
,
x
,
y
,
b
=
inputs
...
@@ -657,8 +652,8 @@ class GpuGemv(GpuOp):
...
@@ -657,8 +652,8 @@ class GpuGemv(GpuOp):
return
'GpuGemv{no_inplace}'
return
'GpuGemv{no_inplace}'
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
\
return
(
type
(
self
)
==
type
(
other
)
and
and
self
.
inplace
==
other
.
inplace
)
self
.
inplace
==
other
.
inplace
)
def
__hash__
(
self
):
def
__hash__
(
self
):
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
...
@@ -682,7 +677,7 @@ class GpuGemv(GpuOp):
...
@@ -682,7 +677,7 @@ class GpuGemv(GpuOp):
return
(
3
,)
return
(
3
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
#
z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in
# inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out.
# not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
,
b
=
inputs
z_in
,
a
,
x
,
y
,
b
=
inputs
...
@@ -757,8 +752,8 @@ class GpuGer(GpuOp):
...
@@ -757,8 +752,8 @@ class GpuGer(GpuOp):
return
'GpuGer{no_inplace}'
return
'GpuGer{no_inplace}'
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
\
return
(
type
(
self
)
==
type
(
other
)
and
and
self
.
inplace
==
other
.
inplace
)
self
.
inplace
==
other
.
inplace
)
def
__hash__
(
self
):
def
__hash__
(
self
):
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
...
@@ -782,7 +777,7 @@ class GpuGer(GpuOp):
...
@@ -782,7 +777,7 @@ class GpuGer(GpuOp):
return
(
2
,)
return
(
2
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
#
z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in
# inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out.
# not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
=
inputs
z_in
,
a
,
x
,
y
=
inputs
...
@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
...
@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
bottom
,
top
=
inp
[:
2
]
bottom
,
top
=
inp
[:
2
]
weights
,
=
grads
weights
,
=
grads
weights
=
gpu_contiguous
(
weights
)
weights
=
gpu_contiguous
(
weights
)
d_bottom
=
GpuCorrMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
)(
d_bottom
=
GpuCorrMM_gradInputs
(
weights
,
top
,
bottom
.
shape
[
-
2
:])
self
.
border_mode
,
self
.
subsample
)(
weights
,
d_top
=
GpuCorrMM
(
self
.
border_mode
,
self
.
subsample
)(
top
,
bottom
,
weights
)
bottom
.
shape
[
-
2
:])
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),)
*
2
if
len
(
inp
)
==
4
else
()
d_top
=
GpuCorrMM
(
self
.
border_mode
,
self
.
subsample
)(
bottom
,
weights
)
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),
)
*
2
if
len
(
inp
)
==
4
else
()
return
(
d_bottom
,
d_top
)
+
d_height_width
return
(
d_bottom
,
d_top
)
+
d_height_width
def
connection_pattern
(
self
,
node
):
def
connection_pattern
(
self
,
node
):
...
@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
...
@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
weights
,
top
=
inp
[:
2
]
weights
,
top
=
inp
[:
2
]
bottom
,
=
grads
bottom
,
=
grads
bottom
=
gpu_contiguous
(
bottom
)
bottom
=
gpu_contiguous
(
bottom
)
d_weights
=
GpuCorrMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
)(
d_weights
=
GpuCorrMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
)(
bottom
,
top
,
weights
.
shape
[
-
2
:])
bottom
,
top
,
weights
.
shape
[
-
2
:])
d_top
=
GpuCorrMM
(
self
.
border_mode
,
self
.
subsample
)(
d_top
=
GpuCorrMM
(
bottom
,
weights
)
self
.
border_mode
,
self
.
subsample
)(
bottom
,
weights
)
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),)
*
2
if
len
(
inp
)
==
4
else
()
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),
)
*
2
if
len
(
inp
)
==
4
else
()
return
(
d_weights
,
d_top
)
+
d_height_width
return
(
d_weights
,
d_top
)
+
d_height_width
def
connection_pattern
(
self
,
node
):
def
connection_pattern
(
self
,
node
):
...
@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
...
@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
bottom
,
weights
=
inp
bottom
,
weights
=
inp
top
,
=
grads
top
,
=
grads
top
=
gpu_contiguous
(
top
)
top
=
gpu_contiguous
(
top
)
d_bottom
=
GpuCorr3dMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
d_bottom
=
GpuCorr3dMM_gradInputs
(
self
.
border_mode
,
weights
,
top
,
bottom
.
shape
[
-
3
:])
self
.
subsample
,
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
self
.
pad
)(
weights
,
bottom
,
top
,
weights
.
shape
[
-
3
:])
top
,
bottom
.
shape
[
-
3
:])
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
return
d_bottom
,
d_weights
return
d_bottom
,
d_weights
...
@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
...
@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
weights
,
top
=
inp
[:
2
]
weights
,
top
=
inp
[:
2
]
bottom
,
=
grads
bottom
,
=
grads
bottom
=
gpu_contiguous
(
bottom
)
bottom
=
gpu_contiguous
(
bottom
)
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
bottom
,
top
,
weights
.
shape
[
-
3
:])
d_top
=
GpuCorr3dMM
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
d_top
=
GpuCorr3dMM
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
weights
)
bottom
,
weights
)
d_height_width_depth
=
(
theano
.
gradient
.
DisconnectedType
()(),)
*
3
if
len
(
inp
)
==
5
else
()
d_height_width_depth
=
(
theano
.
gradient
.
DisconnectedType
()(),)
\
*
3
if
len
(
inp
)
==
5
else
()
return
(
d_weights
,
d_top
)
+
d_height_width_depth
return
(
d_weights
,
d_top
)
+
d_height_width_depth
def
connection_pattern
(
self
,
node
):
def
connection_pattern
(
self
,
node
):
...
@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp):
...
@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp):
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
# def perform(self, node, input_storage, output_storage):
# def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented')
#
raise NotImplementedError('only C is implemented')
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
6
)
return
(
6
)
...
...
theano/sandbox/cuda/elemwise.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/fftconv.py
浏览文件 @
b69ad54d
...
@@ -5,9 +5,9 @@ import numpy as np
...
@@ -5,9 +5,9 @@ import numpy as np
import
theano
import
theano
import
theano.tensor
as
T
import
theano.tensor
as
T
from
theano.misc.pycuda_init
import
pycuda_available
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.ifelse
import
ifelse
from
theano.ifelse
import
ifelse
from
theano.misc.pycuda_init
import
pycuda_available
if
cuda_available
:
if
cuda_available
:
from
theano.sandbox.cuda
import
(
basic_ops
,
CudaNdarrayType
,
from
theano.sandbox.cuda
import
(
basic_ops
,
CudaNdarrayType
,
...
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
...
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# special way because we specify explicitly here
# how much values are expected.
# how much values are expected.
if
border_mode
==
'valid'
:
if
border_mode
==
'valid'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
)]
elif
border_mode
==
'full'
:
elif
border_mode
==
'full'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
)]
else
:
else
:
raise
ValueError
(
'invalid mode'
)
raise
ValueError
(
'invalid mode'
)
...
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
...
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
output_fft_s
=
mult_and_reduce
(
input_fft_v
,
filters_fft_v
,
output_fft_s
=
mult_and_reduce
(
input_fft_v
,
filters_fft_v
,
input_shape
=
input_fft_v_shape
,
input_shape
=
input_fft_v_shape
,
filter_shape
=
filters_fft_v_shape
)
filter_shape
=
filters_fft_v_shape
)
#output_fft_s = input_fft_v
#
output_fft_s = input_fft_v
# reshape for IFFT
# reshape for IFFT
output_fft_flat
=
output_fft_s
.
reshape
((
b
*
oc
,
o0
,
o1
,
o2
//
2
+
1
,
2
))
output_fft_flat
=
output_fft_s
.
reshape
((
b
*
oc
,
o0
,
o1
,
o2
//
2
+
1
,
2
))
...
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
...
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# special way because we specify explicitly here
# how much values are expected.
# how much values are expected.
if
border_mode
==
'valid'
:
if
border_mode
==
'valid'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
),
(
f2
-
1
):(
f2
-
1
+
i2
-
f2
+
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
),
(
f2
-
1
):(
f2
-
1
+
i2
-
f2
+
1
)]
elif
border_mode
==
'full'
:
elif
border_mode
==
'full'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
),
(
f2
-
1
):(
f2
-
1
+
i2
+
f2
-
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
),
(
f2
-
1
):(
f2
-
1
+
i2
+
f2
-
1
)]
else
:
else
:
raise
ValueError
(
'invalid mode'
)
raise
ValueError
(
'invalid mode'
)
#output = output_circ[:, :, :, :, :]
#
output = output_circ[:, :, :, :, :]
# Rescale manually. This is just a factor that comes in during the
# Rescale manually. This is just a factor that comes in during the
# trip through FFT and inverse FFT.
# trip through FFT and inverse FFT.
...
...
theano/sandbox/cuda/kernel_codegen.py
浏览文件 @
b69ad54d
...
@@ -167,17 +167,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
...
@@ -167,17 +167,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
We use __i as an int variable in a loop.
We use __i as an int variable in a loop.
"""
"""
return
[
return
[
# get max of buf (trashing all but buf[0])
# get max of buf (trashing all but buf[0])
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'__syncthreads()'
,
'float row_max = '
+
buf
+
'[0]'
,
'float row_max = '
+
buf
+
'[0]'
,
'__syncthreads()'
,
'__syncthreads()'
,
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
'; __i+='
+
threadCount
+
'){'
,
threadCount
+
'){'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
'}'
,
'}'
,
'__syncthreads()'
,
'__syncthreads()'
,
inline_reduce_sum
(
N
,
buf
,
threadPos
,
threadCount
),
inline_reduce_sum
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'__syncthreads()'
,
...
@@ -186,8 +184,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
...
@@ -186,8 +184,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
# divide each exp() result by the sum to complete the job.
# divide each exp() result by the sum to complete the job.
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
'}'
,
'}'
,
'__syncthreads()'
,
'__syncthreads()'
,
]
]
...
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
...
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
init
=
manner_init
(
"
%(x)
s[
%(pos)
s *
%(stride_x)
s]"
%
locals
())
init
=
manner_init
(
"
%(x)
s[
%(pos)
s *
%(stride_x)
s]"
%
locals
())
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(x)
s[i *
%(stride_x)
s]"
%
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(x)
s[i *
%(stride_x)
s]"
%
locals
()))
locals
()))
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
"
%
s[i]"
%
buf
)
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
...
...
theano/sandbox/cuda/neighbours.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
# This is work in progress
# This is work in progress
from
theano
import
Op
,
Apply
,
tensor
from
theano
import
Apply
,
tensor
from
theano.gof
import
local_optimizer
from
theano.gof
import
local_optimizer
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
...
...
theano/sandbox/cuda/nnet.py
浏览文件 @
b69ad54d
...
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
...
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
"""
%
locals
()
"""
%
locals
()
def
c_support_code_apply
(
self
,
node
,
nodename
):
def
c_support_code_apply
(
self
,
node
,
nodename
):
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * x'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
'const int sx0'
,
body
=
[
'const int sx1'
,
"extern __shared__ float buf[]"
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf2[tx] = buf[tx]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
inline_softmax
(
'N'
,
"__syncthreads()"
,
'buf'
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
'threadIdx.x'
,
'blockDim.x'
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
# This set all value correctly
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
"__syncthreads()"
,
ret2
=
nvcc_kernel
(
"}"
,
"kSoftmax_fixed_shared
%
s"
%
nodename
,
])
ret2
=
nvcc_kernel
(
"kSoftmax_fixed_shared
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
body
=
[
"extern __shared__ float buf[]"
,
"extern __shared__ float buf[]"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
" blockIDX += gridDim.x){"
,
"const float *x_ptr = &x[blockIDX * sx0]"
,
"const float *x_ptr = &x[blockIDX * sx0]"
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
'sm_ptr'
,
'sm_s1'
,
'sm_ptr'
,
'sm_s1'
,
'threadIdx.x'
,
'blockDim.x'
),
'threadIdx.x'
,
"__syncthreads()"
,
'blockDim.x'
),
"}"
,
"__syncthreads()"
,
"}"
,
])
])
return
ret1
+
"
\n
"
+
ret2
return
ret1
+
"
\n
"
+
ret2
gpu_softmax
=
GpuSoftmax
()
gpu_softmax
=
GpuSoftmax
()
...
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
...
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * b'
,
'const int sb0'
,
'const float * b'
,
'const int sb0'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
body
=
[
"extern __shared__ float buf[]"
,
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] += b[tx * sb0]"
,
"buf[tx] += b[tx * sb0]"
,
"buf2[tx] = buf[tx]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
"__syncthreads()"
,
'threadIdx.x'
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'blockDim.x'
),
'threadIdx.x'
,
'blockDim.x'
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
"__syncthreads()"
,
"}"
,
])
ret2
=
nvcc_kernel
(
"kSoftmaxWithBias_fixed_shared
%
s"
%
nodename
,
ret2
=
nvcc_kernel
(
"kSoftmaxWithBias_fixed_shared
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const float * x'
,
...
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
...
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
'x_ptr'
,
'sx1'
,
'sm_ptr'
,
'sm_s1'
,
'sm_ptr'
,
'sm_s1'
,
'threadIdx.x'
,
'threadIdx.x'
,
'blockDim.x'
,
'blockDim.x'
,
'b'
,
'sb0'
),
'b'
,
'sb0'
),
...
...
theano/sandbox/cuda/nvcc_compiler.py
浏览文件 @
b69ad54d
...
@@ -4,7 +4,6 @@ import logging
...
@@ -4,7 +4,6 @@ import logging
import
os
import
os
import
subprocess
import
subprocess
import
sys
import
sys
import
warnings
from
locale
import
getpreferredencoding
from
locale
import
getpreferredencoding
import
numpy
import
numpy
...
@@ -249,7 +248,8 @@ class NVCC_compiler(Compiler):
...
@@ -249,7 +248,8 @@ class NVCC_compiler(Compiler):
_logger
.
debug
(
'Writing module C++ code to
%
s'
,
cppfilename
)
_logger
.
debug
(
'Writing module C++ code to
%
s'
,
cppfilename
)
cppfile
.
write
(
src_code
)
cppfile
.
write
(
src_code
)
lib_filename
=
os
.
path
.
join
(
location
,
'
%
s.
%
s'
%
lib_filename
=
os
.
path
.
join
(
location
,
'
%
s.
%
s'
%
(
module_name
,
get_lib_extension
()))
(
module_name
,
get_lib_extension
()))
_logger
.
debug
(
'Generating shared lib
%
s'
,
lib_filename
)
_logger
.
debug
(
'Generating shared lib
%
s'
,
lib_filename
)
...
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
...
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
indexof
=
cmd
.
index
(
'-u'
)
indexof
=
cmd
.
index
(
'-u'
)
cmd
.
pop
(
indexof
)
# Remove -u
cmd
.
pop
(
indexof
)
# Remove -u
cmd
.
pop
(
indexof
)
# Remove argument to -u
cmd
.
pop
(
indexof
)
# Remove argument to -u
except
ValueError
as
e
:
except
ValueError
:
done
=
True
done
=
True
# CUDA Toolkit v4.1 Known Issues:
# CUDA Toolkit v4.1 Known Issues:
...
@@ -364,6 +364,8 @@ class NVCC_compiler(Compiler):
...
@@ -364,6 +364,8 @@ class NVCC_compiler(Compiler):
console_encoding
=
getpreferredencoding
()
console_encoding
=
getpreferredencoding
()
nvcc_stdout
=
decode_with
(
nvcc_stdout_raw
,
console_encoding
)
nvcc_stdout
=
decode_with
(
nvcc_stdout_raw
,
console_encoding
)
nvcc_stderr
=
decode_with
(
nvcc_stderr_raw
,
console_encoding
)
nvcc_stderr
=
decode_with
(
nvcc_stderr_raw
,
console_encoding
)
p
=
subprocess
.
Popen
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
finally
:
finally
:
os
.
chdir
(
orig_dir
)
os
.
chdir
(
orig_dir
)
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/rng_curand.py
浏览文件 @
b69ad54d
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
__authors__
=
"James Bergstra"
__copyright__
=
"(c) 2011, University of Montreal"
__license__
=
"3-clause BSD License"
__contact__
=
"theano-dev@googlegroups.com"
import
numpy
import
numpy
import
theano.gof
import
theano.gof
from
theano.compat
import
PY3
from
theano.compat
import
PY3
...
@@ -17,6 +7,15 @@ from theano.tensor import (get_vector_length, cast, opt)
...
@@ -17,6 +7,15 @@ from theano.tensor import (get_vector_length, cast, opt)
from
theano.compile
import
optdb
from
theano.compile
import
optdb
from
theano.gof
import
local_optimizer
,
Variable
from
theano.gof
import
local_optimizer
,
Variable
__authors__
=
"James Bergstra"
__copyright__
=
"(c) 2011, University of Montreal"
__license__
=
"3-clause BSD License"
__contact__
=
"theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
config
=
theano
.
config
config
=
theano
.
config
...
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
...
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
Return a tuple of attributes that define the Op.
Return a tuple of attributes that define the Op.
"""
"""
return
(
return
(
self
.
destructive
,
self
.
destructive
,
self
.
output_type
,
self
.
output_type
,
self
.
seed
,
self
.
seed
,
)
)
...
@@ -101,8 +99,7 @@ class CURAND_Base(GpuOp):
...
@@ -101,8 +99,7 @@ class CURAND_Base(GpuOp):
v_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
v_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
if
ndim
is
None
:
if
ndim
is
None
:
ndim
=
get_vector_length
(
v_size
)
ndim
=
get_vector_length
(
v_size
)
self
=
cls
(
self
=
cls
(
output_type
=
CudaNdarrayType
((
False
,)
*
ndim
),
output_type
=
CudaNdarrayType
((
False
,)
*
ndim
),
seed
=
seed
,
seed
=
seed
,
destructive
=
False
)
destructive
=
False
)
...
@@ -386,5 +383,5 @@ def local_destructive(node):
...
@@ -386,5 +383,5 @@ def local_destructive(node):
return
new_op
.
make_node
(
*
node
.
inputs
)
.
outputs
return
new_op
.
make_node
(
*
node
.
inputs
)
.
outputs
return
False
return
False
optdb
.
register
(
'CURAND_destructive'
,
optdb
.
register
(
'CURAND_destructive'
,
opt
.
in2out
(
local_destructive
,
ignore_newtrees
=
True
),
99
,
'fast_run'
,
opt
.
in2out
(
local_destructive
,
ignore_newtrees
=
True
)
,
'inplace'
)
99
,
'fast_run'
,
'inplace'
)
theano/sandbox/cuda/tests/test_basic_ops.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_bench_loopfusion.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_blas.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_cuda_ndarray.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_driver.py
浏览文件 @
b69ad54d
...
@@ -6,7 +6,7 @@ import theano
...
@@ -6,7 +6,7 @@ import theano
try
:
try
:
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
except
ImportError
:
except
ImportError
:
# To have the GPU back-end work without nose, we need this file to
# To have the GPU back-end work without nose, we need this file to
...
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
...
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
2
assert
len
(
topo
)
==
2
if
sum
(
isinstance
(
node
.
op
,
B
.
GpuCAReduce
)
for
node
in
topo
)
!=
1
:
if
sum
(
isinstance
(
node
.
op
,
B
.
GpuCAReduce
)
for
node
in
topo
)
!=
1
:
msg
=
'
\n\t
'
.
join
([
'Expected exactly one occurrence of GpuCAReduce '
+
msg
=
'
\n\t
'
.
join
(
'but got:'
]
+
[
str
(
app
)
for
app
in
topo
])
[
'Expected exactly one occurrence of GpuCAReduce '
+
'but got:'
]
+
[
str
(
app
)
for
app
in
topo
])
raise
AssertionError
(
msg
)
raise
AssertionError
(
msg
)
if
not
numpy
.
allclose
(
f
(),
a
.
sum
()):
if
not
numpy
.
allclose
(
f
(),
a
.
sum
()):
raise
Exception
(
"The nvidia driver version installed with this OS "
raise
Exception
(
"The nvidia driver version installed with this OS "
...
...
theano/sandbox/cuda/tests/test_extra_ops.py
浏览文件 @
b69ad54d
...
@@ -5,24 +5,22 @@ import itertools
...
@@ -5,24 +5,22 @@ import itertools
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
numpy
as
np
import
numpy
as
np
from
six.moves
import
xrange
from
six.moves
import
xrange
from
theano
import
tensor
as
T
import
theano
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests
import
unittest_tools
as
utt
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
is
False
:
if
cuda_ndarray
.
cuda_available
:
import
theano.tensor.tests.test_extra_ops
from
theano.sandbox.cuda.extra_ops
import
GpuCumsum
else
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor.tests.test_extra_ops
from
theano.sandbox.cuda.extra_ops
import
GpuCumsum
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
else
:
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
from
theano
import
tensor
as
T
import
theano
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests
import
unittest_tools
as
utt
class
TestGpuCumsum
(
theano
.
tensor
.
tests
.
test_extra_ops
.
TestCumsumOp
):
class
TestGpuCumsum
(
theano
.
tensor
.
tests
.
test_extra_ops
.
TestCumsumOp
):
mode
=
mode_with_gpu
mode
=
mode_with_gpu
...
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt
.
assert_allclose
(
np
.
cumsum
(
a
[:
i
]),
f
(
a
[:
i
]))
utt
.
assert_allclose
(
np
.
cumsum
(
a
[:
i
]),
f
(
a
[:
i
]))
# Use multiple GPU threadblocks
# Use multiple GPU threadblocks
a
=
np
.
random
.
random
((
block_max_size
+
2
,))
.
astype
(
"float32"
)
a
=
np
.
random
.
random
((
block_max_size
+
2
,))
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
# Use recursive cumsum
# Use recursive cumsum
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
dtype
=
"float32"
)
dtype
=
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
...
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks
# Use multiple GPU threadblocks
a_shape
=
[
5
,
5
]
a_shape
=
[
5
,
5
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks
# Use multiple GPU gridblocks
a_shape
=
[
4
,
4
]
a_shape
=
[
4
,
4
]
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
),
rtol
=
5e-5
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
),
rtol
=
5e-5
)
# Use recursive cumsum
# Use recursive cumsum
a_shape
=
[
3
,
3
]
a_shape
=
[
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum3D
(
self
):
def
test_GpuCumsum3D
(
self
):
...
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis)
# Use multiple GPU threadblocks (along accumulation axis)
a_shape
=
[
2
,
2
,
2
]
a_shape
=
[
2
,
2
,
2
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks (not along accumulation axis)
# Use multiple GPU gridblocks (not along accumulation axis)
a_shape
=
[
5
,
5
,
5
]
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
if
axis
is
None
:
# Avoid floating point error
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
a_shape
=
[
5
,
5
,
5
]
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
if
axis
is
None
:
# Avoid floating point error
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use recursive cumsum (along accumulation axis)
# Use recursive cumsum (along accumulation axis)
a_shape
=
[
3
,
3
,
3
]
a_shape
=
[
3
,
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum4D
(
self
):
def
test_GpuCumsum4D
(
self
):
...
...
theano/sandbox/cuda/tests/test_gemmcorr3d.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
unittest
import
unittest
import
numpy
import
numpy
import
copy
import
theano
import
theano
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
# Skip tests if cuda_ndarray is not available.
# Skip tests if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda not available'
)
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
from
theano.sandbox.cuda.blas
import
(
from
theano.sandbox.cuda.blas
import
(
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
)
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
)
from
theano.sandbox.cuda.basic_ops
import
gpu_contiguous
from
theano.sandbox.cuda.basic_ops
import
gpu_contiguous
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda not available'
)
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
...
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
inputs
=
shared
(
inputs_val
)
inputs
=
shared
(
inputs_val
)
filters
=
shared
(
filters_val
)
filters
=
shared
(
filters_val
)
bias
=
shared
(
numpy
.
zeros
(
filters_shape
[
4
])
.
astype
(
'float32'
))
bias
=
shared
(
numpy
.
zeros
(
filters_shape
[
4
])
.
astype
(
'float32'
))
conv
=
theano
.
tensor
.
nnet
.
convTransp3D
(
W
=
filters
,
b
=
bias
,
d
=
subsample
,
conv
=
theano
.
tensor
.
nnet
.
convTransp3D
(
W
=
filters
,
b
=
bias
,
d
=
subsample
,
H
=
inputs
)
H
=
inputs
)
f_ref
=
theano
.
function
([],
conv
)
f_ref
=
theano
.
function
([],
conv
)
res_ref
=
f_ref
()
res_ref
=
f_ref
()
...
...
theano/sandbox/cuda/tests/test_gradient.py
浏览文件 @
b69ad54d
...
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
...
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
theano/sandbox/cuda/tests/test_memory.py
浏览文件 @
b69ad54d
...
@@ -11,7 +11,7 @@ from theano import ifelse
...
@@ -11,7 +11,7 @@ from theano import ifelse
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
...
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
theano_alloc
=
cuda
.
cuda_ndarray
.
cuda_ndarray
.
theano_allocated
()
theano_alloc
=
cuda
.
cuda_ndarray
.
cuda_ndarray
.
theano_allocated
()
return
(
"(n malloc/theano mem allocated in KB)"
,
return
(
"(n malloc/theano mem allocated in KB)"
,
n_mallocs
+
extra_alloc
,
n_mallocs
+
extra_alloc
,
int
(
theano_alloc
/
1024
)
+
extra_size
)
int
(
theano_alloc
/
1024
))
return
(
"n malloc on the gpu"
,
n_mallocs
+
extra_alloc
)
return
(
"n malloc on the gpu"
,
n_mallocs
+
extra_alloc
)
# I don't use the following by default as if there is other stuff running
# I don't use the following by default as if there is other stuff running
...
@@ -83,9 +83,12 @@ def test_memory():
...
@@ -83,9 +83,12 @@ def test_memory():
variables
=
cuda
.
shared_constructor
(
np
.
ones
((
shapes
[
1
],),
variables
=
cuda
.
shared_constructor
(
np
.
ones
((
shapes
[
1
],),
dtype
=
'float32'
))
dtype
=
'float32'
))
derp
=
tensor
.
sum
(
tensor
.
dot
(
some_matrix
[:
shapes
[
0
]],
variables
))
derp
=
tensor
.
sum
(
tensor
.
dot
(
some_matrix
[:
shapes
[
0
]],
variables
))
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
borrow
=
True
,
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
mem2
=
freemem
()
mem2
=
freemem
()
print
(
"Before compilation"
,
mem2
)
print
(
"Before compilation"
,
mem2
)
...
@@ -112,7 +115,7 @@ def test_memory():
...
@@ -112,7 +115,7 @@ def test_memory():
del
obj
del
obj
# print "After deleting function 1", freemem()
# print "After deleting function 1", freemem()
#assert mem2 == freemem(), (mem2, freemem())
#
assert mem2 == freemem(), (mem2, freemem())
del
grad
del
grad
print
(
"After deleting function 2"
,
freemem
())
print
(
"After deleting function 2"
,
freemem
())
...
@@ -155,16 +158,19 @@ def test_memory_lazy():
...
@@ -155,16 +158,19 @@ def test_memory_lazy():
derp
=
ifelse
.
IfElse
(
1
)(
branch_select
,
derp
=
ifelse
.
IfElse
(
1
)(
branch_select
,
derp
,
some_matrix
[:
shapes
[
0
]]
.
sum
())
derp
,
some_matrix
[:
shapes
[
0
]]
.
sum
())
derp
+=
1
derp
+=
1
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
borrow
=
True
,
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
mem2
=
freemem
()
mem2
=
freemem
()
print
(
"Before compilation"
,
mem2
)
print
(
"Before compilation"
,
mem2
)
mem2_1
=
freemem
(
extra_alloc
=
more_alloc1
)
mem2_1
=
freemem
(
extra_alloc
=
more_alloc1
)
obj
=
theano
.
function
([
some_vector
,
branch_select
],
derp
,
obj
=
theano
.
function
([
some_vector
,
branch_select
],
derp
,
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
#theano.printing.debugprint(obj, print_type=True)
#
theano.printing.debugprint(obj, print_type=True)
mem3
=
freemem
()
mem3
=
freemem
()
print
(
"After function compilation 1"
,
mem3
)
print
(
"After function compilation 1"
,
mem3
)
assert
mem2_1
==
mem3
,
(
mem2_1
,
mem3
)
assert
mem2_1
==
mem3
,
(
mem2_1
,
mem3
)
...
...
theano/sandbox/cuda/tests/test_mlp.py
浏览文件 @
b69ad54d
...
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
...
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
'otherwise it is too slow!'
)
'otherwise it is too slow!'
)
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
if
tcn
.
cuda_available
==
False
:
if
tcn
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
@@ -147,19 +147,20 @@ def test_run_nnet():
...
@@ -147,19 +147,20 @@ def test_run_nnet():
rtol
=
1e-4
rtol
=
1e-4
if
n_in
*
n_hid
>=
2048
*
4096
:
if
n_in
*
n_hid
>=
2048
*
4096
:
rtol
=
7e-4
rtol
=
7e-4
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
rtol
,
atol
=
1e-6
),
\
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
rtol
,
atol
=
1e-6
),
\
(
"max_abs_diff, max_rel_diff, n_in, n_hid"
,
max_abs_diff
,
(
"max_abs_diff, max_rel_diff, n_in, n_hid"
,
max_abs_diff
,
rel_diff
.
max
(),
n_in
,
n_hid
)
rel_diff
.
max
(),
n_in
,
n_hid
)
def
test_run_nnet_med
():
def
test_run_nnet_med
():
utt
.
seed_rng
()
utt
.
seed_rng
()
r
val_cpu
=
r
un_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
run_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
def
test_run_nnet_small
():
def
test_run_nnet_small
():
utt
.
seed_rng
()
utt
.
seed_rng
()
r
val_cpu
=
r
un_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
run_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
def
run_conv_nnet1
(
use_gpu
):
def
run_conv_nnet1
(
use_gpu
):
...
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
...
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
mode
=
get_mode
(
use_gpu
)
mode
=
get_mode
(
use_gpu
)
# print 'building pfunc ...'
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
train
=
pfunc
(
g
in
zip
(
params
,
gparams
)])
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
# print i, n
...
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
...
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op1
=
conv
.
ConvOp
((
n_kern
,
logical_hid_shape
[
0
]
//
2
,
conv_op1
=
conv
.
ConvOp
((
n_kern
,
logical_hid_shape
[
0
]
//
2
,
logical_hid_shape
[
1
]
//
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
logical_hid_shape
[
1
]
//
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
[:,
:,
::
2
,
::
2
],
w1
)
+
b1
.
dimshuffle
((
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
[:,
:,
::
2
,
::
2
],
w1
)
+
b1
.
dimshuffle
((
...
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
...
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
mode
=
get_mode
(
use_gpu
)
mode
=
get_mode
(
use_gpu
)
# print 'building pfunc ...'
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
train
=
pfunc
(
g
in
zip
(
params
,
gparams
)])
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
# print i, n
...
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
...
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
if
downsample_ops
:
if
downsample_ops
:
hid
=
tensor
.
tanh
(
ds_op
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
))))
hid
=
tensor
.
tanh
(
ds_op
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
))))
else
:
else
:
hid
=
tensor
.
tanh
((
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)
hid
=
tensor
.
tanh
(
))[:,
:,
::
2
,
::
2
])
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
(
(
0
,
'x'
,
'x'
)))[:,
:,
::
2
,
::
2
])
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
,
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
,
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
out
=
tensor
.
nnet
.
softmax
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
out
=
tensor
.
nnet
.
softmax
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
out
,
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
out
,
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
# print 'loss type', loss.type
# print 'loss type', loss.type
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
...
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
...
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
mode
=
get_mode
(
use_gpu
,
check_isfinite
)
mode
=
get_mode
(
use_gpu
,
check_isfinite
)
# print 'building pfunc ...'
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
train
=
pfunc
(
g
in
zip
(
params
,
gparams
)])
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
if
verbose
:
if
verbose
:
theano
.
printing
.
debugprint
(
train
)
theano
.
printing
.
debugprint
(
train
)
...
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
...
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
rvals
=
my_zeros
(
n_train
)
rvals
=
my_zeros
(
n_train
)
t0
=
time
.
time
()
for
i
in
xrange
(
n_train
):
for
i
in
xrange
(
n_train
):
rvals
[
i
]
=
train
(
xval
,
yval
,
lr
)[
0
]
rvals
[
i
]
=
train
(
xval
,
yval
,
lr
)[
0
]
t1
=
time
.
time
()
print_mode
(
mode
)
print_mode
(
mode
)
if
pickle
and
isinstance
(
mode
,
theano
.
compile
.
ProfileMode
):
if
pickle
and
isinstance
(
mode
,
theano
.
compile
.
ProfileMode
):
...
@@ -495,7 +506,8 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
...
@@ -495,7 +506,8 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
compare
=
True
compare
=
True
if
not
compare
:
if
not
compare
:
return
run_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
return
run_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
seed
=
seed
,
isize
=
isize
,
ksize
=
ksize
,
bsize
=
bsize
,
seed
=
seed
,
isize
=
isize
,
ksize
=
ksize
,
bsize
=
bsize
,
n_train
=
n_train
,
n_train
=
n_train
,
check_isfinite
=
check_isfinite
,
check_isfinite
=
check_isfinite
,
...
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
...
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
finally
:
finally
:
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
if
pickle
:
if
isinstance
(
cpu_mode
,
theano
.
compile
.
ProfileMode
):
import
pickle
print
(
"BEGIN CPU profile mode dump"
)
print
(
pickle
.
dumps
(
cpu_mode
))
print
(
"END CPU profile mode dump"
)
if
isinstance
(
gpu_mode
,
theano
.
compile
.
ProfileMode
):
import
pickle
print
(
"BEGIN GPU profile mode dump"
)
print
(
pickle
.
dumps
(
gpu_mode
))
print
(
"END GPU profile mode dump"
)
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# (time_cpu, time_gpu, time_cpu/time_gpu))
# (time_cpu, time_gpu, time_cpu/time_gpu))
# print "Estimated time for one pass through MNIST with CPU: %f" % (
# print "Estimated time for one pass through MNIST with CPU: %f" % (
...
...
theano/sandbox/cuda/tests/test_neighbours.py
浏览文件 @
b69ad54d
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
unittest
import
theano.tensor.nnet.tests.test_neighbours
from
theano.sandbox.cuda.neighbours
import
GpuImages2Neibs
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor.nnet.tests.test_neighbours
from
theano.sandbox.cuda.neighbours
import
GpuImages2Neibs
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
...
theano/sandbox/cuda/tests/test_opt.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_rng_curand.py
浏览文件 @
b69ad54d
...
@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams
...
@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams
# Skip tests if cuda_ndarray is not available.
# Skip tests if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
# The PyCObject that represents the cuda random stream object
# The PyCObject that represents the cuda random stream object
...
...
theano/sandbox/cuda/tests/test_tensor_op.py
浏览文件 @
b69ad54d
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
This file test tensor op that should also operate on CudaNdaray.
This file test tensor op that should also operate on CudaNdaray.
"""
"""
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
copy
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
numpy
import
numpy
...
@@ -14,7 +13,7 @@ import theano.tensor as T
...
@@ -14,7 +13,7 @@ import theano.tensor as T
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
import
theano.sandbox.cuda
as
cuda
import
theano.sandbox.cuda
as
cuda
from
theano.tensor.nnet.tests
import
test_conv3d2d
from
theano.tensor.nnet.tests
import
test_conv3d2d
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
@@ -57,7 +56,7 @@ def test_softmax_optimizations():
...
@@ -57,7 +56,7 @@ def test_softmax_optimizations():
one_of_n
=
tensor
.
lvector
(
'one_of_n'
)
one_of_n
=
tensor
.
lvector
(
'one_of_n'
)
op
=
crossentropy_categorical_1hot
op
=
crossentropy_categorical_1hot
xe
=
op
(
x
,
one_of_n
)
op
(
x
,
one_of_n
)
fgraph
=
theano
.
gof
.
FunctionGraph
(
fgraph
=
theano
.
gof
.
FunctionGraph
(
[
x
,
one_of_n
],
[
x
,
one_of_n
],
...
@@ -84,10 +83,10 @@ def test_may_share_memory_cuda():
...
@@ -84,10 +83,10 @@ def test_may_share_memory_cuda():
# can't test the transpose as ta._strides = is not implemented
# can't test the transpose as ta._strides = is not implemented
# manual transpose of a
# manual transpose of a
#ta = a.reshape((4,3))
#
ta = a.reshape((4,3))
# ta._strides = (ta._strides[1],ta._strides[0])#not implemented
# ta._strides = (ta._strides[1],ta._strides[0])#not implemented
#elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
#
elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
#ta.gpudata += ta.size*elem_size
#
ta.gpudata += ta.size*elem_size
for
a_
,
b_
,
rep
in
[(
a
,
a
,
True
),
(
b
,
b
,
True
),
(
a
,
b
,
False
),
for
a_
,
b_
,
rep
in
[(
a
,
a
,
True
),
(
b
,
b
,
True
),
(
a
,
b
,
False
),
(
a
,
na
,
False
),
(
b
,
nb
,
False
),
(
a
,
na
,
False
),
(
b
,
nb
,
False
),
...
@@ -95,8 +94,7 @@ def test_may_share_memory_cuda():
...
@@ -95,8 +94,7 @@ def test_may_share_memory_cuda():
(
a
,
va
,
True
),
(
b
,
vb
,
True
),
(
a
,
va
,
True
),
(
b
,
vb
,
True
),
(
va
,
b
,
False
),
(
a
,
vb
,
False
),
(
va
,
b
,
False
),
(
a
,
vb
,
False
),
(
a
,
ra
,
True
),
(
b
,
rb
,
True
),
(
a
,
ra
,
True
),
(
b
,
rb
,
True
),
(
ra
,
b
,
False
),
(
a
,
rb
,
False
),
(
ra
,
b
,
False
),
(
a
,
rb
,
False
),
]:
]:
assert
may_share_memory
(
a_
,
b_
)
==
rep
assert
may_share_memory
(
a_
,
b_
)
==
rep
assert
may_share_memory
(
b_
,
a_
)
==
rep
assert
may_share_memory
(
b_
,
a_
)
==
rep
...
...
theano/sandbox/cuda/tests/test_var.py
浏览文件 @
b69ad54d
...
@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc
...
@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc
from
theano.sandbox.cuda
import
CudaNdarrayType
,
cuda_available
from
theano.sandbox.cuda
import
CudaNdarrayType
,
cuda_available
import
theano.sandbox.cuda
as
cuda
import
theano.sandbox.cuda
as
cuda
# Skip test if cuda_ndarray is not available.
# Skip test if cuda_ndarray is not available.
if
cuda_available
==
False
:
if
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
...
@@ -26,19 +26,18 @@ def test_float32_shared_constructor():
...
@@ -26,19 +26,18 @@ def test_float32_shared_constructor():
# test that broadcastable arg is accepted, and that they
# test that broadcastable arg is accepted, and that they
# don't strictly have to be tuples
# don't strictly have to be tuples
assert
eq
(
assert
eq
(
f32sc
(
npy_row
,
f32sc
(
npy_row
,
broadcastable
=
(
True
,
False
))
.
type
,
broadcastable
=
(
True
,
False
))
.
type
,
CudaNdarrayType
((
True
,
False
)))
CudaNdarrayType
((
True
,
False
)))
assert
eq
(
assert
eq
(
f32sc
(
npy_row
,
f32sc
(
npy_row
,
broadcastable
=
[
True
,
False
])
.
type
,
broadcastable
=
[
True
,
False
])
.
type
,
CudaNdarrayType
((
True
,
False
)))
CudaNdarrayType
((
True
,
False
)))
assert
eq
(
assert
eq
(
f32sc
(
npy_row
,
f32sc
(
npy_row
,
broadcastable
=
numpy
.
array
([
True
,
False
]))
.
type
,
broadcastable
=
numpy
.
array
([
True
,
False
]))
.
type
,
CudaNdarrayType
([
True
,
False
]))
CudaNdarrayType
([
True
,
False
]))
# test that we can make non-matrix shared vars
# test that we can make non-matrix shared vars
assert
eq
(
assert
eq
(
f32sc
(
numpy
.
zeros
((
2
,
3
,
4
,
5
),
dtype
=
'float32'
))
.
type
,
f32sc
(
numpy
.
zeros
((
2
,
3
,
4
,
5
),
dtype
=
'float32'
))
.
type
,
CudaNdarrayType
((
False
,)
*
4
))
CudaNdarrayType
((
False
,)
*
4
))
...
@@ -77,7 +76,8 @@ class T_updates(unittest.TestCase):
...
@@ -77,7 +76,8 @@ class T_updates(unittest.TestCase):
x
=
tensor
.
fmatrix
(
'x'
)
x
=
tensor
.
fmatrix
(
'x'
)
output_updates
=
[(
output_var
,
x
**
2
)]
output_updates
=
[(
output_var
,
x
**
2
)]
output_givens
=
{
x
:
data
}
output_givens
=
{
x
:
data
}
output_func
=
theano
.
function
(
inputs
=
[],
outputs
=
[],
output_func
=
theano
.
function
(
inputs
=
[],
outputs
=
[],
updates
=
output_updates
,
givens
=
output_givens
)
updates
=
output_updates
,
givens
=
output_givens
)
output_func
()
output_func
()
...
...
theano/sandbox/cuda/tests/test_viewop.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
import
numpy
import
unittest
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano
import
theano
...
@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
...
@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def
test_viewop_gpu
():
def
test_viewop_gpu
():
from
theano.sandbox
import
cuda
from
theano.sandbox
import
cuda
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
_x
=
theano
.
tensor
.
fvector
(
'x'
)
_x
=
theano
.
tensor
.
fvector
(
'x'
)
x
=
cuda
.
gpu_from_host
(
_x
)
x
=
cuda
.
gpu_from_host
(
_x
)
...
...
theano/sandbox/cuda/tests/walltime.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
print_function
from
__future__
import
print_function
import
sys
,
time
import
sys
import
time
from
six
import
iteritems
from
six
import
iteritems
from
theano.compile.pfunc
import
pfunc
from
theano.compile.pfunc
import
pfunc
from
theano
import
tensor
from
theano
import
tensor
...
@@ -35,35 +36,47 @@ def showtimes(times):
...
@@ -35,35 +36,47 @@ def showtimes(times):
def
cmp_sigmoids
(
shape
):
def
cmp_sigmoids
(
shape
):
def
numpy_sigmoid
(
input
):
def
numpy_sigmoid
(
input
):
rval
=
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
))
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
))
sinput
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
sinput
=
tensor
.
Tensor
(
shared_input
=
tcn
.
shared_constructor
(
numpy
.
random
.
rand
(
*
shape
),
'shared_input'
)
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
times
=
compare_fns
(
shared_input
=
tcn
.
shared_constructor
(
dict
(
numpy
=
numpy_sigmoid
numpy
.
random
.
rand
(
*
shape
),
,
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
)))
'shared_input'
)
,
theano_gpu_onboard
=
pfunc
([
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
tensor
.
exp
(
-
shared_input
)))])
times
=
compare_fns
(
dict
(
),
numpy
=
numpy_sigmoid
,
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
))),
theano_gpu_onboard
=
pfunc
(
[
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
tensor
.
exp
(
-
shared_input
)))])),
input
=
shared_input
.
value
)
input
=
shared_input
.
value
)
showtimes
(
times
)
showtimes
(
times
)
def
cmp_sigmoids_T
(
shape
):
def
cmp_sigmoids_T
(
shape
):
def
numpy_sigmoid
(
input
):
def
numpy_sigmoid
(
input
):
rval
=
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
.
T
))
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
.
T
))
sinput
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
sinput
=
tensor
.
Tensor
(
shared_input
=
tcn
.
shared_constructor
(
numpy
.
random
.
rand
(
*
shape
),
'shared_input'
)
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
times
=
compare_fns
(
shared_input
=
tcn
.
shared_constructor
(
dict
(
numpy
=
numpy_sigmoid
numpy
.
random
.
rand
(
*
shape
),
,
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
.
T
)))
'shared_input'
)
,
theano_gpu_onboard
=
pfunc
([
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
times
=
compare_fns
(
dict
(
tensor
.
exp
(
-
shared_input
.
T
)))])
numpy
=
numpy_sigmoid
,
),
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
.
T
))),
theano_gpu_onboard
=
pfunc
(
[
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
tensor
.
exp
(
-
shared_input
.
T
)))])),
input
=
shared_input
.
value
)
input
=
shared_input
.
value
)
showtimes
(
times
)
showtimes
(
times
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
eval
(
sys
.
argv
[
1
])
eval
(
sys
.
argv
[
1
])
# cmp_sigmoids((640, 64*64)) # looks great in profiler
# cmp_sigmoids((640, 64*64)) # looks great in profiler
#cmp_sigmoids((173, 74*49))
# cmp_sigmoids((173, 74*49))
#cmp_sigmoids_T((173, 74*49))
# cmp_sigmoids_T((173, 74*49))
theano/sandbox/cuda/type.py
浏览文件 @
b69ad54d
...
@@ -259,8 +259,8 @@ class CudaNdarrayType(Type):
...
@@ -259,8 +259,8 @@ class CudaNdarrayType(Type):
'complex64'
:
(
complex
,
'theano_complex64'
,
'complex64'
:
(
complex
,
'theano_complex64'
,
'NPY_COMPLEX64'
)}[
self
.
dtype
]
'NPY_COMPLEX64'
)}[
self
.
dtype
]
except
KeyError
:
except
KeyError
:
raise
TypeError
(
"Unsupported dtype for
%
s:
%
s"
%
(
raise
TypeError
(
"Unsupported dtype for
%
s:
%
s"
%
self
.
__class__
.
__name__
,
self
.
dtype
))
(
self
.
__class__
.
__name__
,
self
.
dtype
))
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
"""
"""
...
@@ -271,9 +271,10 @@ class CudaNdarrayType(Type):
...
@@ -271,9 +271,10 @@ class CudaNdarrayType(Type):
other
.
broadcastable
==
self
.
broadcastable
)
other
.
broadcastable
==
self
.
broadcastable
)
def
convert_variable
(
self
,
var
):
def
convert_variable
(
self
,
var
):
if
(
type
(
self
)
==
type
(
var
.
type
)
and
if
(
isinstance
(
self
,
type
(
var
.
type
)
)
and
self
.
ndim
==
var
.
type
.
ndim
and
self
.
ndim
==
var
.
type
.
ndim
and
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
var
.
type
.
broadcastable
))):
var
.
type
.
broadcastable
))):
return
theano
.
tensor
.
patternbroadcast
(
var
,
self
.
broadcastable
)
return
theano
.
tensor
.
patternbroadcast
(
var
,
self
.
broadcastable
)
...
@@ -312,7 +313,7 @@ class CudaNdarrayType(Type):
...
@@ -312,7 +313,7 @@ class CudaNdarrayType(Type):
return
self
.
name
return
self
.
name
else
:
else
:
b
=
self
.
broadcastable
b
=
self
.
broadcastable
#bcast = str(self.broadcastable)
#
bcast = str(self.broadcastable)
if
not
numpy
.
any
(
b
):
if
not
numpy
.
any
(
b
):
s
=
"
%
iD"
%
len
(
b
)
s
=
"
%
iD"
%
len
(
b
)
else
:
else
:
...
@@ -327,7 +328,7 @@ class CudaNdarrayType(Type):
...
@@ -327,7 +328,7 @@ class CudaNdarrayType(Type):
def
__repr__
(
self
):
def
__repr__
(
self
):
return
str
(
self
)
return
str
(
self
)
#"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
#
"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
def
c_declare
(
self
,
name
,
sub
,
check_input
=
True
):
def
c_declare
(
self
,
name
,
sub
,
check_input
=
True
):
return
""" CudaNdarray *
%(name)
s;"""
%
locals
()
return
""" CudaNdarray *
%(name)
s;"""
%
locals
()
...
@@ -563,8 +564,7 @@ theano.compile.register_deep_copy_op_c_code(
...
@@ -563,8 +564,7 @@ theano.compile.register_deep_copy_op_c_code(
CudaNdarray_HOST_DIMS(
%(oname)
s)[i]) {
CudaNdarray_HOST_DIMS(
%(oname)
s)[i]) {
alloc = true;
alloc = true;
break;
break;
}
}}
}
if(alloc) {
if(alloc) {
Py_XDECREF(
%(oname)
s);
Py_XDECREF(
%(oname)
s);
%(oname)
s = (CudaNdarray*)CudaNdarray_Copy(
%(iname)
s);
%(oname)
s = (CudaNdarray*)CudaNdarray_Copy(
%(iname)
s);
...
@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code(
...
@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code(
%(fail)
s;
%(fail)
s;
}
}
}
}
"""
,
"""
,
version
=
3
)
version
=
3
)
# THIS WORKS But CudaNdarray instances don't compare equal to one
# THIS WORKS But CudaNdarray instances don't compare equal to one
...
@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda):
...
@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda):
# In case cuda is not imported.
# In case cuda is not imported.
if
cuda
is
not
None
:
if
cuda
is
not
None
:
copyreg
.
pickle
(
cuda
.
CudaNdarray
,
CudaNdarray_pickler
,
copyreg
.
pickle
(
CudaNdarray_unpickler
)
cuda
.
CudaNdarray
,
CudaNdarray_pickler
,
CudaNdarray_unpickler
)
theano/sandbox/cuda/var.py
浏览文件 @
b69ad54d
...
@@ -13,7 +13,7 @@ try:
...
@@ -13,7 +13,7 @@ try:
# We must do those import to be able to create the full doc when nvcc
# We must do those import to be able to create the full doc when nvcc
# is not available
# is not available
from
theano.sandbox.cuda
import
filter
as
type_support_filter
from
theano.sandbox.cuda
import
filter
as
type_support_filter
from
theano.sandbox.cuda.basic_ops
import
HostFromGpu
,
GpuFromHost
from
theano.sandbox.cuda.basic_ops
import
HostFromGpu
except
ImportError
:
except
ImportError
:
pass
pass
...
@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators):
...
@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators):
def
_as_TensorVariable
(
self
):
def
_as_TensorVariable
(
self
):
return
HostFromGpu
()(
self
)
return
HostFromGpu
()(
self
)
def
_as_CudaNdarrayVariable
(
self
):
def
_as_CudaNdarrayVariable
(
self
):
return
self
return
self
...
@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature):
...
@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature):
class
CudaNdarrayConstant
(
_operators
,
Constant
):
class
CudaNdarrayConstant
(
_operators
,
Constant
):
def
signature
(
self
):
def
signature
(
self
):
return
CudaNdarrayConstantSignature
((
self
.
type
,
numpy
.
asarray
(
self
.
data
)))
return
CudaNdarrayConstantSignature
((
self
.
type
,
numpy
.
asarray
(
self
.
data
)))
def
__str__
(
self
):
def
__str__
(
self
):
if
self
.
name
is
not
None
:
if
self
.
name
is
not
None
:
return
self
.
name
return
self
.
name
...
@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant):
...
@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant):
data
=
str
(
numpy
.
asarray
(
self
.
data
))
data
=
str
(
numpy
.
asarray
(
self
.
data
))
except
Exception
as
e
:
except
Exception
as
e
:
data
=
"error while transferring the value: "
+
str
(
e
)
data
=
"error while transferring the value: "
+
str
(
e
)
return
"CudaNdarrayConstant{"
+
data
+
"}"
return
"CudaNdarrayConstant{"
+
data
+
"}"
CudaNdarrayType
.
Constant
=
CudaNdarrayConstant
CudaNdarrayType
.
Constant
=
CudaNdarrayConstant
...
...
theano/tests/test_flake8.py
浏览文件 @
b69ad54d
...
@@ -87,42 +87,8 @@ whitelist_flake8 = [
...
@@ -87,42 +87,8 @@ whitelist_flake8 = [
"sandbox/tests/test_theano_object.py"
,
"sandbox/tests/test_theano_object.py"
,
"sandbox/tests/test_scan.py"
,
"sandbox/tests/test_scan.py"
,
"sandbox/tests/__init__.py"
,
"sandbox/tests/__init__.py"
,
"sandbox/cuda/var.py"
,
"sandbox/cuda/GpuConvGrad3D.py"
,
"sandbox/cuda/basic_ops.py"
,
"sandbox/cuda/nnet.py"
,
"sandbox/cuda/elemwise.py"
,
"sandbox/cuda/type.py"
,
"sandbox/cuda/__init__.py"
,
"sandbox/cuda/__init__.py"
,
"sandbox/cuda/opt.py"
,
"sandbox/cuda/blas.py"
,
"sandbox/cuda/blocksparse.py"
,
"sandbox/cuda/rng_curand.py"
,
"sandbox/cuda/fftconv.py"
,
"sandbox/cuda/kernel_codegen.py"
,
"sandbox/cuda/GpuConvTransp3D.py"
,
"sandbox/cuda/nvcc_compiler.py"
,
"sandbox/cuda/neighbours.py"
,
"sandbox/cuda/tests/__init__.py"
,
"sandbox/cuda/tests/__init__.py"
,
"sandbox/cuda/tests/walltime.py"
,
"sandbox/cuda/tests/test_gradient.py"
,
"sandbox/cuda/tests/test_neighbours.py"
,
"sandbox/cuda/tests/test_conv_cuda_ndarray.py"
,
"sandbox/cuda/tests/test_var.py"
,
"sandbox/cuda/tests/test_opt.py"
,
"sandbox/cuda/tests/test_blas.py"
,
"sandbox/cuda/tests/test_driver.py"
,
"sandbox/cuda/tests/test_rng_curand.py"
,
"sandbox/cuda/tests/test_basic_ops.py"
,
"sandbox/cuda/tests/test_memory.py"
,
"sandbox/cuda/tests/test_mlp.py"
,
"sandbox/cuda/tests/test_bench_loopfusion.py"
,
"sandbox/cuda/tests/test_blocksparse.py"
,
"sandbox/cuda/tests/test_cuda_ndarray.py"
,
"sandbox/cuda/tests/test_tensor_op.py"
,
"sandbox/cuda/tests/test_extra_ops.py"
,
"sandbox/cuda/tests/test_gemmcorr3d.py"
,
"sandbox/cuda/tests/test_viewop.py"
,
"sandbox/gpuarray/tests/__init__.py"
,
"sandbox/gpuarray/tests/__init__.py"
,
"sandbox/scan_module/scan_utils.py"
,
"sandbox/scan_module/scan_utils.py"
,
"sandbox/scan_module/scan.py"
,
"sandbox/scan_module/scan.py"
,
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论