Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
b69ad54d
提交
b69ad54d
authored
5月 05, 2016
作者:
Xavier Bouthillier
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #4244 from ChihebTrabelsi/ccw2.0
flake8 sandbox/cuda/*.py
上级
200babca
58267dc2
全部展开
显示空白字符变更
内嵌
并排
正在显示
33 个修改的文件
包含
311 行增加
和
294 行删除
+311
-294
GpuConvGrad3D.py
theano/sandbox/cuda/GpuConvGrad3D.py
+6
-7
GpuConvTransp3D.py
theano/sandbox/cuda/GpuConvTransp3D.py
+40
-17
basic_ops.py
theano/sandbox/cuda/basic_ops.py
+0
-0
blas.py
theano/sandbox/cuda/blas.py
+45
-34
elemwise.py
theano/sandbox/cuda/elemwise.py
+0
-0
fftconv.py
theano/sandbox/cuda/fftconv.py
+13
-7
kernel_codegen.py
theano/sandbox/cuda/kernel_codegen.py
+6
-10
neighbours.py
theano/sandbox/cuda/neighbours.py
+1
-1
nnet.py
theano/sandbox/cuda/nnet.py
+32
-35
nvcc_compiler.py
theano/sandbox/cuda/nvcc_compiler.py
+5
-3
opt.py
theano/sandbox/cuda/opt.py
+0
-0
rng_curand.py
theano/sandbox/cuda/rng_curand.py
+13
-16
test_basic_ops.py
theano/sandbox/cuda/tests/test_basic_ops.py
+0
-0
test_bench_loopfusion.py
theano/sandbox/cuda/tests/test_bench_loopfusion.py
+0
-0
test_blas.py
theano/sandbox/cuda/tests/test_blas.py
+0
-0
test_conv_cuda_ndarray.py
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+0
-0
test_cuda_ndarray.py
theano/sandbox/cuda/tests/test_cuda_ndarray.py
+0
-0
test_driver.py
theano/sandbox/cuda/tests/test_driver.py
+4
-3
test_extra_ops.py
theano/sandbox/cuda/tests/test_extra_ops.py
+24
-23
test_gemmcorr3d.py
theano/sandbox/cuda/tests/test_gemmcorr3d.py
+6
-5
test_gradient.py
theano/sandbox/cuda/tests/test_gradient.py
+1
-1
test_memory.py
theano/sandbox/cuda/tests/test_memory.py
+14
-8
test_mlp.py
theano/sandbox/cuda/tests/test_mlp.py
+30
-30
test_neighbours.py
theano/sandbox/cuda/tests/test_neighbours.py
+4
-4
test_opt.py
theano/sandbox/cuda/tests/test_opt.py
+0
-0
test_rng_curand.py
theano/sandbox/cuda/tests/test_rng_curand.py
+1
-1
test_tensor_op.py
theano/sandbox/cuda/tests/test_tensor_op.py
+6
-8
test_var.py
theano/sandbox/cuda/tests/test_var.py
+10
-10
test_viewop.py
theano/sandbox/cuda/tests/test_viewop.py
+1
-2
walltime.py
theano/sandbox/cuda/tests/walltime.py
+34
-21
type.py
theano/sandbox/cuda/type.py
+11
-12
var.py
theano/sandbox/cuda/var.py
+4
-2
test_flake8.py
theano/tests/test_flake8.py
+0
-34
没有找到文件。
theano/sandbox/cuda/GpuConvGrad3D.py
浏览文件 @
b69ad54d
...
...
@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
d_
=
T
.
as_tensor_variable
(
d
)
WShape_
=
T
.
as_tensor_variable
(
WShape
)
dCdH_
=
as_cuda_ndarray_variable
(
dCdH
)
broad
=
(
False
,)
*
5
broad
=
(
False
,)
*
5
return
theano
.
Apply
(
self
,
inputs
=
[
V_
,
d_
,
WShape_
,
dCdH_
],
outputs
=
[
CudaNdarrayType
(
dtype
=
V_
.
dtype
,
broadcastable
=
broad
)()])
...
...
@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize
=
dCdH
.
shape
[
0
]
outputFilters
=
dCdH
.
shape
[
1
]
outputHeight
=
dCdH
.
shape
[
2
]
outputWidth
=
dCdH
.
shape
[
3
]
outputDur
=
dCdH
.
shape
[
4
]
assert
V
.
shape
[
0
]
==
batchSize
inputFilters
=
V
.
shape
[
1
]
inputHeight
=
V
.
shape
[
2
]
inputWidth
=
V
.
shape
[
3
]
inputDur
=
V
.
shape
[
4
]
dr
,
dc
,
dt
=
d
dCdW
=
numpy
.
zeros
(
WShape
,
dtype
=
V
.
dtype
)
...
...
@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
for
p
in
xrange
(
0
,
outputHeight
):
for
q
in
xrange
(
0
,
outputWidth
):
for
r
in
xrange
(
0
,
outputDur
):
dCdW
[
j
,
z
,
k
,
l
,
m
]
+=
dCdH
[
i
,
j
,
p
,
q
,
r
]
*
V
[
i
,
z
,
dr
*
p
+
k
,
dc
*
q
+
l
,
dt
*
r
+
m
]
dCdW
[
j
,
z
,
k
,
l
,
m
]
+=
dCdH
[
i
,
j
,
p
,
q
,
r
]
*
\
V
[
i
,
z
,
dr
*
p
+
k
,
dc
*
q
+
l
,
dt
*
r
+
m
]
output_storage
[
0
][
0
]
=
dCdW
...
...
theano/sandbox/cuda/GpuConvTransp3D.py
浏览文件 @
b69ad54d
...
...
@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
else
:
RShape_
=
T
.
as_tensor_variable
([
-
1
,
-
1
,
-
1
])
return
theano
.
Apply
(
self
,
inputs
=
[
W_
,
b_
,
d_
,
H_
,
RShape_
],
outputs
=
[
CudaNdarrayType
(
dtype
=
H_
.
dtype
,
broadcastable
=
(
False
,)
*
5
)()])
return
theano
.
Apply
(
self
,
inputs
=
[
W_
,
b_
,
d_
,
H_
,
RShape_
],
outputs
=
[
CudaNdarrayType
(
dtype
=
H_
.
dtype
,
broadcastable
=
(
False
,)
*
5
)()])
def
infer_shape
(
self
,
node
,
input_shapes
):
W
,
b
,
d
,
H
,
RShape
=
node
.
inputs
...
...
@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
assert
dc
>
0
assert
dt
>
0
videoHeight
=
(
outputHeight
-
1
)
*
dr
+
filterHeight
videoWidth
=
(
outputWidth
-
1
)
*
dc
+
filterWidth
videoDur
=
(
outputDur
-
1
)
*
dt
+
filterDur
videoHeight
=
(
outputHeight
-
1
)
*
dr
+
filterHeight
videoWidth
=
(
outputWidth
-
1
)
*
dc
+
filterWidth
videoDur
=
(
outputDur
-
1
)
*
dt
+
filterDur
if
Rshape
is
not
None
and
Rshape
[
0
]
!=
-
1
:
if
Rshape
[
0
]
<
videoHeight
:
...
...
@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
# else:
# print "No Rshape passed in"
# print "video size: "
+
str((videoHeight, videoWidth, videoDur))
# print "video size: "
+
str((videoHeight, videoWidth, videoDur))
R
=
numpy
.
zeros
(
(
batchSize
,
inputChannels
,
videoHeight
,
videoWidth
,
videoDur
)
,
dtype
=
H
.
dtype
)
R
=
numpy
.
zeros
((
batchSize
,
inputChannels
,
videoHeight
,
videoWidth
,
videoDur
),
dtype
=
H
.
dtype
)
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
# sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for
i
in
xrange
(
0
,
batchSize
):
# print '\texample '+str(i+1)+'/'+str(batchSize)
for
j
in
xrange
(
0
,
inputChannels
):
# print '\t\tfeature map '
+str(j+1)+'/'+
str(inputChannels)
# print '\t\tfeature map '
+ str(j+1) + '/' +
str(inputChannels)
for
r
in
xrange
(
0
,
videoHeight
):
# print '\t\t\trow '
+str(r+1)+
'/'+str(videoHeight)
# print '\t\t\trow '
+ str(r+1) +
'/'+str(videoHeight)
for
c
in
xrange
(
0
,
videoWidth
):
for
t
in
xrange
(
0
,
videoDur
):
R
[
i
,
j
,
r
,
c
,
t
]
=
b
[
j
]
ftc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
t
-
filterDur
+
1
)
/
float
(
dt
)))
])
fcc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
c
-
filterWidth
+
1
)
/
float
(
dc
)))
])
rc
=
max
([
0
,
int
(
numpy
.
ceil
(
float
(
r
-
filterHeight
+
1
)
/
float
(
dr
)))
])
ftc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
t
-
filterDur
+
1
)
/
float
(
dt
)
))
]
)
fcc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
c
-
filterWidth
+
1
)
/
float
(
dc
)
))
]
)
rc
=
max
(
[
0
,
int
(
numpy
.
ceil
(
float
(
r
-
filterHeight
+
1
)
/
float
(
dr
)
))
]
)
while
rc
<
outputHeight
:
rk
=
r
-
rc
*
dr
if
rk
<
0
:
...
...
@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
if
tk
<
0
:
break
R
[
i
,
j
,
r
,
c
,
t
]
+=
numpy
.
dot
(
W
[:,
j
,
rk
,
ck
,
tk
],
H
[
i
,
:,
rc
,
cc
,
tc
]
)
R
[
i
,
j
,
r
,
c
,
t
]
+=
numpy
.
dot
(
W
[:,
j
,
rk
,
ck
,
tk
],
H
[
i
,
:,
rc
,
cc
,
tc
])
tc
+=
1
""
# close loop over tc
...
...
theano/sandbox/cuda/basic_ops.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/blas.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
import
copy
import
os
import
logging
_logger
=
logging
.
getLogger
(
__name__
)
from
six
import
integer_types
from
six.moves
import
StringIO
,
reduce
import
theano
from
theano
import
Apply
from
theano
import
tensor
...
...
@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp
from
theano.sandbox.cuda.basic_ops
import
(
as_cuda_ndarray_variable
,
gpu_contiguous
)
from
theano.tensor
import
as_tensor_variable
_logger
=
logging
.
getLogger
(
__name__
)
class
GpuBatchedDot
(
GpuOp
):
...
...
@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp):
}
} else {
// copy inputs if not contiguous
"""
+
(
"
\n
"
.
join
(
"""
"""
+
(
"
\n
"
.
join
(
"""
if (( CudaNdarray_HOST_DIMS(
%(var)
s)[0] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[0] != 1
&& CudaNdarray_HOST_DIMS(
%(var)
s)[1] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[1] != 1
&& CudaNdarray_HOST_DIMS(
%(var)
s)[2] > 1 && CudaNdarray_HOST_STRIDES(
%(var)
s)[2] != 1)
...
...
@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp):
Py_XDECREF(
%(var)
s);
%(var)
s = _copy;
}
"""
%
dict
(
var
=
var
,
fail
=
fail
)
for
var
in
(
bx
,
by
)))
+
"""
"""
%
dict
(
var
=
var
,
fail
=
fail
)
for
var
in
(
bx
,
by
)))
+
"""
// fail if the output is not contiguous; we can't copy it because we
// need to write to the original memory
...
...
@@ -537,8 +532,8 @@ class GpuGemm(GpuOp):
return
'GpuGemm{no_inplace}'
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
\
and
self
.
inplace
==
other
.
inplace
)
return
(
type
(
self
)
==
type
(
other
)
and
self
.
inplace
==
other
.
inplace
)
def
__hash__
(
self
):
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
...
...
@@ -562,7 +557,7 @@ class GpuGemm(GpuOp):
return
(
4
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
#
z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
,
b
=
inputs
...
...
@@ -657,8 +652,8 @@ class GpuGemv(GpuOp):
return
'GpuGemv{no_inplace}'
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
\
and
self
.
inplace
==
other
.
inplace
)
return
(
type
(
self
)
==
type
(
other
)
and
self
.
inplace
==
other
.
inplace
)
def
__hash__
(
self
):
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
...
...
@@ -682,7 +677,7 @@ class GpuGemv(GpuOp):
return
(
3
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
#
z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
,
b
=
inputs
...
...
@@ -757,8 +752,8 @@ class GpuGer(GpuOp):
return
'GpuGer{no_inplace}'
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
\
and
self
.
inplace
==
other
.
inplace
)
return
(
type
(
self
)
==
type
(
other
)
and
self
.
inplace
==
other
.
inplace
)
def
__hash__
(
self
):
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
...
...
@@ -782,7 +777,7 @@ class GpuGer(GpuOp):
return
(
2
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
#
z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
=
inputs
...
...
@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
bottom
,
top
=
inp
[:
2
]
weights
,
=
grads
weights
=
gpu_contiguous
(
weights
)
d_bottom
=
GpuCorrMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
)(
weights
,
top
,
bottom
.
shape
[
-
2
:])
d_top
=
GpuCorrMM
(
self
.
border_mode
,
self
.
subsample
)(
bottom
,
weights
)
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),)
*
2
if
len
(
inp
)
==
4
else
()
d_bottom
=
GpuCorrMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
)(
weights
,
top
,
bottom
.
shape
[
-
2
:])
d_top
=
GpuCorrMM
(
self
.
border_mode
,
self
.
subsample
)(
bottom
,
weights
)
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),
)
*
2
if
len
(
inp
)
==
4
else
()
return
(
d_bottom
,
d_top
)
+
d_height_width
def
connection_pattern
(
self
,
node
):
...
...
@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
weights
,
top
=
inp
[:
2
]
bottom
,
=
grads
bottom
=
gpu_contiguous
(
bottom
)
d_weights
=
GpuCorrMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
)(
d_weights
=
GpuCorrMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
)(
bottom
,
top
,
weights
.
shape
[
-
2
:])
d_top
=
GpuCorrMM
(
self
.
border_mode
,
self
.
subsample
)(
bottom
,
weights
)
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),)
*
2
if
len
(
inp
)
==
4
else
()
d_top
=
GpuCorrMM
(
self
.
border_mode
,
self
.
subsample
)(
bottom
,
weights
)
d_height_width
=
(
theano
.
gradient
.
DisconnectedType
()(),
)
*
2
if
len
(
inp
)
==
4
else
()
return
(
d_weights
,
d_top
)
+
d_height_width
def
connection_pattern
(
self
,
node
):
...
...
@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
bottom
,
weights
=
inp
top
,
=
grads
top
=
gpu_contiguous
(
top
)
d_bottom
=
GpuCorr3dMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
weights
,
top
,
bottom
.
shape
[
-
3
:])
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
d_bottom
=
GpuCorr3dMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
weights
,
top
,
bottom
.
shape
[
-
3
:])
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
return
d_bottom
,
d_weights
...
...
@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
weights
,
top
=
inp
[:
2
]
bottom
,
=
grads
bottom
=
gpu_contiguous
(
bottom
)
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
d_top
=
GpuCorr3dMM
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
d_top
=
GpuCorr3dMM
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
weights
)
d_height_width_depth
=
(
theano
.
gradient
.
DisconnectedType
()(),)
*
3
if
len
(
inp
)
==
5
else
()
d_height_width_depth
=
(
theano
.
gradient
.
DisconnectedType
()(),)
\
*
3
if
len
(
inp
)
==
5
else
()
return
(
d_weights
,
d_top
)
+
d_height_width_depth
def
connection_pattern
(
self
,
node
):
...
...
@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp):
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
# def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented')
#
raise NotImplementedError('only C is implemented')
def
c_code_cache_version
(
self
):
return
(
6
)
...
...
theano/sandbox/cuda/elemwise.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/fftconv.py
浏览文件 @
b69ad54d
...
...
@@ -5,9 +5,9 @@ import numpy as np
import
theano
import
theano.tensor
as
T
from
theano.misc.pycuda_init
import
pycuda_available
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
from
theano.ifelse
import
ifelse
from
theano.misc.pycuda_init
import
pycuda_available
if
cuda_available
:
from
theano.sandbox.cuda
import
(
basic_ops
,
CudaNdarrayType
,
...
...
@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# how much values are expected.
if
border_mode
==
'valid'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
)]
elif
border_mode
==
'full'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
)]
else
:
raise
ValueError
(
'invalid mode'
)
...
...
@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
output_fft_s
=
mult_and_reduce
(
input_fft_v
,
filters_fft_v
,
input_shape
=
input_fft_v_shape
,
filter_shape
=
filters_fft_v_shape
)
#output_fft_s = input_fft_v
#
output_fft_s = input_fft_v
# reshape for IFFT
output_fft_flat
=
output_fft_s
.
reshape
((
b
*
oc
,
o0
,
o1
,
o2
//
2
+
1
,
2
))
...
...
@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# how much values are expected.
if
border_mode
==
'valid'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
),
(
f2
-
1
):(
f2
-
1
+
i2
-
f2
+
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
-
f0
+
1
),
(
f1
-
1
):(
f1
-
1
+
i1
-
f1
+
1
),
(
f2
-
1
):(
f2
-
1
+
i2
-
f2
+
1
)]
elif
border_mode
==
'full'
:
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
),
(
f2
-
1
):(
f2
-
1
+
i2
+
f2
-
1
)]
output
=
output_circ
[:,
:,
(
f0
-
1
):(
f0
-
1
+
i0
+
f0
-
1
),
(
f1
-
1
):(
f1
-
1
+
i1
+
f1
-
1
),
(
f2
-
1
):(
f2
-
1
+
i2
+
f2
-
1
)]
else
:
raise
ValueError
(
'invalid mode'
)
#output = output_circ[:, :, :, :, :]
#
output = output_circ[:, :, :, :, :]
# Rescale manually. This is just a factor that comes in during the
# trip through FFT and inverse FFT.
...
...
theano/sandbox/cuda/kernel_codegen.py
浏览文件 @
b69ad54d
...
...
@@ -167,17 +167,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
We use __i as an int variable in a loop.
"""
return
[
# get max of buf (trashing all but buf[0])
return
[
# get max of buf (trashing all but buf[0])
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'float row_max = '
+
buf
+
'[0]'
,
'__syncthreads()'
,
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
'}'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
'}'
,
'__syncthreads()'
,
inline_reduce_sum
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
...
...
@@ -186,8 +184,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
# divide each exp() result by the sum to complete the job.
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
'}'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
'}'
,
'__syncthreads()'
,
]
...
...
@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
init
=
manner_init
(
"
%(x)
s[
%(pos)
s *
%(stride_x)
s]"
%
locals
())
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(x)
s[i *
%(stride_x)
s]"
%
locals
()))
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
...
...
theano/sandbox/cuda/neighbours.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
# This is work in progress
from
theano
import
Op
,
Apply
,
tensor
from
theano
import
Apply
,
tensor
from
theano.gof
import
local_optimizer
from
theano.sandbox.cuda
import
cuda_available
,
GpuOp
...
...
theano/sandbox/cuda/nnet.py
浏览文件 @
b69ad54d
...
...
@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
"""
%
locals
()
def
c_support_code_apply
(
self
,
node
,
nodename
):
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
ret2
=
nvcc_kernel
(
"kSoftmax_fixed_shared
%
s"
%
nodename
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
ret2
=
nvcc_kernel
(
"kSoftmax_fixed_shared
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
body
=
[
"extern __shared__ float buf[]"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"const float *x_ptr = &x[blockIDX * sx0]"
,
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
'sm_ptr'
,
'sm_s1'
,
'threadIdx.x'
,
'blockDim.x'
),
"__syncthreads()"
,
"}"
,
])
'threadIdx.x'
,
'blockDim.x'
),
"__syncthreads()"
,
"}"
,
])
return
ret1
+
"
\n
"
+
ret2
gpu_softmax
=
GpuSoftmax
()
...
...
@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
'const float * x'
,
'const int sx0'
,
'const int sx1'
,
'const float * b'
,
'const int sb0'
,
'float * sm'
,
'const int sm_s0'
,
'const int sm_s1'
],
body
=
[
"extern __shared__ float buf[]"
,
body
=
[
"extern __shared__ float buf[]"
,
"float * buf2 = buf + N"
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] = x[blockIDX * sx0 + tx * sx1]"
,
"buf[tx] += b[tx * sb0]"
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
"}"
,
])
ret2
=
nvcc_kernel
(
"kSoftmaxWithBias_fixed_shared
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
'const float * x'
,
...
...
@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
"float *sm_ptr = &sm[blockIDX * sm_s0]"
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
'sm_ptr'
,
'sm_s1'
,
'sm_ptr'
,
'sm_s1'
,
'threadIdx.x'
,
'blockDim.x'
,
'b'
,
'sb0'
),
...
...
theano/sandbox/cuda/nvcc_compiler.py
浏览文件 @
b69ad54d
...
...
@@ -4,7 +4,6 @@ import logging
import
os
import
subprocess
import
sys
import
warnings
from
locale
import
getpreferredencoding
import
numpy
...
...
@@ -249,7 +248,8 @@ class NVCC_compiler(Compiler):
_logger
.
debug
(
'Writing module C++ code to
%
s'
,
cppfilename
)
cppfile
.
write
(
src_code
)
lib_filename
=
os
.
path
.
join
(
location
,
'
%
s.
%
s'
%
lib_filename
=
os
.
path
.
join
(
location
,
'
%
s.
%
s'
%
(
module_name
,
get_lib_extension
()))
_logger
.
debug
(
'Generating shared lib
%
s'
,
lib_filename
)
...
...
@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
indexof
=
cmd
.
index
(
'-u'
)
cmd
.
pop
(
indexof
)
# Remove -u
cmd
.
pop
(
indexof
)
# Remove argument to -u
except
ValueError
as
e
:
except
ValueError
:
done
=
True
# CUDA Toolkit v4.1 Known Issues:
...
...
@@ -364,6 +364,8 @@ class NVCC_compiler(Compiler):
console_encoding
=
getpreferredencoding
()
nvcc_stdout
=
decode_with
(
nvcc_stdout_raw
,
console_encoding
)
nvcc_stderr
=
decode_with
(
nvcc_stderr_raw
,
console_encoding
)
p
=
subprocess
.
Popen
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
finally
:
os
.
chdir
(
orig_dir
)
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/rng_curand.py
浏览文件 @
b69ad54d
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
from
__future__
import
absolute_import
,
print_function
,
division
__authors__
=
"James Bergstra"
__copyright__
=
"(c) 2011, University of Montreal"
__license__
=
"3-clause BSD License"
__contact__
=
"theano-dev@googlegroups.com"
import
numpy
import
theano.gof
from
theano.compat
import
PY3
...
...
@@ -17,6 +7,15 @@ from theano.tensor import (get_vector_length, cast, opt)
from
theano.compile
import
optdb
from
theano.gof
import
local_optimizer
,
Variable
__authors__
=
"James Bergstra"
__copyright__
=
"(c) 2011, University of Montreal"
__license__
=
"3-clause BSD License"
__contact__
=
"theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
config
=
theano
.
config
...
...
@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
Return a tuple of attributes that define the Op.
"""
return
(
self
.
destructive
,
return
(
self
.
destructive
,
self
.
output_type
,
self
.
seed
,
)
...
...
@@ -101,8 +99,7 @@ class CURAND_Base(GpuOp):
v_size
=
theano
.
tensor
.
as_tensor_variable
(
size
)
if
ndim
is
None
:
ndim
=
get_vector_length
(
v_size
)
self
=
cls
(
output_type
=
CudaNdarrayType
((
False
,)
*
ndim
),
self
=
cls
(
output_type
=
CudaNdarrayType
((
False
,)
*
ndim
),
seed
=
seed
,
destructive
=
False
)
...
...
@@ -386,5 +383,5 @@ def local_destructive(node):
return
new_op
.
make_node
(
*
node
.
inputs
)
.
outputs
return
False
optdb
.
register
(
'CURAND_destructive'
,
opt
.
in2out
(
local_destructive
,
ignore_newtrees
=
True
),
99
,
'fast_run'
,
'inplace'
)
opt
.
in2out
(
local_destructive
,
ignore_newtrees
=
True
)
,
99
,
'fast_run'
,
'inplace'
)
theano/sandbox/cuda/tests/test_basic_ops.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_bench_loopfusion.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_blas.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_cuda_ndarray.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_driver.py
浏览文件 @
b69ad54d
...
...
@@ -6,7 +6,7 @@ import theano
try
:
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
except
ImportError
:
# To have the GPU back-end work without nose, we need this file to
...
...
@@ -33,8 +33,9 @@ def test_nvidia_driver1():
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
2
if
sum
(
isinstance
(
node
.
op
,
B
.
GpuCAReduce
)
for
node
in
topo
)
!=
1
:
msg
=
'
\n\t
'
.
join
([
'Expected exactly one occurrence of GpuCAReduce '
+
'but got:'
]
+
[
str
(
app
)
for
app
in
topo
])
msg
=
'
\n\t
'
.
join
(
[
'Expected exactly one occurrence of GpuCAReduce '
+
'but got:'
]
+
[
str
(
app
)
for
app
in
topo
])
raise
AssertionError
(
msg
)
if
not
numpy
.
allclose
(
f
(),
a
.
sum
()):
raise
Exception
(
"The nvidia driver version installed with this OS "
...
...
theano/sandbox/cuda/tests/test_extra_ops.py
浏览文件 @
b69ad54d
...
...
@@ -5,24 +5,22 @@ import itertools
from
nose.plugins.skip
import
SkipTest
import
numpy
as
np
from
six.moves
import
xrange
from
theano
import
tensor
as
T
import
theano
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests
import
unittest_tools
as
utt
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
is
False
:
if
cuda_ndarray
.
cuda_available
:
import
theano.tensor.tests.test_extra_ops
from
theano.sandbox.cuda.extra_ops
import
GpuCumsum
else
:
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor.tests.test_extra_ops
from
theano.sandbox.cuda.extra_ops
import
GpuCumsum
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
from
theano
import
tensor
as
T
import
theano
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests
import
unittest_tools
as
utt
class
TestGpuCumsum
(
theano
.
tensor
.
tests
.
test_extra_ops
.
TestCumsumOp
):
mode
=
mode_with_gpu
...
...
@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt
.
assert_allclose
(
np
.
cumsum
(
a
[:
i
]),
f
(
a
[:
i
]))
# Use multiple GPU threadblocks
a
=
np
.
random
.
random
((
block_max_size
+
2
,))
.
astype
(
"float32"
)
a
=
np
.
random
.
random
((
block_max_size
+
2
,))
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
# Use recursive cumsum
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
dtype
=
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
...
...
@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks
a_shape
=
[
5
,
5
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks
a_shape
=
[
4
,
4
]
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
),
rtol
=
5e-5
)
# Use recursive cumsum
a_shape
=
[
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum3D
(
self
):
...
...
@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis)
a_shape
=
[
2
,
2
,
2
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks (not along accumulation axis)
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use recursive cumsum (along accumulation axis)
a_shape
=
[
3
,
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum4D
(
self
):
...
...
theano/sandbox/cuda/tests/test_gemmcorr3d.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
import
unittest
import
numpy
import
copy
import
theano
from
theano.tests
import
unittest_tools
as
utt
# Skip tests if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda not available'
)
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
from
theano.sandbox.cuda.blas
import
(
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
)
from
theano.sandbox.cuda.basic_ops
import
gpu_contiguous
import
theano.sandbox.cuda
as
cuda_ndarray
if
not
cuda_ndarray
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda not available'
)
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
...
@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
inputs
=
shared
(
inputs_val
)
filters
=
shared
(
filters_val
)
bias
=
shared
(
numpy
.
zeros
(
filters_shape
[
4
])
.
astype
(
'float32'
))
conv
=
theano
.
tensor
.
nnet
.
convTransp3D
(
W
=
filters
,
b
=
bias
,
d
=
subsample
,
conv
=
theano
.
tensor
.
nnet
.
convTransp3D
(
W
=
filters
,
b
=
bias
,
d
=
subsample
,
H
=
inputs
)
f_ref
=
theano
.
function
([],
conv
)
res_ref
=
f_ref
()
...
...
theano/sandbox/cuda/tests/test_gradient.py
浏览文件 @
b69ad54d
...
...
@@ -8,7 +8,7 @@ from theano.sandbox import cuda
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
theano/sandbox/cuda/tests/test_memory.py
浏览文件 @
b69ad54d
...
...
@@ -11,7 +11,7 @@ from theano import ifelse
# Skip test if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
theano_alloc
=
cuda
.
cuda_ndarray
.
cuda_ndarray
.
theano_allocated
()
return
(
"(n malloc/theano mem allocated in KB)"
,
n_mallocs
+
extra_alloc
,
int
(
theano_alloc
/
1024
)
+
extra_size
)
int
(
theano_alloc
/
1024
))
return
(
"n malloc on the gpu"
,
n_mallocs
+
extra_alloc
)
# I don't use the following by default as if there is other stuff running
...
...
@@ -83,9 +83,12 @@ def test_memory():
variables
=
cuda
.
shared_constructor
(
np
.
ones
((
shapes
[
1
],),
dtype
=
'float32'
))
derp
=
tensor
.
sum
(
tensor
.
dot
(
some_matrix
[:
shapes
[
0
]],
variables
))
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
mem2
=
freemem
()
print
(
"Before compilation"
,
mem2
)
...
...
@@ -112,7 +115,7 @@ def test_memory():
del
obj
# print "After deleting function 1", freemem()
#assert mem2 == freemem(), (mem2, freemem())
#
assert mem2 == freemem(), (mem2, freemem())
del
grad
print
(
"After deleting function 2"
,
freemem
())
...
...
@@ -155,16 +158,19 @@ def test_memory_lazy():
derp
=
ifelse
.
IfElse
(
1
)(
branch_select
,
derp
,
some_matrix
[:
shapes
[
0
]]
.
sum
())
derp
+=
1
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
print
(
"Shared took "
,
np
.
prod
(
variables
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
return_internal_type
=
True
)
.
shape
)
*
4
/
1024
,
"kB"
)
mem2
=
freemem
()
print
(
"Before compilation"
,
mem2
)
mem2_1
=
freemem
(
extra_alloc
=
more_alloc1
)
obj
=
theano
.
function
([
some_vector
,
branch_select
],
derp
,
mode
=
mode_with_gpu
)
#theano.printing.debugprint(obj, print_type=True)
#
theano.printing.debugprint(obj, print_type=True)
mem3
=
freemem
()
print
(
"After function compilation 1"
,
mem3
)
assert
mem2_1
==
mem3
,
(
mem2_1
,
mem3
)
...
...
theano/sandbox/cuda/tests/test_mlp.py
浏览文件 @
b69ad54d
...
...
@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
'otherwise it is too slow!'
)
# Skip test if cuda_ndarray is not available.
if
tcn
.
cuda_available
==
False
:
if
tcn
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
@@ -147,19 +147,20 @@ def test_run_nnet():
rtol
=
1e-4
if
n_in
*
n_hid
>=
2048
*
4096
:
rtol
=
7e-4
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
rtol
,
atol
=
1e-6
),
\
assert
numpy
.
allclose
(
rval_cpu
,
rval_gpu
,
rtol
=
rtol
,
atol
=
1e-6
),
\
(
"max_abs_diff, max_rel_diff, n_in, n_hid"
,
max_abs_diff
,
rel_diff
.
max
(),
n_in
,
n_hid
)
def
test_run_nnet_med
():
utt
.
seed_rng
()
r
val_cpu
=
r
un_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
run_nnet
(
False
,
10
,
128
,
50
,
4
,
n_train
=
10000
)
def
test_run_nnet_small
():
utt
.
seed_rng
()
r
val_cpu
=
r
un_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
run_nnet
(
False
,
10
,
10
,
4
,
4
,
n_train
=
100000
)
def
run_conv_nnet1
(
use_gpu
):
...
...
@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
mode
=
get_mode
(
use_gpu
)
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
train
=
pfunc
(
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
...
...
@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
conv_op
=
conv
.
ConvOp
(
shape_img
[
2
:],
shape_kern
[
2
:],
n_kern
,
n_batch
,
1
,
1
)
conv_op1
=
conv
.
ConvOp
((
n_kern
,
logical_hid_shape
[
0
]
//
2
,
logical_hid_shape
[
1
]
//
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
logical_hid_shape
[
1
]
//
2
),
shape_kern1
[
2
:],
n_kern1
,
n_batch
,
1
,
1
)
hid
=
tensor
.
tanh
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
[:,
:,
::
2
,
::
2
],
w1
)
+
b1
.
dimshuffle
((
...
...
@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
mode
=
get_mode
(
use_gpu
)
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
train
=
pfunc
(
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
...
...
@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
if
downsample_ops
:
hid
=
tensor
.
tanh
(
ds_op
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
))))
else
:
hid
=
tensor
.
tanh
((
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
((
0
,
'x'
,
'x'
)
))[:,
:,
::
2
,
::
2
])
hid
=
tensor
.
tanh
(
(
conv_op
(
x
,
w0
)
+
b0
.
dimshuffle
(
(
0
,
'x'
,
'x'
)))[:,
:,
::
2
,
::
2
])
hid1
=
tensor
.
tanh
(
conv_op1
(
hid
,
w1
)
+
b1
.
dimshuffle
((
0
,
'x'
,
'x'
)))
hid_flat
=
hid1
.
reshape
((
n_batch
,
n_hid
))
out
=
tensor
.
nnet
.
softmax
(
tensor
.
dot
(
hid_flat
,
v
)
+
c
)
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
out
,
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
loss
=
tensor
.
sum
(
tensor
.
nnet
.
crossentropy_categorical_1hot
(
out
,
tensor
.
argmax
(
y
,
axis
=
1
))
*
lr
)
# print 'loss type', loss.type
params
=
[
w0
,
b0
,
w1
,
b1
,
v
,
c
]
...
...
@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
mode
=
get_mode
(
use_gpu
,
check_isfinite
)
# print 'building pfunc ...'
train
=
pfunc
([
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
train
=
pfunc
(
[
x
,
y
,
lr
],
[
loss
],
mode
=
mode
,
updates
=
[(
p
,
p
-
g
)
for
p
,
g
in
zip
(
params
,
gparams
)])
if
verbose
:
theano
.
printing
.
debugprint
(
train
)
...
...
@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
lr
=
theano
.
_asarray
(
0.01
,
dtype
=
'float32'
)
rvals
=
my_zeros
(
n_train
)
t0
=
time
.
time
()
for
i
in
xrange
(
n_train
):
rvals
[
i
]
=
train
(
xval
,
yval
,
lr
)[
0
]
t1
=
time
.
time
()
print_mode
(
mode
)
if
pickle
and
isinstance
(
mode
,
theano
.
compile
.
ProfileMode
):
...
...
@@ -495,7 +506,8 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
compare
=
True
if
not
compare
:
return
run_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
return
run_conv_nnet2_classif
(
use_gpu
=
use_gpu
,
seed
=
seed
,
isize
=
isize
,
ksize
=
ksize
,
bsize
=
bsize
,
n_train
=
n_train
,
check_isfinite
=
check_isfinite
,
...
...
@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
finally
:
theano
.
tensor
.
basic
.
float32_atol
=
orig_float32_atol
if
pickle
:
if
isinstance
(
cpu_mode
,
theano
.
compile
.
ProfileMode
):
import
pickle
print
(
"BEGIN CPU profile mode dump"
)
print
(
pickle
.
dumps
(
cpu_mode
))
print
(
"END CPU profile mode dump"
)
if
isinstance
(
gpu_mode
,
theano
.
compile
.
ProfileMode
):
import
pickle
print
(
"BEGIN GPU profile mode dump"
)
print
(
pickle
.
dumps
(
gpu_mode
))
print
(
"END GPU profile mode dump"
)
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# (time_cpu, time_gpu, time_cpu/time_gpu))
# print "Estimated time for one pass through MNIST with CPU: %f" % (
...
...
theano/sandbox/cuda/tests/test_neighbours.py
浏览文件 @
b69ad54d
# Skip test if cuda_ndarray is not available.
from
__future__
import
absolute_import
,
print_function
,
division
from
nose.plugins.skip
import
SkipTest
import
unittest
import
theano.tensor.nnet.tests.test_neighbours
from
theano.sandbox.cuda.neighbours
import
GpuImages2Neibs
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
import
theano.tensor.nnet.tests.test_neighbours
from
theano.sandbox.cuda.neighbours
import
GpuImages2Neibs
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
...
...
theano/sandbox/cuda/tests/test_opt.py
浏览文件 @
b69ad54d
差异被折叠。
点击展开。
theano/sandbox/cuda/tests/test_rng_curand.py
浏览文件 @
b69ad54d
...
...
@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams
# Skip tests if cuda_ndarray is not available.
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
# The PyCObject that represents the cuda random stream object
...
...
theano/sandbox/cuda/tests/test_tensor_op.py
浏览文件 @
b69ad54d
...
...
@@ -2,7 +2,6 @@
This file test tensor op that should also operate on CudaNdaray.
"""
from
__future__
import
absolute_import
,
print_function
,
division
import
copy
from
nose.plugins.skip
import
SkipTest
import
numpy
...
...
@@ -14,7 +13,7 @@ import theano.tensor as T
# Skip test if cuda_ndarray is not available.
import
theano.sandbox.cuda
as
cuda
from
theano.tensor.nnet.tests
import
test_conv3d2d
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
@@ -57,7 +56,7 @@ def test_softmax_optimizations():
one_of_n
=
tensor
.
lvector
(
'one_of_n'
)
op
=
crossentropy_categorical_1hot
xe
=
op
(
x
,
one_of_n
)
op
(
x
,
one_of_n
)
fgraph
=
theano
.
gof
.
FunctionGraph
(
[
x
,
one_of_n
],
...
...
@@ -84,10 +83,10 @@ def test_may_share_memory_cuda():
# can't test the transpose as ta._strides = is not implemented
# manual transpose of a
#ta = a.reshape((4,3))
#
ta = a.reshape((4,3))
# ta._strides = (ta._strides[1],ta._strides[0])#not implemented
#elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
#ta.gpudata += ta.size*elem_size
#
elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
#
ta.gpudata += ta.size*elem_size
for
a_
,
b_
,
rep
in
[(
a
,
a
,
True
),
(
b
,
b
,
True
),
(
a
,
b
,
False
),
(
a
,
na
,
False
),
(
b
,
nb
,
False
),
...
...
@@ -95,8 +94,7 @@ def test_may_share_memory_cuda():
(
a
,
va
,
True
),
(
b
,
vb
,
True
),
(
va
,
b
,
False
),
(
a
,
vb
,
False
),
(
a
,
ra
,
True
),
(
b
,
rb
,
True
),
(
ra
,
b
,
False
),
(
a
,
rb
,
False
),
]:
(
ra
,
b
,
False
),
(
a
,
rb
,
False
),
]:
assert
may_share_memory
(
a_
,
b_
)
==
rep
assert
may_share_memory
(
b_
,
a_
)
==
rep
...
...
theano/sandbox/cuda/tests/test_var.py
浏览文件 @
b69ad54d
...
...
@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc
from
theano.sandbox.cuda
import
CudaNdarrayType
,
cuda_available
import
theano.sandbox.cuda
as
cuda
# Skip test if cuda_ndarray is not available.
if
cuda_available
==
False
:
if
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
...
...
@@ -26,19 +26,18 @@ def test_float32_shared_constructor():
# test that broadcastable arg is accepted, and that they
# don't strictly have to be tuples
assert
eq
(
f32sc
(
npy_row
,
broadcastable
=
(
True
,
False
))
.
type
,
assert
eq
(
f32sc
(
npy_row
,
broadcastable
=
(
True
,
False
))
.
type
,
CudaNdarrayType
((
True
,
False
)))
assert
eq
(
f32sc
(
npy_row
,
broadcastable
=
[
True
,
False
])
.
type
,
assert
eq
(
f32sc
(
npy_row
,
broadcastable
=
[
True
,
False
])
.
type
,
CudaNdarrayType
((
True
,
False
)))
assert
eq
(
f32sc
(
npy_row
,
broadcastable
=
numpy
.
array
([
True
,
False
]))
.
type
,
assert
eq
(
f32sc
(
npy_row
,
broadcastable
=
numpy
.
array
([
True
,
False
]))
.
type
,
CudaNdarrayType
([
True
,
False
]))
# test that we can make non-matrix shared vars
assert
eq
(
f32sc
(
numpy
.
zeros
((
2
,
3
,
4
,
5
),
dtype
=
'float32'
))
.
type
,
assert
eq
(
f32sc
(
numpy
.
zeros
((
2
,
3
,
4
,
5
),
dtype
=
'float32'
))
.
type
,
CudaNdarrayType
((
False
,)
*
4
))
...
...
@@ -77,7 +76,8 @@ class T_updates(unittest.TestCase):
x
=
tensor
.
fmatrix
(
'x'
)
output_updates
=
[(
output_var
,
x
**
2
)]
output_givens
=
{
x
:
data
}
output_func
=
theano
.
function
(
inputs
=
[],
outputs
=
[],
output_func
=
theano
.
function
(
inputs
=
[],
outputs
=
[],
updates
=
output_updates
,
givens
=
output_givens
)
output_func
()
...
...
theano/sandbox/cuda/tests/test_viewop.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
import
unittest
from
nose.plugins.skip
import
SkipTest
import
theano
...
...
@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def
test_viewop_gpu
():
from
theano.sandbox
import
cuda
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
is
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
_x
=
theano
.
tensor
.
fvector
(
'x'
)
x
=
cuda
.
gpu_from_host
(
_x
)
...
...
theano/sandbox/cuda/tests/walltime.py
浏览文件 @
b69ad54d
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
print_function
import
sys
,
time
import
sys
import
time
from
six
import
iteritems
from
theano.compile.pfunc
import
pfunc
from
theano
import
tensor
...
...
@@ -35,35 +36,47 @@ def showtimes(times):
def
cmp_sigmoids
(
shape
):
def
numpy_sigmoid
(
input
):
rval
=
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
))
sinput
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
shared_input
=
tcn
.
shared_constructor
(
numpy
.
random
.
rand
(
*
shape
),
'shared_input'
)
times
=
compare_fns
(
dict
(
numpy
=
numpy_sigmoid
,
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
)))
,
theano_gpu_onboard
=
pfunc
([
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
tensor
.
exp
(
-
shared_input
)))])
),
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
))
sinput
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
shared_input
=
tcn
.
shared_constructor
(
numpy
.
random
.
rand
(
*
shape
),
'shared_input'
)
times
=
compare_fns
(
dict
(
numpy
=
numpy_sigmoid
,
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
))),
theano_gpu_onboard
=
pfunc
(
[
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
tensor
.
exp
(
-
shared_input
)))])),
input
=
shared_input
.
value
)
showtimes
(
times
)
def
cmp_sigmoids_T
(
shape
):
def
numpy_sigmoid
(
input
):
rval
=
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
.
T
))
sinput
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
shared_input
=
tcn
.
shared_constructor
(
numpy
.
random
.
rand
(
*
shape
),
'shared_input'
)
times
=
compare_fns
(
dict
(
numpy
=
numpy_sigmoid
,
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
.
T
)))
,
theano_gpu_onboard
=
pfunc
([
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
tensor
.
exp
(
-
shared_input
.
T
)))])
),
1.0
/
(
1.0
+
numpy
.
exp
(
-
input
.
T
))
sinput
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
(
0
,)
*
len
(
shape
))()
shared_input
=
tcn
.
shared_constructor
(
numpy
.
random
.
rand
(
*
shape
),
'shared_input'
)
times
=
compare_fns
(
dict
(
numpy
=
numpy_sigmoid
,
theano_cpu
=
pfunc
([
sinput
],
1.0
/
(
1.0
+
tensor
.
exp
(
-
sinput
.
T
))),
theano_gpu_onboard
=
pfunc
(
[
sinput
],
[],
updates
=
[(
shared_input
,
1.0
/
(
1.0
+
tensor
.
exp
(
-
shared_input
.
T
)))])),
input
=
shared_input
.
value
)
showtimes
(
times
)
if
__name__
==
'__main__'
:
eval
(
sys
.
argv
[
1
])
# cmp_sigmoids((640, 64*64)) # looks great in profiler
#cmp_sigmoids((173, 74*49))
#cmp_sigmoids_T((173, 74*49))
# cmp_sigmoids((173, 74*49))
# cmp_sigmoids_T((173, 74*49))
theano/sandbox/cuda/type.py
浏览文件 @
b69ad54d
...
...
@@ -259,8 +259,8 @@ class CudaNdarrayType(Type):
'complex64'
:
(
complex
,
'theano_complex64'
,
'NPY_COMPLEX64'
)}[
self
.
dtype
]
except
KeyError
:
raise
TypeError
(
"Unsupported dtype for
%
s:
%
s"
%
(
self
.
__class__
.
__name__
,
self
.
dtype
))
raise
TypeError
(
"Unsupported dtype for
%
s:
%
s"
%
(
self
.
__class__
.
__name__
,
self
.
dtype
))
def
__eq__
(
self
,
other
):
"""
...
...
@@ -271,9 +271,10 @@ class CudaNdarrayType(Type):
other
.
broadcastable
==
self
.
broadcastable
)
def
convert_variable
(
self
,
var
):
if
(
type
(
self
)
==
type
(
var
.
type
)
and
if
(
isinstance
(
self
,
type
(
var
.
type
)
)
and
self
.
ndim
==
var
.
type
.
ndim
and
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
var
.
type
.
broadcastable
))):
return
theano
.
tensor
.
patternbroadcast
(
var
,
self
.
broadcastable
)
...
...
@@ -312,7 +313,7 @@ class CudaNdarrayType(Type):
return
self
.
name
else
:
b
=
self
.
broadcastable
#bcast = str(self.broadcastable)
#
bcast = str(self.broadcastable)
if
not
numpy
.
any
(
b
):
s
=
"
%
iD"
%
len
(
b
)
else
:
...
...
@@ -327,7 +328,7 @@ class CudaNdarrayType(Type):
def
__repr__
(
self
):
return
str
(
self
)
#"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
#
"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
def
c_declare
(
self
,
name
,
sub
,
check_input
=
True
):
return
""" CudaNdarray *
%(name)
s;"""
%
locals
()
...
...
@@ -563,8 +564,7 @@ theano.compile.register_deep_copy_op_c_code(
CudaNdarray_HOST_DIMS(
%(oname)
s)[i]) {
alloc = true;
break;
}
}
}}
if(alloc) {
Py_XDECREF(
%(oname)
s);
%(oname)
s = (CudaNdarray*)CudaNdarray_Copy(
%(iname)
s);
...
...
@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code(
%(fail)
s;
}
}
"""
,
version
=
3
)
"""
,
version
=
3
)
# THIS WORKS But CudaNdarray instances don't compare equal to one
...
...
@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda):
# In case cuda is not imported.
if
cuda
is
not
None
:
copyreg
.
pickle
(
cuda
.
CudaNdarray
,
CudaNdarray_pickler
,
CudaNdarray_unpickler
)
copyreg
.
pickle
(
cuda
.
CudaNdarray
,
CudaNdarray_pickler
,
CudaNdarray_unpickler
)
theano/sandbox/cuda/var.py
浏览文件 @
b69ad54d
...
...
@@ -13,7 +13,7 @@ try:
# We must do those import to be able to create the full doc when nvcc
# is not available
from
theano.sandbox.cuda
import
filter
as
type_support_filter
from
theano.sandbox.cuda.basic_ops
import
HostFromGpu
,
GpuFromHost
from
theano.sandbox.cuda.basic_ops
import
HostFromGpu
except
ImportError
:
pass
...
...
@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators):
def
_as_TensorVariable
(
self
):
return
HostFromGpu
()(
self
)
def
_as_CudaNdarrayVariable
(
self
):
return
self
...
...
@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature):
class
CudaNdarrayConstant
(
_operators
,
Constant
):
def
signature
(
self
):
return
CudaNdarrayConstantSignature
((
self
.
type
,
numpy
.
asarray
(
self
.
data
)))
def
__str__
(
self
):
if
self
.
name
is
not
None
:
return
self
.
name
...
...
@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant):
data
=
str
(
numpy
.
asarray
(
self
.
data
))
except
Exception
as
e
:
data
=
"error while transferring the value: "
+
str
(
e
)
return
"CudaNdarrayConstant{"
+
data
+
"}"
return
"CudaNdarrayConstant{"
+
data
+
"}"
CudaNdarrayType
.
Constant
=
CudaNdarrayConstant
...
...
theano/tests/test_flake8.py
浏览文件 @
b69ad54d
...
...
@@ -87,42 +87,8 @@ whitelist_flake8 = [
"sandbox/tests/test_theano_object.py"
,
"sandbox/tests/test_scan.py"
,
"sandbox/tests/__init__.py"
,
"sandbox/cuda/var.py"
,
"sandbox/cuda/GpuConvGrad3D.py"
,
"sandbox/cuda/basic_ops.py"
,
"sandbox/cuda/nnet.py"
,
"sandbox/cuda/elemwise.py"
,
"sandbox/cuda/type.py"
,
"sandbox/cuda/__init__.py"
,
"sandbox/cuda/opt.py"
,
"sandbox/cuda/blas.py"
,
"sandbox/cuda/blocksparse.py"
,
"sandbox/cuda/rng_curand.py"
,
"sandbox/cuda/fftconv.py"
,
"sandbox/cuda/kernel_codegen.py"
,
"sandbox/cuda/GpuConvTransp3D.py"
,
"sandbox/cuda/nvcc_compiler.py"
,
"sandbox/cuda/neighbours.py"
,
"sandbox/cuda/tests/__init__.py"
,
"sandbox/cuda/tests/walltime.py"
,
"sandbox/cuda/tests/test_gradient.py"
,
"sandbox/cuda/tests/test_neighbours.py"
,
"sandbox/cuda/tests/test_conv_cuda_ndarray.py"
,
"sandbox/cuda/tests/test_var.py"
,
"sandbox/cuda/tests/test_opt.py"
,
"sandbox/cuda/tests/test_blas.py"
,
"sandbox/cuda/tests/test_driver.py"
,
"sandbox/cuda/tests/test_rng_curand.py"
,
"sandbox/cuda/tests/test_basic_ops.py"
,
"sandbox/cuda/tests/test_memory.py"
,
"sandbox/cuda/tests/test_mlp.py"
,
"sandbox/cuda/tests/test_bench_loopfusion.py"
,
"sandbox/cuda/tests/test_blocksparse.py"
,
"sandbox/cuda/tests/test_cuda_ndarray.py"
,
"sandbox/cuda/tests/test_tensor_op.py"
,
"sandbox/cuda/tests/test_extra_ops.py"
,
"sandbox/cuda/tests/test_gemmcorr3d.py"
,
"sandbox/cuda/tests/test_viewop.py"
,
"sandbox/gpuarray/tests/__init__.py"
,
"sandbox/scan_module/scan_utils.py"
,
"sandbox/scan_module/scan.py"
,
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论