Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
4814cd99
提交
4814cd99
authored
10月 10, 2015
作者:
Pascal Lamblin
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #3482 from abergeron/multi_gpu_new2
Multi-gpu support
上级
6ca7b2b6
ec927f7d
隐藏空白字符变更
内嵌
并排
正在显示
46 个修改的文件
包含
1418 行增加
和
1178 行删除
+1418
-1178
__init__.py
theano/__init__.py
+2
-1
configdefaults.py
theano/configdefaults.py
+23
-0
check_multi_gpu.py
theano/misc/check_multi_gpu.py
+67
-0
basic_ops.py
theano/sandbox/cuda/basic_ops.py
+2
-8
opt.py
theano/sandbox/cuda/opt.py
+4
-1
__init__.py
theano/sandbox/gpuarray/__init__.py
+25
-10
basic_ops.py
theano/sandbox/gpuarray/basic_ops.py
+271
-118
blas.py
theano/sandbox/gpuarray/blas.py
+91
-78
conv.cu
theano/sandbox/gpuarray/conv.cu
+5
-238
conv.py
theano/sandbox/gpuarray/conv.py
+28
-77
conv_kernel.cu
theano/sandbox/gpuarray/conv_kernel.cu
+2
-2
dnn.py
theano/sandbox/gpuarray/dnn.py
+129
-93
dnn_base.c
theano/sandbox/gpuarray/dnn_base.c
+3
-3
dnn_fwd.c
theano/sandbox/gpuarray/dnn_fwd.c
+2
-2
dnn_gi.c
theano/sandbox/gpuarray/dnn_gi.c
+2
-2
dnn_gw.c
theano/sandbox/gpuarray/dnn_gw.c
+2
-2
dnn_pool.c
theano/sandbox/gpuarray/dnn_pool.c
+2
-2
dnn_pool_grad.c
theano/sandbox/gpuarray/dnn_pool_grad.c
+3
-3
dnn_softmax.c
theano/sandbox/gpuarray/dnn_softmax.c
+2
-2
dnn_softmax_grad.c
theano/sandbox/gpuarray/dnn_softmax_grad.c
+2
-2
elemwise.py
theano/sandbox/gpuarray/elemwise.py
+76
-81
gemm16.c
theano/sandbox/gpuarray/gemm16.c
+3
-2
neighbours.py
theano/sandbox/gpuarray/neighbours.py
+14
-7
nerv.py
theano/sandbox/gpuarray/nerv.py
+13
-30
nnet.py
theano/sandbox/gpuarray/nnet.py
+53
-35
opt.py
theano/sandbox/gpuarray/opt.py
+206
-134
opt_util.py
theano/sandbox/gpuarray/opt_util.py
+3
-2
pycuda_helper.py
theano/sandbox/gpuarray/pycuda_helper.py
+0
-22
subtensor.py
theano/sandbox/gpuarray/subtensor.py
+49
-41
config.py
theano/sandbox/gpuarray/tests/config.py
+26
-0
test_basic_ops.py
theano/sandbox/gpuarray/tests/test_basic_ops.py
+25
-53
test_blas.py
theano/sandbox/gpuarray/tests/test_blas.py
+3
-3
test_conv_cuda_ndarray.py
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+5
-5
test_dnn.py
theano/sandbox/gpuarray/tests/test_dnn.py
+19
-19
test_elemwise.py
theano/sandbox/gpuarray/tests/test_elemwise.py
+31
-37
test_neighbours.py
theano/sandbox/gpuarray/tests/test_neighbours.py
+2
-3
test_nerv.py
theano/sandbox/gpuarray/tests/test_nerv.py
+1
-1
test_nnet.py
theano/sandbox/gpuarray/tests/test_nnet.py
+1
-3
test_opt.py
theano/sandbox/gpuarray/tests/test_opt.py
+15
-16
test_scan.py
theano/sandbox/gpuarray/tests/test_scan.py
+5
-5
test_subtensor.py
theano/sandbox/gpuarray/tests/test_subtensor.py
+3
-4
type.py
theano/sandbox/gpuarray/type.py
+180
-23
rng_mrg.py
theano/sandbox/rng_mrg.py
+3
-0
scan_opt.py
theano/scan_module/scan_opt.py
+9
-4
test_scan.py
theano/scan_module/tests/test_scan.py
+6
-0
test_flake8.py
theano/tests/test_flake8.py
+0
-4
没有找到文件。
theano/__init__.py
浏览文件 @
4814cd99
...
@@ -112,7 +112,8 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
...
@@ -112,7 +112,8 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
if
(
config
.
device
.
startswith
(
'cuda'
)
or
if
(
config
.
device
.
startswith
(
'cuda'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
config
.
init_gpu_device
.
startswith
(
'opencl'
)):
config
.
init_gpu_device
.
startswith
(
'opencl'
)
or
config
.
contexts
!=
''
):
import
theano.sandbox.gpuarray
import
theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
# Use config.numpy to call numpy.seterr
...
...
theano/configdefaults.py
浏览文件 @
4814cd99
...
@@ -111,6 +111,29 @@ AddConfigVar(
...
@@ -111,6 +111,29 @@ AddConfigVar(
BoolParam
(
False
,
allow_override
=
False
),
BoolParam
(
False
,
allow_override
=
False
),
in_c_key
=
False
)
in_c_key
=
False
)
class
ContextsParam
(
ConfigParam
):
def
__init__
(
self
):
def
filter
(
val
):
if
val
==
''
:
return
val
for
v
in
val
.
split
(
';'
):
s
=
v
.
split
(
'->'
)
if
len
(
s
)
!=
2
:
raise
ValueError
(
"Malformed context map:
%
s"
%
(
v
,))
return
val
ConfigParam
.
__init__
(
self
,
''
,
filter
,
False
)
AddConfigVar
(
'contexts'
,
"""
Context map for multi-gpu operation. Format is a
semicolon-separated list of names and device names in the
'name->dev_name' format. An example that would map name 'test' to
device 'cuda0' and name 'test2' to device 'opencl0:0' follows:
"test->cuda0;test2->opencl0:0".
"""
,
ContextsParam
(),
in_c_key
=
False
)
AddConfigVar
(
AddConfigVar
(
'print_active_device'
,
'print_active_device'
,
"Print active device at when the GPU device is initialized."
,
"Print active device at when the GPU device is initialized."
,
...
...
theano/misc/check_multi_gpu.py
0 → 100644
浏览文件 @
4814cd99
#! /usr/bin/env python
"""
This file compare the runtime of two independent dot products on one
and two GPU to measure the speedup.
This should be 2x if the GPUs are equivalent.
"""
import
time
import
numpy
import
theano
from
theano.sandbox.gpuarray
import
init_dev
from
theano.sandbox.gpuarray.type
import
gpuarray_shared_constructor
as
shared
from
theano.sandbox.gpuarray.blas
import
gpu_dot22
def
main
(
dev1
,
dev2
):
init_dev
(
dev1
,
'ctx1'
)
init_dev
(
dev2
,
'ctx2'
)
val1a
=
shared
(
numpy
.
random
.
randn
(
1024
,
1024
)
.
astype
(
'float32'
),
context_name
=
'ctx1'
)
val1b
=
shared
(
numpy
.
random
.
randn
(
1024
,
1024
)
.
astype
(
'float32'
),
context_name
=
'ctx1'
)
val1c
=
shared
(
numpy
.
random
.
randn
(
1024
,
1024
)
.
astype
(
'float32'
),
context_name
=
'ctx1'
)
val1d
=
shared
(
numpy
.
random
.
randn
(
1024
,
1024
)
.
astype
(
'float32'
),
context_name
=
'ctx1'
)
val2a
=
shared
(
numpy
.
random
.
randn
(
1024
,
1024
)
.
astype
(
'float32'
),
context_name
=
'ctx2'
)
val2b
=
shared
(
numpy
.
random
.
randn
(
1024
,
1024
)
.
astype
(
'float32'
),
context_name
=
'ctx2'
)
f1
=
theano
.
function
([],
[
gpu_dot22
(
val1a
,
val1b
),
gpu_dot22
(
val1c
,
val1d
)])
f2
=
theano
.
function
([],
[
gpu_dot22
(
val1a
,
val1b
),
gpu_dot22
(
val2a
,
val2b
)])
r
=
f1
()
r
[
0
]
.
sync
(),
r
[
1
]
.
sync
()
r
=
None
t
=
time
.
time
()
r
=
f1
()
r
[
0
]
.
sync
(),
r
[
1
]
.
sync
()
t2
=
time
.
time
()
r
=
None
print
(
"one ctx
%
f"
%
(
t2
-
t
,))
r
=
f2
()
r
[
0
]
.
sync
(),
r
[
1
]
.
sync
()
r
=
None
t
=
time
.
time
()
r
=
f2
()
r
[
0
]
.
sync
(),
r
[
1
]
.
sync
()
t2
=
time
.
time
()
r
=
None
print
(
"two ctx
%
f"
%
(
t2
-
t
,))
if
__name__
==
'__main__'
:
import
sys
if
len
(
sys
.
argv
)
!=
3
:
raise
ValueError
(
"This script require two device names."
)
main
(
sys
.
argv
[
1
],
sys
.
argv
[
2
])
theano/sandbox/cuda/basic_ops.py
浏览文件 @
4814cd99
...
@@ -92,10 +92,7 @@ class HostFromGpu(GpuOp):
...
@@ -92,10 +92,7 @@ class HostFromGpu(GpuOp):
def
R_op
(
self
,
inputs
,
eval_points
):
def
R_op
(
self
,
inputs
,
eval_points
):
ev
,
=
eval_points
ev
,
=
eval_points
if
isinstance
(
ev
,
tensor
.
TensorType
):
return
self
(
ev
)
return
[
gpu_from_host
(
ev
)]
else
:
return
[
ev
]
def
infer_shape
(
self
,
node
,
xshp
):
def
infer_shape
(
self
,
node
,
xshp
):
return
xshp
return
xshp
...
@@ -155,10 +152,7 @@ class GpuFromHost(GpuOp):
...
@@ -155,10 +152,7 @@ class GpuFromHost(GpuOp):
def
R_op
(
self
,
inputs
,
eval_points
):
def
R_op
(
self
,
inputs
,
eval_points
):
ev
,
=
eval_points
ev
,
=
eval_points
if
isinstance
(
ev
,
CudaNdarrayType
):
self
(
ev
)
return
[
host_from_gpu
(
ev
)]
else
:
return
[
ev
]
def
infer_shape
(
self
,
node
,
xshp
):
def
infer_shape
(
self
,
node
,
xshp
):
return
xshp
return
xshp
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
4814cd99
...
@@ -2478,8 +2478,11 @@ def local_gpu_allocempty(node):
...
@@ -2478,8 +2478,11 @@ def local_gpu_allocempty(node):
return
False
return
False
def
typeInfer
(
node
):
return
typeConstructor
optdb
.
register
(
'gpu_scanOp_make_inplace'
,
optdb
.
register
(
'gpu_scanOp_make_inplace'
,
scan_opt
.
ScanInplaceOptimizer
(
type
Constructor
=
typeConstructo
r
,
scan_opt
.
ScanInplaceOptimizer
(
type
Infer
=
typeInfe
r
,
gpu_flag
=
True
),
gpu_flag
=
True
),
75
,
75
,
'gpu'
,
'gpu'
,
...
...
theano/sandbox/gpuarray/__init__.py
浏览文件 @
4814cd99
...
@@ -21,26 +21,30 @@ except ImportError:
...
@@ -21,26 +21,30 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu
# This is for documentation not to depend on the availability of pygpu
from
.type
import
(
GpuArrayType
,
GpuArrayVariable
,
GpuArrayConstant
,
from
.type
import
(
GpuArrayType
,
GpuArrayVariable
,
GpuArrayConstant
,
GpuArraySharedVariable
,
gpuarray_shared_constructor
)
GpuArraySharedVariable
,
gpuarray_shared_constructor
,
reg_context
)
from
.
import
opt
,
nerv
from
.
import
opt
,
nerv
def
init_dev
(
dev
):
def
init_dev
(
dev
,
name
=
None
):
if
pygpu
.
gpuarray
.
api_version
()
!=
(
-
10000
,
0
):
if
pygpu
.
gpuarray
.
api_version
()
!=
(
-
10000
,
0
):
raise
RuntimeError
(
"Wrong API version for gpuarray:"
,
raise
RuntimeError
(
"Wrong API version for gpuarray:"
,
pygpu
.
gpuarray
.
api_version
(),
pygpu
.
gpuarray
.
api_version
(),
"Make sure Theano and libgpuarray/pygpu "
"Make sure Theano and libgpuarray/pygpu "
"are in sync."
)
"are in sync."
)
global
pygpu_activated
global
pygpu_activated
context
=
pygpu
.
init
(
dev
)
if
dev
not
in
init_dev
.
devmap
:
pygpu
.
set_default_context
(
context
)
init_dev
.
devmap
[
dev
]
=
pygpu
.
init
(
dev
)
context
=
init_dev
.
devmap
[
dev
]
# This will map the context name to the real context object.
reg_context
(
name
,
context
)
pygpu_activated
=
True
pygpu_activated
=
True
if
config
.
print_active_device
:
if
config
.
print_active_device
:
print
(
"Using device
%
s:
%
s"
%
(
dev
,
context
.
devname
),
file
=
sys
.
stderr
)
print
(
"Mapped name
%
s to device
%
s:
%
s"
%
(
name
,
dev
,
context
.
devname
),
# remember the active device
file
=
sys
.
stderr
)
init_dev
.
device
=
dev
init_dev
.
device
=
None
# This maps things like 'cuda0' to the context object on that device.
init_dev
.
devmap
=
{}
if
pygpu
:
if
pygpu
:
try
:
try
:
...
@@ -52,11 +56,21 @@ if pygpu:
...
@@ -52,11 +56,21 @@ if pygpu:
optdb
.
add_tags
(
'gpuarray_opt'
,
'fast_run'
,
'fast_compile'
)
optdb
.
add_tags
(
'gpuarray_opt'
,
'fast_run'
,
'fast_compile'
)
elif
(
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
elif
(
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
config
.
init_gpu_device
.
startswith
(
'opencl'
)):
config
.
init_gpu_device
.
startswith
(
'opencl'
)):
if
config
.
device
!=
'cpu'
:
raise
ValueError
(
'you must set device=cpu to use init_gpu_device.'
)
if
config
.
contexts
!=
''
:
print
(
"Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want."
)
init_dev
(
config
.
init_gpu_device
)
init_dev
(
config
.
init_gpu_device
)
if
config
.
contexts
!=
''
:
for
n
,
d
in
(
c
.
split
(
'->'
)
for
c
in
config
.
contexts
.
split
(
';'
)):
init_dev
(
d
.
strip
(),
n
.
strip
())
import
theano.compile
theano
.
compile
.
shared_constructor
(
gpuarray_shared_constructor
)
optdb
.
add_tags
(
'gpuarray_opt'
,
'fast_run'
,
'fast_compile'
)
from
.basic_ops
import
(
GpuAlloc
,
GpuContiguous
,
GpuEye
,
GpuFromHost
,
from
.basic_ops
import
(
GpuAlloc
,
GpuContiguous
,
GpuEye
,
GpuFromHost
,
GpuJoin
,
GpuReshape
,
GpuSplit
,
HostFromGpu
)
GpuJoin
,
GpuReshape
,
GpuSplit
,
HostFromGpu
)
from
.basic_ops
import
host_from_gpu
,
gpu_from_h
ost
from
.basic_ops
import
host_from_gpu
,
GpuFromH
ost
from
.elemwise
import
GpuElemwise
from
.elemwise
import
GpuElemwise
from
.subtensor
import
(
GpuSubtensor
,
GpuIncSubtensor
,
from
.subtensor
import
(
GpuSubtensor
,
GpuIncSubtensor
,
GpuAdvancedIncSubtensor1
)
GpuAdvancedIncSubtensor1
)
...
@@ -67,5 +81,6 @@ else:
...
@@ -67,5 +81,6 @@ else:
if
(
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
if
(
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
config
.
init_gpu_device
.
startswith
(
'opencl'
)
or
config
.
init_gpu_device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'cuda'
)):
config
.
device
.
startswith
(
'cuda'
)
or
config
.
contexts
!=
''
):
error
(
"pygpu was configured but could not be imported"
,
exc_info
=
True
)
error
(
"pygpu was configured but could not be imported"
,
exc_info
=
True
)
theano/sandbox/gpuarray/basic_ops.py
浏览文件 @
4814cd99
...
@@ -9,7 +9,9 @@ from theano.tensor.basic import Alloc, Join, Split
...
@@ -9,7 +9,9 @@ from theano.tensor.basic import Alloc, Join, Split
from
theano.gof
import
HideC
from
theano.gof
import
HideC
from
theano.gof.utils
import
MethodNotDefined
from
theano.gof.utils
import
MethodNotDefined
from
theano.compat
import
PY3
from
collections
import
deque
from
six
import
string_types
from
six
import
string_types
from
six.moves
import
xrange
from
six.moves
import
xrange
...
@@ -19,27 +21,83 @@ try:
...
@@ -19,27 +21,83 @@ try:
except
ImportError
:
except
ImportError
:
pass
pass
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
,
GpuArrayConstant
,
gpu_context_type
,
get_context
from
.fp16_help
import
write_w
from
.fp16_help
import
write_w
def
as_gpuarray_variable
(
x
):
def
as_gpuarray_variable
(
x
,
context_name
):
if
getattr
(
x
,
'owner'
,
None
):
# If this is already some form of variable, try to avoid an extra transfer
if
isinstance
(
x
.
owner
.
op
,
HostFromGpu
):
if
isinstance
(
x
,
Variable
):
return
x
.
owner
.
inputs
[
0
]
while
True
:
elif
(
isinstance
(
x
.
owner
.
op
,
GpuFromHost
)
and
# If we are already a GpuArrayVariable in the right context
x
.
owner
.
inputs
[
0
]
.
owner
and
# then there is nothing to do.
isinstance
(
x
.
owner
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
)):
if
(
isinstance
(
x
.
type
,
GpuArrayType
)
and
return
x
.
owner
.
inputs
[
0
]
.
owner
.
inputs
[
0
]
x
.
type
.
context_name
==
context_name
):
return
x
# If x is the result of a transfer, try to dig through.
if
getattr
(
x
,
'owner'
,
None
):
if
isinstance
(
x
.
owner
.
op
,
HostFromGpu
):
x
=
x
.
owner
.
inputs
[
0
]
continue
if
isinstance
(
x
.
owner
.
op
,
GpuFromHost
):
x
=
x
.
owner
.
inputs
[
0
]
continue
if
isinstance
(
x
.
owner
.
op
,
GpuToGpu
):
x
=
x
.
owner
.
inputs
[
0
]
continue
# If none of the conditions where met, then continue with
# the rest of the body
break
# If we couldn't deal with transfers, then maybe it's a tensor
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
return
GpuFromHost
(
context_name
)(
x
)
# Try _as_GpuArrayVariable if possible
if
hasattr
(
x
,
'_as_GpuArrayVariable'
):
if
hasattr
(
x
,
'_as_GpuArrayVariable'
):
return
x
.
_as_GpuArrayVariable
()
return
x
.
_as_GpuArrayVariable
(
context_name
)
# TODO we need to have the cuda -> gpu path taken care of.
tensor_x
=
tensor
.
as_tensor_variable
(
x
)
# If it didn't work try for a constant
return
gpu_from_host
(
tensor_x
)
ctx
=
get_context
(
context_name
)
if
isinstance
(
x
,
gpuarray
.
GpuArray
):
if
x
.
context
.
ptr
!=
ctx
.
ptr
:
x
=
x
.
transfer
(
ctx
)
def
as_gpuarray
(
x
):
x
=
gpuarray
.
asarray
(
x
,
context
=
ctx
)
return
gpuarray
.
array
(
x
,
copy
=
False
)
bcast
=
[(
s
==
1
)
for
s
in
x
.
shape
]
return
GpuArrayConstant
(
GpuArrayType
(
dtype
=
x
.
dtype
,
broadcastable
=
bcast
,
context_name
=
context_name
),
x
)
def
infer_context_name
(
*
vars
):
"""
Infer the context name to use from the inputs given
"""
# We try to infer the closest context first
# TODO: What to do in case of context conflicts?
# We currently use a first found wins approach.
todo
=
deque
()
todo
.
extendleft
(
vars
)
while
todo
:
v
=
todo
.
pop
()
if
isinstance
(
v
.
type
,
GpuArrayType
):
return
v
.
type
.
context_name
if
hasattr
(
v
.
tag
,
'context_name'
):
return
v
.
tag
.
context_name
if
v
.
owner
:
if
isinstance
(
v
.
owner
.
op
,
HostFromGpu
):
return
v
.
owner
.
inputs
[
0
]
.
type
.
context_name
if
len
(
v
.
owner
.
inputs
)
==
1
:
todo
.
extendleft
(
v
.
owner
.
inputs
)
# If we can't find a context we infer None, which is the default
return
None
class
Kernel
(
object
):
class
Kernel
(
object
):
...
@@ -111,10 +169,12 @@ class Kernel(object):
...
@@ -111,10 +169,12 @@ class Kernel(object):
class
GpuKernelBase
(
object
):
class
GpuKernelBase
(
object
):
context_type
=
gpu_context_type
def
gpu_kernels
(
self
,
node
,
name
):
def
gpu_kernels
(
self
,
node
,
name
):
"""
"""
This is the method to override. This should return an iterable
of Kernel
This is the method to override. This should return an iterable
objects that describe the kernels this op will need.
o
f Kernel o
bjects that describe the kernels this op will need.
"""
"""
raise
MethodNotDefined
(
'gpu_kernels'
)
raise
MethodNotDefined
(
'gpu_kernels'
)
...
@@ -126,8 +186,9 @@ class GpuKernelBase(object):
...
@@ -126,8 +186,9 @@ class GpuKernelBase(object):
o
=
[]
o
=
[]
return
o
+
[
'gpuarray/types.h'
]
return
o
+
[
'gpuarray/types.h'
]
def
_generate_kernel_bin
(
self
,
k
):
def
_generate_kernel_bin
(
self
,
k
,
ctx
):
gk
=
gpuarray
.
GpuKernel
(
k
.
code
,
k
.
name
,
k
.
params
,
**
k
.
flags
)
gk
=
gpuarray
.
GpuKernel
(
k
.
code
,
k
.
name
,
k
.
params
,
context
=
ctx
,
**
k
.
flags
)
bin
=
gk
.
_binary
bin
=
gk
.
_binary
bcode
=
','
.
join
(
hex
(
ord
(
c
))
for
c
in
bin
)
bcode
=
','
.
join
(
hex
(
ord
(
c
))
for
c
in
bin
)
return
(
"""static const char
%(bname)
s[] = {
%(bcode)
s };"""
%
return
(
"""static const char
%(bname)
s[] = {
%(bcode)
s };"""
%
...
@@ -140,7 +201,7 @@ class GpuKernelBase(object):
...
@@ -140,7 +201,7 @@ class GpuKernelBase(object):
dict
(
cname
=
k
.
codevar
,
code
=
code
))
dict
(
cname
=
k
.
codevar
,
code
=
code
))
def
_generate_kernel_vars
(
self
,
k
):
def
_generate_kernel_vars
(
self
,
k
):
return
"""
static
GpuKernel
%(kname)
s;"""
%
dict
(
kname
=
k
.
objvar
)
return
"""GpuKernel
%(kname)
s;"""
%
dict
(
kname
=
k
.
objvar
)
def
c_support_code
(
self
):
def
c_support_code
(
self
):
return
"""
return
"""
...
@@ -153,46 +214,62 @@ class GpuKernelBase(object):
...
@@ -153,46 +214,62 @@ class GpuKernelBase(object):
def
c_support_code_apply
(
self
,
node
,
name
):
def
c_support_code_apply
(
self
,
node
,
name
):
kernels
=
self
.
gpu_kernels
(
node
,
name
)
kernels
=
self
.
gpu_kernels
(
node
,
name
)
bins
=
'
\n
'
.
join
(
self
.
_generate_kernel_bin
(
k
)
for
k
in
kernels
)
ctx
=
self
.
get_context
(
node
)
bins
=
'
\n
'
.
join
(
self
.
_generate_kernel_bin
(
k
,
ctx
)
for
k
in
kernels
)
codes
=
'
\n
'
.
join
(
self
.
_generate_kernel_code
(
k
)
for
k
in
kernels
)
codes
=
'
\n
'
.
join
(
self
.
_generate_kernel_code
(
k
)
for
k
in
kernels
)
vars
=
'
\n
'
.
join
(
self
.
_generate_kernel_vars
(
k
)
for
k
in
kernels
)
return
'
\n
'
.
join
([
bins
,
codes
])
return
'
\n
'
.
join
([
bins
,
codes
,
vars
])
def
_generate_kernel_init
(
self
,
k
,
err
):
def
c_support_code_struct
(
self
,
node
,
name
):
if
PY3
:
kernels
=
self
.
gpu_kernels
(
node
,
name
)
error_out
=
"NULL"
return
'
\n
'
.
join
(
self
.
_generate_kernel_vars
(
k
)
for
k
in
kernels
)
else
:
error_out
=
""
def
_generate_zeros
(
self
,
k
):
return
"""memset(&
%(v)
s, 0, sizeof(
%(v)
s));"""
%
dict
(
v
=
k
.
objvar
)
def
_generate_kernel_init
(
self
,
k
,
fail
,
ctx
):
return
"""{
return
"""{
int err;
int types[
%(numargs)
u] = {
%(types)
s};
int types[
%(numargs)
u] = {
%(types)
s};
const char *bcode =
%(bvar)
s;
const char *bcode =
%(bvar)
s;
size_t sz = sizeof(
%(bvar)
s);
size_t sz = sizeof(
%(bvar)
s);
PyGpuContextObject *c = pygpu_default_context();
if (GpuKernel_init(&
%(ovar)
s,
%(ctx)
s->ops,
%(ctx)
s->ctx, 1, &bcode, &sz,
if (GpuKernel_init(&
%(ovar)
s, c->ops, c->ctx, 1, &bcode, &sz, "
%(kname)
s",
"
%(kname)
s",
%(numargs)
u, types, GA_USE_BINARY, NULL)
%(numargs)
u, types, GA_USE_BINARY, NULL)
!= GA_NO_ERROR) {
!= GA_NO_ERROR) {
if ((
%(err)
s = GpuKernel_init(&
%(ovar)
s, c->ops, c->ctx, 1, &
%(cname)
s
,
if ((
err = GpuKernel_init(&
%(ovar)
s,
%(ctx)
s->ops,
%(ctx)
s->ctx, 1
,
NULL, "
%(kname)
s",
%(numargs)
u, types
,
&
%(cname)
s, NULL, "
%(kname)
s",
%(numargs)
u
,
%(flags)
s, NULL)) != GA_NO_ERROR) {
types,
%(flags)
s, NULL)) != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error
%%
d:
%%
s",
PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error
%%
d:
%%
s",
%(err)
s, Gpu_error(c->ops, c->ctx,
%(err)
s
));
err, Gpu_error(
%(ctx)
s->ops,
%(ctx)
s->ctx, err
));
return
%(error_out)
s;
%(fail)
s
}
}
}
}
}"""
%
dict
(
numargs
=
len
(
k
.
params
),
types
=
k
.
_get_c_types
(),
bvar
=
k
.
binvar
,
}"""
%
dict
(
numargs
=
len
(
k
.
params
),
types
=
k
.
_get_c_types
(),
bvar
=
k
.
binvar
,
ovar
=
k
.
objvar
,
kname
=
k
.
name
,
err
=
err
,
cname
=
k
.
codevar
,
ovar
=
k
.
objvar
,
kname
=
k
.
name
,
cname
=
k
.
codevar
,
flags
=
k
.
_get_c_flags
(),
error_out
=
error_out
)
flags
=
k
.
_get_c_flags
(),
fail
=
fail
,
ctx
=
ctx
)
def
c_init_code_struct
(
self
,
node
,
name
,
sub
):
ctx
=
sub
[
'context'
]
kernels
=
self
.
gpu_kernels
(
node
,
name
)
inits_0
=
'
\n
'
.
join
(
self
.
_generate_zeros
(
k
)
for
k
in
kernels
)
inits
=
'
\n
'
.
join
(
self
.
_generate_kernel_init
(
k
,
sub
[
'fail'
],
ctx
)
for
k
in
kernels
)
return
'
\n
'
.
join
([
inits_0
,
inits
])
def
c_init_code_apply
(
self
,
node
,
name
):
def
_generate_kernel_cleanup
(
self
,
k
):
err
=
'err_'
+
name
return
"GpuKernel_clear(&
%(ovar)
s);"
%
dict
(
ovar
=
k
.
objvar
)
def
c_cleanup_code_struct
(
self
,
node
,
name
):
kernels
=
self
.
gpu_kernels
(
node
,
name
)
kernels
=
self
.
gpu_kernels
(
node
,
name
)
inits
=
'
\n
'
.
join
(
self
.
_generate_kernel_init
(
k
,
err
)
for
k
in
kernels
)
cleanups
=
'
\n
'
.
join
(
self
.
_generate_kernel_cleanup
(
k
)
for
k
in
kernels
)
return
(
"int
%(err)
s;
\n
"
%
dict
(
err
=
err
))
+
init
s
return
cleanup
s
def
_GpuKernelBase_version
(
self
):
# This is a shorthand for if your op only has a fixed version
ctx
=
gpuarray
.
get_default_context
()
# You can reimplement it, but make sure to call kernel_version()
return
(
2
,
ctx
.
kind
,
ctx
.
devname
)
def
c_code_cache_version_apply
(
self
,
node
):
return
(
self
.
c_code_cache_version
(),
self
.
kernel_version
(
node
))
GpuKernelBase_version
=
property
(
_GpuKernelBase_version
)
def
kernel_version
(
self
,
node
):
return
(
3
,
node
.
get_context
()
.
bin_id
)
class
HostFromGpu
(
Op
):
class
HostFromGpu
(
Op
):
...
@@ -259,50 +336,52 @@ class HostFromGpu(Op):
...
@@ -259,50 +336,52 @@ class HostFromGpu(Op):
def
grad
(
self
,
inputs
,
grads
):
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
gz
,
=
grads
return
[
gpu_from_host
(
gz
)]
return
[
GpuFromHost
(
inputs
[
0
]
.
type
.
context_name
)
(
gz
)]
def
R_op
(
self
,
inputs
,
eval_points
):
def
R_op
(
self
,
inputs
,
eval_points
):
ev
,
=
eval_points
ev
,
=
eval_points
if
isinstance
(
ev
,
tensor
.
TensorType
):
return
self
(
ev
)
return
[
gpu_from_host
(
ev
)]
else
:
return
[
ev
]
def
infer_shape
(
self
,
node
,
xshp
):
def
infer_shape
(
self
,
node
,
xshp
):
return
xshp
return
xshp
host_from_gpu
=
HostFromGpu
()
host_from_gpu
=
HostFromGpu
()
class
GpuFromHost
(
Op
):
class
GpuFromHost
(
Op
):
__props__
=
()
__props__
=
(
'context_name'
,
)
_f16_ok
=
True
_f16_ok
=
True
context_type
=
gpu_context_type
def
__init__
(
self
,
context_name
):
self
.
context_name
=
context_name
def
__str__
(
self
):
def
__str__
(
self
):
return
'GpuFromHost
(gpuarray)'
return
'GpuFromHost
<
%
s>'
%
(
self
.
context_name
,)
def
make_node
(
self
,
x
):
def
make_node
(
self
,
x
):
if
not
isinstance
(
x
.
type
,
tensor
.
TensorType
):
if
not
isinstance
(
x
.
type
,
tensor
.
TensorType
):
raise
TypeError
(
x
)
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
broadcastable
=
x
.
broadcastable
,
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
broadcastable
=
x
.
broadcastable
,
context_name
=
self
.
context_name
,
dtype
=
x
.
dtype
)()])
dtype
=
x
.
dtype
)()])
def
perform
(
self
,
node
,
inp
,
out
):
def
get_context
(
self
,
node
):
return
get_context
(
self
.
context_name
)
def
perform
(
self
,
node
,
inp
,
out
,
ctx
):
x
,
=
inp
x
,
=
inp
z
,
=
out
z
,
=
out
z
[
0
]
=
gpuarray
.
array
(
x
)
z
[
0
]
=
gpuarray
.
array
(
x
,
context
=
ctx
)
def
grad
(
self
,
inputs
,
grads
):
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
gz
,
=
grads
return
[
host_from_gpu
(
as_gpuarray_variable
(
gz
))]
return
[
host_from_gpu
(
as_gpuarray_variable
(
gz
,
context_name
=
self
.
context_name
))]
def
R_op
(
self
,
inputs
,
eval_points
):
def
R_op
(
self
,
inputs
,
eval_points
):
ev
,
=
eval_points
ev
,
=
eval_points
if
isinstance
(
ev
,
GpuArrayType
):
return
self
(
ev
)
return
[
host_from_gpu
(
ev
)]
else
:
return
[
ev
]
def
infer_shape
(
self
,
node
,
xshp
):
def
infer_shape
(
self
,
node
,
xshp
):
return
xshp
return
xshp
...
@@ -319,19 +398,67 @@ class GpuFromHost(Op):
...
@@ -319,19 +398,67 @@ class GpuFromHost(Op):
PyArray_NDIM(
%(name)
s_tmp),
PyArray_NDIM(
%(name)
s_tmp),
(size_t *)PyArray_DIMS(
%(name)
s_tmp),
(size_t *)PyArray_DIMS(
%(name)
s_tmp),
(ssize_t *)PyArray_STRIDES(
%(name)
s_tmp),
(ssize_t *)PyArray_STRIDES(
%(name)
s_tmp),
pygpu_default_context()
,
%(ctx)
s
,
Py_None);
Py_None);
Py_DECREF(
%(name)
s_tmp);
Py_DECREF(
%(name)
s_tmp);
if (
%(out)
s == NULL)
if (
%(out)
s == NULL) {
%(fail)
s
%(fail)
s
"""
%
{
'name'
:
name
,
'inp'
:
inputs
[
0
],
}
"""
%
{
'name'
:
name
,
'inp'
:
inputs
[
0
],
'ctx'
:
sub
[
'context'
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
5
,)
return
(
7
,)
gpu_from_host
=
GpuFromHost
()
class
GpuToGpu
(
Op
):
__props__
=
(
'context_name'
,)
_f16_ok
=
True
context_type
=
gpu_context_type
def
__init__
(
self
,
context_name
):
self
.
context_name
=
context_name
def
__str__
(
self
):
return
'GpuToGpu<
%
s>'
%
(
self
.
context_name
,)
def
make_node
(
self
,
x
):
if
not
isinstance
(
x
.
type
,
GpuArrayType
):
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
broadcastable
=
x
.
broadcastable
,
context_name
=
self
.
context_name
,
dtype
=
x
.
dtype
)()])
def
get_context
(
self
,
node
):
return
get_context
(
self
.
context_name
)
def
perform
(
self
,
node
,
inp
,
out
,
ctx
):
x
,
=
inp
z
,
=
out
z
[
0
]
=
x
.
transfer
(
ctx
)
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
return
[
GpuToGpu
(
inputs
[
0
]
.
type
.
context_name
)(
gz
)]
def
R_op
(
self
,
inputs
,
eval_points
):
return
self
(
eval_points
[
0
])
def
infer_shape
(
self
,
node
,
xshp
):
return
xshp
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
Py_XDECREF(
%(out)
s);
%(out)
s = pygpu_transfer(
%(inp)
s,
%(ctx)
s, 0);
if (
%(out)
s == NULL) {
%(fail)
s
}
"""
%
{
'inp'
:
inputs
[
0
],
'ctx'
:
sub
[
'context'
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
return
(
0
,)
class
GpuAlloc
(
HideC
,
Alloc
):
class
GpuAlloc
(
HideC
,
Alloc
):
...
@@ -339,28 +466,35 @@ class GpuAlloc(HideC, Alloc):
...
@@ -339,28 +466,35 @@ class GpuAlloc(HideC, Alloc):
Parameters
Parameters
----------
----------
memset_0
context_name : str
The name of the context in which to allocate memory
memset_0 : bool
It's only an optimized version. True, it means the
It's only an optimized version. True, it means the
value is always 0, so the c code call memset as it is faster.
value is always 0, so the c code call memset as it is faster.
"""
"""
__props__
=
(
'memset_0'
,)
__props__
=
(
'memset_0'
,
'context_name'
)
_f16_ok
=
True
_f16_ok
=
True
context_type
=
gpu_context_type
def
__init__
(
self
,
memset_0
=
False
):
def
__init__
(
self
,
context_name
,
memset_0
=
False
):
self
.
context_name
=
context_name
self
.
memset_0
=
memset_0
self
.
memset_0
=
memset_0
def
get_context
(
self
,
node
):
return
get_context
(
self
.
context_name
)
def
__str__
(
self
):
def
__str__
(
self
):
# Hide the memset parameter when not used to prevent confusion.
# Hide the memset parameter when not used to prevent confusion.
if
self
.
memset_0
:
if
self
.
memset_0
:
s
=
"
%
s{memset_0=
%
s}"
%
(
self
.
__class__
.
__name__
,
self
.
memset_0
)
m
=
"{memset_0=True}"
else
:
else
:
s
=
self
.
__class__
.
__name__
m
=
""
return
s
return
"
%
s<
%
s>
%
s"
%
(
self
.
__class__
.
__name__
,
self
.
context_name
,
m
)
def
make_node
(
self
,
value
,
*
shape
):
def
make_node
(
self
,
value
,
*
shape
):
value
=
as_gpuarray_variable
(
value
)
value
=
as_gpuarray_variable
(
value
,
context_name
=
self
.
context_name
)
sh
,
bcast
=
self
.
validate_shape
(
shape
)
sh
,
bcast
=
self
.
validate_shape
(
shape
)
if
value
.
ndim
>
len
(
sh
):
if
value
.
ndim
>
len
(
sh
):
TypeError
(
"The GpuAlloc value to use has more dimensions "
TypeError
(
"The GpuAlloc value to use has more dimensions "
...
@@ -371,15 +505,15 @@ class GpuAlloc(HideC, Alloc):
...
@@ -371,15 +505,15 @@ class GpuAlloc(HideC, Alloc):
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
]
return
[
'<numpy_compat.h>'
]
def
perform
(
self
,
node
,
inputs
,
outs
):
def
perform
(
self
,
node
,
inputs
,
outs
,
ctx
):
out
,
=
outs
out
,
=
outs
v
=
inputs
[
0
]
v
=
inputs
[
0
]
sh
=
tuple
(
map
(
int
,
inputs
[
1
:]))
sh
=
tuple
(
map
(
int
,
inputs
[
1
:]))
if
out
[
0
]
is
None
or
out
[
0
]
.
shape
!=
sh
:
if
out
[
0
]
is
None
or
out
[
0
]
.
shape
!=
sh
:
if
self
.
memset_0
:
if
self
.
memset_0
:
out
[
0
]
=
gpuarray
.
zeros
(
sh
,
dtype
=
v
.
dtype
)
out
[
0
]
=
gpuarray
.
zeros
(
sh
,
dtype
=
v
.
dtype
,
context
=
ctx
)
else
:
else
:
out
[
0
]
=
gpuarray
.
empty
(
sh
,
dtype
=
v
.
dtype
)
out
[
0
]
=
gpuarray
.
empty
(
sh
,
dtype
=
v
.
dtype
,
context
=
ctx
)
out
[
0
][
...
]
=
v
out
[
0
][
...
]
=
v
else
:
else
:
out
[
0
][
...
]
=
v
out
[
0
][
...
]
=
v
...
@@ -414,7 +548,7 @@ class GpuAlloc(HideC, Alloc):
...
@@ -414,7 +548,7 @@ class GpuAlloc(HideC, Alloc):
Py_XDECREF(
%(zz)
s);
Py_XDECREF(
%(zz)
s);
%(zz)
s = pygpu_zeros(
%(ndim)
s,
%(name)
s_shape,
%(zz)
s = pygpu_zeros(
%(ndim)
s,
%(name)
s_shape,
%(vv)
s->ga.typecode, GA_C_ORDER,
%(vv)
s->ga.typecode, GA_C_ORDER,
pygpu_default_context()
, Py_None);
%(ctx)
s
, Py_None);
if (!
%(zz)
s) {
if (!
%(zz)
s) {
%(fail)
s
%(fail)
s
}
}
...
@@ -423,7 +557,7 @@ class GpuAlloc(HideC, Alloc):
...
@@ -423,7 +557,7 @@ class GpuAlloc(HideC, Alloc):
Py_XDECREF(
%(zz)
s);
Py_XDECREF(
%(zz)
s);
%(zz)
s = pygpu_empty(
%(ndim)
s,
%(name)
s_shape,
%(zz)
s = pygpu_empty(
%(ndim)
s,
%(name)
s_shape,
%(vv)
s->ga.typecode, GA_C_ORDER,
%(vv)
s->ga.typecode, GA_C_ORDER,
pygpu_default_context()
, Py_None);
%(ctx)
s
, Py_None);
if (!
%(zz)
s) {
if (!
%(zz)
s) {
%(fail)
s
%(fail)
s
}
}
...
@@ -434,9 +568,9 @@ class GpuAlloc(HideC, Alloc):
...
@@ -434,9 +568,9 @@ class GpuAlloc(HideC, Alloc):
if (err != GA_NO_ERROR)
if (err != GA_NO_ERROR)
{
{
PyErr_Format(PyExc_MemoryError,
PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting
%%
d
"
"GpuAlloc: Error memsetting
%%
llu
"
" element of device memory to 0.",
" element of device memory to 0.",
PyGpuArray_SIZE(
%(zz)
s));
(unsigned long long)
PyGpuArray_SIZE(
%(zz)
s));
%(fail)
s;
%(fail)
s;
}
}
}
}
...
@@ -446,7 +580,7 @@ class GpuAlloc(HideC, Alloc):
...
@@ -446,7 +580,7 @@ class GpuAlloc(HideC, Alloc):
%(fail)
s
%(fail)
s
}
}
}
}
"""
%
dict
(
name
=
name
,
ndim
=
ndim
,
zz
=
zz
,
vv
=
vv
,
"""
%
dict
(
name
=
name
,
ndim
=
ndim
,
zz
=
zz
,
vv
=
vv
,
ctx
=
sub
[
'context'
],
fail
=
sub
[
'fail'
],
memset_0
=
memset_0
)
fail
=
sub
[
'fail'
],
memset_0
=
memset_0
)
if
config
.
gpuarray
.
sync
:
if
config
.
gpuarray
.
sync
:
...
@@ -455,7 +589,7 @@ class GpuAlloc(HideC, Alloc):
...
@@ -455,7 +589,7 @@ class GpuAlloc(HideC, Alloc):
return
code
return
code
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
2
,)
return
(
3
,)
def
do_constant_folding
(
self
,
node
):
def
do_constant_folding
(
self
,
node
):
from
.
import
subtensor
,
blas
from
.
import
subtensor
,
blas
...
@@ -488,29 +622,32 @@ class GpuAlloc(HideC, Alloc):
...
@@ -488,29 +622,32 @@ class GpuAlloc(HideC, Alloc):
return
True
return
True
gpu_alloc
=
GpuAlloc
()
class
GpuAllocEmpty
(
HideC
,
Alloc
):
class
GpuAllocEmpty
(
HideC
,
Alloc
):
__props__
=
(
'dtype'
,)
__props__
=
(
'dtype'
,
'context_name'
)
_f16_ok
=
True
_f16_ok
=
True
context_type
=
gpu_context_type
def
__init__
(
self
,
dtype
):
def
__init__
(
self
,
dtype
,
context_name
):
self
.
dtype
=
dtype
self
.
dtype
=
dtype
self
.
context_name
=
context_name
def
get_context
(
self
,
node
):
return
get_context
(
self
.
context_name
)
def
make_node
(
self
,
*
shape
):
def
make_node
(
self
,
*
shape
):
sh
,
bcast
=
self
.
validate_shape
(
shape
)
sh
,
bcast
=
self
.
validate_shape
(
shape
)
output
=
GpuArrayType
(
dtype
=
self
.
dtype
,
broadcastable
=
bcast
)()
output
=
GpuArrayType
(
dtype
=
self
.
dtype
,
broadcastable
=
bcast
,
context_name
=
self
.
context_name
)()
output
.
tag
.
values_eq_approx
=
tensor
.
type
.
values_eq_approx_always_true
output
.
tag
.
values_eq_approx
=
tensor
.
type
.
values_eq_approx_always_true
# The outut can contain nan/inf.
# The outut can contain nan/inf.
output
.
type
.
filter_checks_isfinite
=
False
output
.
type
.
filter_checks_isfinite
=
False
return
Apply
(
self
,
sh
,
[
output
])
return
Apply
(
self
,
sh
,
[
output
])
def
perform
(
self
,
node
,
inputs
,
out_
):
def
perform
(
self
,
node
,
inputs
,
out_
,
ctx
):
out
=
out_
[
0
]
out
=
out_
[
0
]
sh
=
[
int
(
i
)
for
i
in
inputs
]
sh
=
[
int
(
i
)
for
i
in
inputs
]
if
out
[
0
]
is
None
or
out
[
0
]
.
shape
!=
sh
:
if
out
[
0
]
is
None
or
out
[
0
]
.
shape
!=
sh
:
out
[
0
]
=
pygpu
.
empty
(
sh
,
dtype
=
self
.
dtype
)
out
[
0
]
=
pygpu
.
empty
(
sh
,
dtype
=
self
.
dtype
,
context
=
ctx
)
# if out[0] is the right shape, we just return it
# if out[0] is the right shape, we just return it
def
c_headers
(
self
):
def
c_headers
(
self
):
...
@@ -536,16 +673,16 @@ shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
...
@@ -536,16 +673,16 @@ shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
code
.
append
(
"""
code
.
append
(
"""
if (theano_prep_output(&
%(zz)
s,
%(ndim)
s, shape,
%(type)
s, GA_C_ORDER,
if (theano_prep_output(&
%(zz)
s,
%(ndim)
s, shape,
%(type)
s, GA_C_ORDER,
pygpu_default_context()
)) {
%(ctx)
s
)) {
%(fail)
s
%(fail)
s
}
}
"""
%
dict
(
zz
=
zz
,
ndim
=
ndim
,
type
=
gpuarray
.
dtype_to_typecode
(
self
.
dtype
),
"""
%
dict
(
zz
=
zz
,
ndim
=
ndim
,
type
=
gpuarray
.
dtype_to_typecode
(
self
.
dtype
),
fail
=
fail
))
fail
=
fail
,
ctx
=
sub
[
'context'
]
))
return
''
.
join
(
code
)
return
''
.
join
(
code
)
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
0
,)
return
(
1
,)
def
do_constant_folding
(
self
,
node
):
def
do_constant_folding
(
self
,
node
):
return
False
return
False
...
@@ -559,7 +696,7 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
...
@@ -559,7 +696,7 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
def
empty_like
(
var
):
def
empty_like
(
var
):
return
GpuAllocEmpty
(
var
.
type
.
dtype
)(
*
var
.
shape
)
return
GpuAllocEmpty
(
var
.
type
.
dtype
,
var
.
type
.
context_name
)(
*
var
.
shape
)
class
GpuContiguous
(
Op
):
class
GpuContiguous
(
Op
):
...
@@ -568,7 +705,6 @@ class GpuContiguous(Op):
...
@@ -568,7 +705,6 @@ class GpuContiguous(Op):
not already c contiguous.
not already c contiguous.
"""
"""
__props__
=
()
__props__
=
()
view_map
=
{
0
:
[
0
]}
view_map
=
{
0
:
[
0
]}
_f16_ok
=
True
_f16_ok
=
True
...
@@ -576,12 +712,13 @@ class GpuContiguous(Op):
...
@@ -576,12 +712,13 @@ class GpuContiguous(Op):
def
grad
(
self
,
inputs
,
dout
):
def
grad
(
self
,
inputs
,
dout
):
x
,
=
inputs
x
,
=
inputs
dout
,
=
dout
dout
,
=
dout
dout
=
as_gpuarray_variable
(
dout
)
dout
=
as_gpuarray_variable
(
dout
,
context_name
=
infer_context_name
(
x
)
)
return
[
dout
]
return
[
dout
]
def
make_node
(
self
,
input
):
def
make_node
(
self
,
input
):
input
=
as_gpuarray_variable
(
input
)
input
=
as_gpuarray_variable
(
input
,
context_name
=
infer_context_name
(
input
))
return
Apply
(
self
,
[
input
],
[
input
.
type
()])
return
Apply
(
self
,
[
input
],
[
input
.
type
()])
def
c_headers
(
self
):
def
c_headers
(
self
):
...
@@ -633,10 +770,12 @@ class GpuReshape(HideC, tensor.Reshape):
...
@@ -633,10 +770,12 @@ class GpuReshape(HideC, tensor.Reshape):
# __hash__, __eq__, __str__ come from tensor.Reshape
# __hash__, __eq__, __str__ come from tensor.Reshape
def
make_node
(
self
,
x
,
shp
):
def
make_node
(
self
,
x
,
shp
):
x
=
as_gpuarray_variable
(
x
)
ctx_name
=
infer_context_name
(
x
)
x
=
as_gpuarray_variable
(
x
,
context_name
=
ctx_name
)
res
=
host_from_gpu
(
x
)
.
reshape
(
shp
,
ndim
=
self
.
ndim
)
res
=
host_from_gpu
(
x
)
.
reshape
(
shp
,
ndim
=
self
.
ndim
)
otype
=
GpuArrayType
(
dtype
=
res
.
dtype
,
otype
=
GpuArrayType
(
dtype
=
res
.
dtype
,
broadcastable
=
res
.
broadcastable
)
broadcastable
=
res
.
broadcastable
,
context_name
=
ctx_name
)
return
Apply
(
self
,
[
x
,
shp
],
[
otype
()])
return
Apply
(
self
,
[
x
,
shp
],
[
otype
()])
def
perform
(
self
,
node
,
inp
,
out_
):
def
perform
(
self
,
node
,
inp
,
out_
):
...
@@ -744,22 +883,30 @@ class GpuReshape(HideC, tensor.Reshape):
...
@@ -744,22 +883,30 @@ class GpuReshape(HideC, tensor.Reshape):
class
GpuJoin
(
HideC
,
Join
):
class
GpuJoin
(
HideC
,
Join
):
_f16_ok
=
True
_f16_ok
=
True
context_type
=
gpu_context_type
def
make_node
(
self
,
axis
,
*
tensors
):
def
make_node
(
self
,
axis
,
*
tensors
):
node
=
Join
.
make_node
(
self
,
axis
,
*
tensors
)
node
=
Join
.
make_node
(
self
,
axis
,
*
tensors
)
return
Apply
(
self
,
[
node
.
inputs
[
0
]]
+
list
(
map
(
as_gpuarray_variable
,
ctx_name
=
infer_context_name
(
*
tensors
)
tensors
)),
def
agv
(
v
):
return
as_gpuarray_variable
(
v
,
context_name
=
ctx_name
)
return
Apply
(
self
,
[
node
.
inputs
[
0
]]
+
list
(
map
(
agv
,
tensors
)),
[
GpuArrayType
(
broadcastable
=
node
.
outputs
[
0
]
.
broadcastable
,
[
GpuArrayType
(
broadcastable
=
node
.
outputs
[
0
]
.
broadcastable
,
dtype
=
node
.
outputs
[
0
]
.
dtype
)()])
dtype
=
node
.
outputs
[
0
]
.
dtype
,
context_name
=
ctx_name
)()])
def
get_context
(
self
,
node
):
return
node
.
outputs
[
0
]
.
type
.
context
def
perform
(
self
,
node
,
axis_and_tensors
,
out_
):
def
perform
(
self
,
node
,
axis_and_tensors
,
out_
,
ctx
):
out
,
=
out_
out
,
=
out_
axis
=
int
(
axis_and_tensors
[
0
])
axis
=
int
(
axis_and_tensors
[
0
])
tensors
=
axis_and_tensors
[
1
:]
tensors
=
axis_and_tensors
[
1
:]
out
[
0
]
=
pygpu
.
concatenate
(
tensors
,
axis
=
axis
)
.
astype
(
out
[
0
]
=
pygpu
.
concatenate
(
tensors
,
axis
=
axis
,
context
=
ctx
)
.
astype
(
node
.
outputs
[
0
]
.
dtype
)
node
.
outputs
[
0
]
.
dtype
)
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
...
@@ -793,15 +940,14 @@ if (axis < 0) {
...
@@ -793,15 +940,14 @@ if (axis < 0) {
}
}
%(out)
s = pygpu_concatenate(als,
%(n)
s, axis,
%(out)
s = pygpu_concatenate(als,
%(n)
s, axis,
%(restype)
s, (PyObject *)&PyGpuArrayType,
%(restype)
s, (PyObject *)&PyGpuArrayType,
pygpu_default_context()
);
%(ctx)
s
);
}
}
PyMem_Free(als);
PyMem_Free(als);
if (
%(out)
s == NULL)
if (
%(out)
s == NULL)
%(fail)
s
%(fail)
s
"""
%
dict
(
n
=
len
(
inputs
[
1
:]),
fail
=
sub
[
'fail'
],
out
=
out_
[
0
],
"""
%
dict
(
n
=
len
(
inputs
[
1
:]),
fail
=
sub
[
'fail'
],
out
=
out_
[
0
],
axis
=
inputs
[
0
],
copy_inputs_to_list
=
'
\n
'
.
join
(
copy_to_list
),
axis
=
inputs
[
0
],
copy_inputs_to_list
=
'
\n
'
.
join
(
copy_to_list
),
restype
=
restype
)
restype
=
restype
,
ctx
=
sub
[
'context'
])
gpu_join
=
GpuJoin
()
gpu_join
=
GpuJoin
()
...
@@ -809,21 +955,26 @@ gpu_join = GpuJoin()
...
@@ -809,21 +955,26 @@ gpu_join = GpuJoin()
class
GpuSplit
(
HideC
,
Split
):
class
GpuSplit
(
HideC
,
Split
):
def
make_node
(
self
,
x
,
axis
,
splits
):
def
make_node
(
self
,
x
,
axis
,
splits
):
node
=
Split
.
make_node
(
self
,
x
,
axis
,
splits
)
node
=
Split
.
make_node
(
self
,
x
,
axis
,
splits
)
x
=
as_gpuarray_variable
(
x
)
x
=
as_gpuarray_variable
(
x
,
infer_context_name
(
x
))
outs
=
[
GpuArrayType
(
dtype
=
o
.
dtype
,
broadcastable
=
o
.
broadcastable
)()
outs
=
[
GpuArrayType
(
dtype
=
o
.
dtype
,
broadcastable
=
o
.
broadcastable
,
context_name
=
x
.
type
.
context_name
)()
for
o
in
node
.
outputs
]
for
o
in
node
.
outputs
]
return
Apply
(
self
,
[
x
]
+
node
.
inputs
[
1
:],
outs
)
return
Apply
(
self
,
[
x
]
+
node
.
inputs
[
1
:],
outs
)
# we reuse the perform of the CPU op, which is suitable
# we reuse the perform of the CPU op, which is suitable
class
GpuEye
(
GpuKernelBase
,
Op
):
class
GpuEye
(
GpuKernelBase
,
Op
):
__props__
=
(
'dtype'
,)
__props__
=
(
'dtype'
,
'context_name'
)
_f16_ok
=
True
_f16_ok
=
True
def
__init__
(
self
,
dtype
=
None
):
def
__init__
(
self
,
dtype
=
None
,
context_name
=
None
):
if
dtype
is
None
:
if
dtype
is
None
:
dtype
=
config
.
floatX
dtype
=
config
.
floatX
self
.
dtype
=
dtype
self
.
dtype
=
dtype
self
.
context_name
=
context_name
def
get_context
(
self
,
node
):
return
get_context
(
self
.
context_name
)
def
make_node
(
self
,
n
,
m
,
k
):
def
make_node
(
self
,
n
,
m
,
k
):
n
=
tensor
.
as_tensor_variable
(
n
)
n
=
tensor
.
as_tensor_variable
(
n
)
...
@@ -833,7 +984,8 @@ class GpuEye(GpuKernelBase, Op):
...
@@ -833,7 +984,8 @@ class GpuEye(GpuKernelBase, Op):
assert
m
.
ndim
==
0
assert
m
.
ndim
==
0
assert
k
.
ndim
==
0
assert
k
.
ndim
==
0
otype
=
GpuArrayType
(
dtype
=
self
.
dtype
,
otype
=
GpuArrayType
(
dtype
=
self
.
dtype
,
broadcastable
=
(
False
,
False
))
broadcastable
=
(
False
,
False
),
context_name
=
self
.
context_name
)
# k != 0 isn't implemented on the GPU yet.
# k != 0 isn't implemented on the GPU yet.
assert
tensor
.
get_scalar_constant_value
(
k
)
==
0
assert
tensor
.
get_scalar_constant_value
(
k
)
==
0
...
@@ -866,6 +1018,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
...
@@ -866,6 +1018,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
n
,
m
=
inp
n
,
m
=
inp
z
,
=
out
z
,
=
out
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'context'
]
typecode
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
self
.
dtype
)
typecode
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
self
.
dtype
)
sync
=
bool
(
config
.
gpuarray
.
sync
)
sync
=
bool
(
config
.
gpuarray
.
sync
)
kname
=
self
.
gpu_kernels
(
node
,
name
)[
0
]
.
objvar
kname
=
self
.
gpu_kernels
(
node
,
name
)[
0
]
.
objvar
...
@@ -882,7 +1035,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
...
@@ -882,7 +1035,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
%(z)
s = pygpu_zeros(2, dims,
%(z)
s = pygpu_zeros(2, dims,
%(typecode)
s,
%(typecode)
s,
GA_C_ORDER,
GA_C_ORDER,
pygpu_default_context()
, Py_None);
%(ctx)
s
, Py_None);
if (
%(z)
s == NULL) {
if (
%(z)
s == NULL) {
%(fail)
s
%(fail)
s
}
}
...
@@ -908,4 +1061,4 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
...
@@ -908,4 +1061,4 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
return
s
return
s
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
4
,
self
.
GpuKernelBase_version
)
return
(
5
,
)
theano/sandbox/gpuarray/blas.py
浏览文件 @
4814cd99
import
os.path
import
os.path
from
theano
import
Apply
,
config
from
theano
import
Apply
,
config
,
Op
from
theano.compile
import
optdb
from
theano.compile
import
optdb
from
theano.gof
import
local_optimizer
,
LocalOptGroup
from
theano.gof
import
LocalOptGroup
from
theano.tensor.basic
import
as_tensor_variable
from
theano.tensor.basic
import
as_tensor_variable
from
theano.tensor.blas
import
Dot22
,
Gemv
,
Gemm
,
Ger
from
theano.tensor.opt
import
in2out
from
theano.tensor.opt
import
in2out
from
.basic_ops
import
HideC
,
as_gpuarray_variable
,
GpuAllocEmpty
from
.basic_ops
import
as_gpuarray_variable
,
infer_context_name
from
.opt_util
import
inplace_allocempty
try
:
try
:
import
pygpu
import
pygpu
...
@@ -18,7 +19,7 @@ except ImportError as e:
...
@@ -18,7 +19,7 @@ except ImportError as e:
pass
pass
class
BlasOp
(
HideC
):
class
BlasOp
(
Op
):
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'<blas_api.h>'
,
'<numpy_compat.h>'
,
'<gpuarray_helper.h>'
]
return
[
'<blas_api.h>'
,
'<numpy_compat.h>'
,
'<gpuarray_helper.h>'
]
...
@@ -28,34 +29,27 @@ class BlasOp(HideC):
...
@@ -28,34 +29,27 @@ class BlasOp(HideC):
def
c_init_code
(
self
):
def
c_init_code
(
self
):
return
[
'import_pygpu__blas();'
]
return
[
'import_pygpu__blas();'
]
def
c_support_code
(
self
):
return
"""
class
GpuGemv
(
BlasOp
):
PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out,
__props__
=
(
'inplace'
,)
PyGpuArrayObject *y) {
if (out &&
def
__init__
(
self
,
inplace
=
False
):
GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) &&
self
.
inplace
=
inplace
theano_size_check(out, PyGpuArray_NDIM(y),
if
self
.
inplace
:
PyGpuArray_DIMS(y),
self
.
destroy_map
=
{
0
:
[
0
]}
y->ga.typecode)) {
if (pygpu_move(out, y)) {
Py_XDECREF(out);
return NULL;
}
} else {
Py_XDECREF(out);
out = pygpu_copy(y, GA_ANY_ORDER);
}
return out;
}
"""
class
GpuGemv
(
BlasOp
,
Gemv
):
def
make_node
(
self
,
y
,
alpha
,
A
,
x
,
beta
):
def
make_node
(
self
,
y
,
alpha
,
A
,
x
,
beta
):
Gemv
.
make_node
(
self
,
y
,
alpha
,
A
,
x
,
beta
)
ctx_name
=
infer_context_name
(
y
,
A
,
x
)
A
=
as_gpuarray_variable
(
A
)
A
=
as_gpuarray_variable
(
A
,
ctx_name
)
x
=
as_gpuarray_variable
(
x
)
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
y
=
as_gpuarray_variable
(
y
)
y
=
as_gpuarray_variable
(
y
,
ctx_name
)
alpha
=
as_tensor_variable
(
alpha
)
beta
=
as_tensor_variable
(
beta
)
assert
alpha
.
ndim
==
0
assert
beta
.
ndim
==
0
assert
A
.
ndim
==
2
assert
x
.
ndim
==
1
assert
y
.
ndim
==
1
assert
A
.
dtype
==
x
.
dtype
==
y
.
dtype
assert
A
.
dtype
==
x
.
dtype
==
y
.
dtype
return
Apply
(
self
,
[
y
,
alpha
,
A
,
x
,
beta
],
[
y
.
type
()])
return
Apply
(
self
,
[
y
,
alpha
,
A
,
x
,
beta
],
[
y
.
type
()])
...
@@ -73,7 +67,7 @@ class GpuGemv(BlasOp, Gemv):
...
@@ -73,7 +67,7 @@ class GpuGemv(BlasOp, Gemv):
if
self
.
inplace
:
if
self
.
inplace
:
code
=
"""
code
=
"""
if (
%(y)
s->ga.strides[0] <= 0) {
if (
%(y)
s->ga.strides[0] <= 0) {
%(out)
s =
gpublas
_try_copy(
%(out)
s,
%(y)
s);
%(out)
s =
theano
_try_copy(
%(out)
s,
%(y)
s);
if (
%(out)
s == NULL) {
if (
%(out)
s == NULL) {
%(fail)
s
%(fail)
s
}
}
...
@@ -85,7 +79,7 @@ class GpuGemv(BlasOp, Gemv):
...
@@ -85,7 +79,7 @@ class GpuGemv(BlasOp, Gemv):
"""
%
vars
"""
%
vars
else
:
else
:
code
=
"""
code
=
"""
%(out)
s =
gpublas
_try_copy(
%(out)
s,
%(y)
s);
%(out)
s =
theano
_try_copy(
%(out)
s,
%(y)
s);
if (
%(out)
s == NULL) {
if (
%(out)
s == NULL) {
%(fail)
s
%(fail)
s
}
}
...
@@ -106,21 +100,33 @@ class GpuGemv(BlasOp, Gemv):
...
@@ -106,21 +100,33 @@ class GpuGemv(BlasOp, Gemv):
return
code
return
code
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
3
,)
return
(
4
,)
gpugemv_no_inplace
=
GpuGemv
(
inplace
=
False
)
gpugemv_no_inplace
=
GpuGemv
(
inplace
=
False
)
gpugemv_inplace
=
GpuGemv
(
inplace
=
True
)
gpugemv_inplace
=
GpuGemv
(
inplace
=
True
)
class
GpuGemm
(
BlasOp
,
Gemm
):
class
GpuGemm
(
BlasOp
):
__props__
=
(
'inplace'
,)
_f16_ok
=
True
_f16_ok
=
True
def
__init__
(
self
,
inplace
=
False
):
self
.
inplace
=
inplace
if
self
.
inplace
:
self
.
destroy_map
=
{
0
:
[
0
]}
def
make_node
(
self
,
C
,
alpha
,
A
,
B
,
beta
):
def
make_node
(
self
,
C
,
alpha
,
A
,
B
,
beta
):
ctx_name
=
infer_context_name
(
C
,
A
,
B
)
A
=
as_gpuarray_variable
(
A
,
ctx_name
)
B
=
as_gpuarray_variable
(
B
,
ctx_name
)
C
=
as_gpuarray_variable
(
C
,
ctx_name
)
alpha
=
as_tensor_variable
(
alpha
)
alpha
=
as_tensor_variable
(
alpha
)
beta
=
as_tensor_variable
(
beta
)
beta
=
as_tensor_variable
(
beta
)
A
=
as_gpuarray_variable
(
A
)
assert
alpha
.
ndim
==
0
B
=
as_gpuarray_variable
(
B
)
assert
beta
.
ndim
==
0
C
=
as_gpuarray_variable
(
C
)
assert
A
.
ndim
==
2
assert
B
.
ndim
==
2
assert
C
.
ndim
==
2
assert
A
.
dtype
==
B
.
dtype
==
C
.
dtype
assert
A
.
dtype
==
B
.
dtype
==
C
.
dtype
return
Apply
(
self
,
[
C
,
alpha
,
A
,
B
,
beta
],
[
C
.
type
()])
return
Apply
(
self
,
[
C
,
alpha
,
A
,
B
,
beta
],
[
C
.
type
()])
...
@@ -138,7 +144,7 @@ class GpuGemm(BlasOp, Gemm):
...
@@ -138,7 +144,7 @@ class GpuGemm(BlasOp, Gemm):
if
self
.
inplace
:
if
self
.
inplace
:
code
=
"""
code
=
"""
if (!GpuArray_ISONESEGMENT(&
%(C)
s->ga)) {
if (!GpuArray_ISONESEGMENT(&
%(C)
s->ga)) {
%(out)
s =
gpublas
_try_copy(
%(out)
s,
%(C)
s);
%(out)
s =
theano
_try_copy(
%(out)
s,
%(C)
s);
if (
%(out)
s == NULL) {
if (
%(out)
s == NULL) {
%(fail)
s
%(fail)
s
}
}
...
@@ -150,7 +156,7 @@ class GpuGemm(BlasOp, Gemm):
...
@@ -150,7 +156,7 @@ class GpuGemm(BlasOp, Gemm):
"""
%
vars
"""
%
vars
else
:
else
:
code
=
"""
code
=
"""
%(out)
s =
gpublas
_try_copy(
%(out)
s,
%(C)
s);
%(out)
s =
theano
_try_copy(
%(out)
s,
%(C)
s);
if (
%(out)
s == NULL) {
if (
%(out)
s == NULL) {
%(fail)
s
%(fail)
s
}
}
...
@@ -171,25 +177,36 @@ class GpuGemm(BlasOp, Gemm):
...
@@ -171,25 +177,36 @@ class GpuGemm(BlasOp, Gemm):
return
code
return
code
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
4
,)
return
(
5
,)
gpugemm_no_inplace
=
GpuGemm
(
inplace
=
False
)
gpugemm_no_inplace
=
GpuGemm
(
inplace
=
False
)
gpugemm_inplace
=
GpuGemm
(
inplace
=
True
)
gpugemm_inplace
=
GpuGemm
(
inplace
=
True
)
class
GpuGer
(
BlasOp
,
Ger
):
class
GpuGer
(
BlasOp
):
__props__
=
(
'inplace'
,)
def
__init__
(
self
,
inplace
=
False
):
self
.
inplace
=
inplace
if
self
.
inplace
:
self
.
destroy_map
=
{
0
:
[
0
]}
def
make_node
(
self
,
A
,
alpha
,
x
,
y
):
def
make_node
(
self
,
A
,
alpha
,
x
,
y
):
Ger
.
make_node
(
self
,
A
,
alpha
,
x
,
y
)
ctx_name
=
infer_context_name
(
A
,
x
,
y
)
A
=
as_gpuarray_variable
(
A
)
A
=
as_gpuarray_variable
(
A
,
ctx_name
)
x
=
as_gpuarray_variable
(
x
)
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
y
=
as_gpuarray_variable
(
y
)
y
=
as_gpuarray_variable
(
y
,
ctx_name
)
alpha
=
as_tensor_variable
(
alpha
)
assert
alpha
.
ndim
==
0
assert
A
.
ndim
==
2
assert
x
.
ndim
==
1
assert
y
.
ndim
==
1
assert
A
.
dtype
==
x
.
dtype
==
y
.
dtype
assert
A
.
dtype
==
x
.
dtype
==
y
.
dtype
return
Apply
(
self
,
[
A
,
alpha
,
x
,
y
],
[
A
.
type
()])
return
Apply
(
self
,
[
A
,
alpha
,
x
,
y
],
[
A
.
type
()])
def
perform
(
self
,
node
,
inp
,
out
):
def
perform
(
self
,
node
,
inp
,
out
):
A
,
alpha
,
x
,
y
=
inp
A
,
alpha
,
x
,
y
=
inp
inplace
=
self
.
destructiv
e
inplace
=
self
.
inplac
e
if
inplace
and
not
A
.
flags
.
forc
:
if
inplace
and
not
A
.
flags
.
forc
:
inplace
=
False
inplace
=
False
out
[
0
][
0
]
=
blas
.
ger
(
alpha
,
x
,
y
,
A
,
out
[
0
][
0
]
=
blas
.
ger
(
alpha
,
x
,
y
,
A
,
...
@@ -198,10 +215,10 @@ class GpuGer(BlasOp, Ger):
...
@@ -198,10 +215,10 @@ class GpuGer(BlasOp, Ger):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
vars
=
dict
(
out
=
out
[
0
],
A
=
inp
[
0
],
alpha
=
inp
[
1
],
x
=
inp
[
2
],
y
=
inp
[
3
],
vars
=
dict
(
out
=
out
[
0
],
A
=
inp
[
0
],
alpha
=
inp
[
1
],
x
=
inp
[
2
],
y
=
inp
[
3
],
fail
=
sub
[
'fail'
],
name
=
name
)
fail
=
sub
[
'fail'
],
name
=
name
)
if
self
.
destructiv
e
:
if
self
.
inplac
e
:
code
=
"""
code
=
"""
if (!GpuArray_ISONESEGMENT(&
%(A)
s->ga)) {
if (!GpuArray_ISONESEGMENT(&
%(A)
s->ga)) {
%(out)
s =
gpublas
_try_copy(
%(out)
s,
%(A)
s);
%(out)
s =
theano
_try_copy(
%(out)
s,
%(A)
s);
if (
%(out)
s == NULL) {
if (
%(out)
s == NULL) {
%(fail)
s
%(fail)
s
}
}
...
@@ -213,7 +230,7 @@ class GpuGer(BlasOp, Ger):
...
@@ -213,7 +230,7 @@ class GpuGer(BlasOp, Ger):
"""
%
vars
"""
%
vars
else
:
else
:
code
=
"""
code
=
"""
%(out)
s =
gpublas
_try_copy(
%(out)
s,
%(A)
s);
%(out)
s =
theano
_try_copy(
%(out)
s,
%(A)
s);
if (
%(out)
s == NULL) {
if (
%(out)
s == NULL) {
%(fail)
s
%(fail)
s
}
}
...
@@ -231,18 +248,22 @@ class GpuGer(BlasOp, Ger):
...
@@ -231,18 +248,22 @@ class GpuGer(BlasOp, Ger):
return
code
return
code
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
2
,)
return
(
3
,)
gpuger_no_inplace
=
GpuGer
(
inplace
=
False
)
gpuger_inplace
=
GpuGer
(
inplace
=
True
)
gpuger_no_inplace
=
GpuGer
(
destructive
=
False
)
gpuger_inplace
=
GpuGer
(
destructive
=
True
)
class
GpuDot22
(
BlasOp
):
__props__
=
()
class
GpuDot22
(
BlasOp
,
Dot22
):
def
make_node
(
self
,
x
,
y
):
def
make_node
(
self
,
x
,
y
):
Dot22
.
make_node
(
self
,
x
,
y
)
ctx_name
=
infer_context_name
(
x
,
y
)
x
=
as_gpuarray_variable
(
x
)
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
y
=
as_gpuarray_variable
(
y
)
y
=
as_gpuarray_variable
(
y
,
ctx_name
)
assert
x
.
ndim
==
2
assert
y
.
ndim
==
2
assert
x
.
dtype
==
y
.
dtype
assert
x
.
dtype
==
y
.
dtype
return
Apply
(
self
,
[
x
,
y
],
[
x
.
type
()])
return
Apply
(
self
,
[
x
,
y
],
[
x
.
type
()])
...
@@ -268,7 +289,7 @@ class GpuDot22(BlasOp, Dot22):
...
@@ -268,7 +289,7 @@ class GpuDot22(BlasOp, Dot22):
dims[1] = PyGpuArray_DIMS(
%(B)
s)[1];
dims[1] = PyGpuArray_DIMS(
%(B)
s)[1];
if (theano_prep_output(&
%(out)
s, 2, dims,
%(typecode)
s, GA_C_ORDER,
if (theano_prep_output(&
%(out)
s, 2, dims,
%(typecode)
s, GA_C_ORDER,
pygpu_default_context()
)) {
%(A)
s->context
)) {
%(fail)
s
%(fail)
s
}
}
...
@@ -287,32 +308,24 @@ class GpuDot22(BlasOp, Dot22):
...
@@ -287,32 +308,24 @@ class GpuDot22(BlasOp, Dot22):
return
code
return
code
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
3
,)
return
(
4
,)
gpu_dot22
=
GpuDot22
()
gpu_dot22
=
GpuDot22
()
@local_optimizer
([
gpugemv_no_inplace
],
inplace
=
True
)
@inplace_allocempty
(
GpuGemv
,
0
)
def
local_inplace_gpuagemv
(
node
):
def
local_inplace_gpuagemv
(
node
,
inputs
):
if
node
.
op
==
gpugemv_no_inplace
:
return
[
gpugemv_inplace
(
*
inputs
)]
return
[
gpugemv_inplace
(
*
node
.
inputs
)]
@local_optimizer
([
gpugemm_no_inplace
],
inplace
=
True
)
@inplace_allocempty
(
GpuGemm
,
0
)
def
local_inplace_gpuagemm
(
node
):
def
local_inplace_gpuagemm
(
node
,
inputs
):
if
node
.
op
==
gpugemm_no_inplace
:
return
[
gpugemm_inplace
(
*
inputs
)]
inputs
=
list
(
node
.
inputs
)
C
=
inputs
[
0
]
if
(
C
.
owner
and
isinstance
(
C
.
owner
.
op
,
GpuAllocEmpty
)
and
len
(
C
.
clients
)
>
1
):
inputs
[
0
]
=
C
.
owner
.
op
(
*
C
.
owner
.
inputs
)
return
[
gpugemm_inplace
(
*
inputs
)]
@local_optimizer
([
gpuger_no_inplace
],
inplace
=
True
)
@inplace_allocempty
(
GpuGer
,
0
)
def
local_inplace_gpuager
(
node
):
def
local_inplace_gpuager
(
node
,
inputs
):
if
node
.
op
==
gpuger_no_inplace
:
return
[
gpuger_inplace
(
*
inputs
)]
return
[
gpuger_inplace
(
*
node
.
inputs
)]
gpuablas_opt_inplace
=
in2out
(
LocalOptGroup
(
local_inplace_gpuagemv
,
gpuablas_opt_inplace
=
in2out
(
LocalOptGroup
(
local_inplace_gpuagemv
,
local_inplace_gpuagemm
,
local_inplace_gpuagemm
,
...
...
theano/sandbox/gpuarray/conv.cu
浏览文件 @
4814cd99
...
@@ -134,7 +134,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -134,7 +134,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const int out_size_byte = out_size*sizeof(float);
const int out_size_byte = out_size*sizeof(float);
if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received had %llu
d
columns!"
,
" %d kernel columns, but the kernel we received had %llu columns!",
THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]);
THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]);
return -1;
return -1;
}
}
...
@@ -217,13 +217,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -217,13 +217,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i, nb_split=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
],
nb_split
);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
"INFO: impl 'conv_patch' failed (%s),"
...
@@ -307,21 +300,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -307,21 +300,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR)
if (err == GA_NO_ERROR)
{
{
if
(
verbose
>
1
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
],
THEANO_KERN_WID
,
img_contiguous_2d
,
kern_contiguous_2d
,
nb_split
,
preload_full_kernel
,
(
unsigned
long
long
)
subsample_rows
,
(
unsigned
long
long
)
subsample_cols
);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i"
"INFO: used 'conv_patch_stack' version with nb_split=%i"
...
@@ -334,21 +312,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -334,21 +312,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
],
THEANO_KERN_WID
,
img_contiguous_2d
,
kern_contiguous_2d
,
nb_split
,
preload_full_kernel
,
(
unsigned
long
long
)
subsample_rows
,
(
unsigned
long
long
)
subsample_cols
);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
"INFO: impl 'conv_patch_stack' failed (%s),"
...
@@ -394,12 +357,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -394,12 +357,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
"INFO: impl 'conv_rows' failed (%s),"
...
@@ -428,19 +385,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -428,19 +385,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
if
(
0
)
fprintf
(
stderr
,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)
\n
"
,
img_contiguous_2d
,
kern_contiguous_2d
,
threads_per_block
[
0
],
threads_per_block
[
1
],
threads_per_block
[
2
],
n_blocks
[
0
],
n_blocks
[
1
],
n_blocks
[
2
]);
GpuKernel *k = NULL;
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) {
if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n");
k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
} else {
} else {
//fprintf(stderr, "using true version\n");
k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
}
}
...
@@ -460,23 +408,11 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -460,23 +408,11 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR)
if (err == GA_NO_ERROR)
{
{
work_complete = true;
work_complete = true;
if
(
verbose
>
1
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]);
if (verbose)
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
"INFO: impl 'conv_rows_stack' failed (%s),"
...
@@ -543,12 +479,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -543,12 +479,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR)
if (err == GA_NO_ERROR)
{
{
work_complete = true;
work_complete = true;
if
(
verbose
>
1
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
"INFO: used 'conv_rows_stack2' version %s with"
...
@@ -558,12 +488,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -558,12 +488,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i version=%d
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
],(
version
==
9
?
2
:
3
));
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
"INFO: impl 'conv_rows_stack2' failed (%s),"
...
@@ -680,13 +604,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -680,13 +604,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR)
if (err == GA_NO_ERROR)
{
{
if
(
verbose
>
1
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i, "
"n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i,"
" nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
threads_per_block
[
2
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
*
threads_per_block
[
2
]);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version"
"INFO: used 'conv_patch_stack_reduce' version"
...
@@ -697,14 +614,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -697,14 +614,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i,"
" nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
threads_per_block
[
2
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
*
threads_per_block
[
2
]);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
...
@@ -714,7 +623,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -714,7 +623,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} // else no good nb_splits was found
} // else no good nb_splits was found
}
}
if
(
1
&&
(
version
==
6
||
version
==-
1
)
&&
if ((version==6||version==-1) &&
kern_len<=320 &&
kern_len<=320 &&
!work_complete) //conv_valid_row_reduce
!work_complete) //conv_valid_row_reduce
{
{
...
@@ -782,12 +691,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -782,12 +691,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i,"
" shmem_sz=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_reduce_buf
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
...
@@ -805,43 +708,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
...
@@ -805,43 +708,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
(size_t)256),
(size_t)256),
(size_t)1, (size_t)1};
(size_t)1, (size_t)1};
if
(
1
)
if (verbose)
{
fprintf(stderr, "INFO: launching conv_reference_valid\n");
if
(
verbose
)
fprintf
(
stderr
,
"INFO: launching conv_reference_valid
\n
"
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" img : %i %llu %i %i %p "
"%lld %lld %lld %lld
\n
"
,
nbatch
,
(
unsigned
long
long
)
stack_len
,
img_len
,
img_wid
,
(
void
*
)(
cuda_get_ptr
(
img
->
ga
.
data
)
+
img
->
ga
.
offset
),
(
long
long
)
img_stride_batch
,
(
long
long
)
img_stride_stack
,
(
long
long
)
img_stride_row
,
(
long
long
)
img_stride_col
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" kern: %i %i %i %i %p "
"%lld %lld %lld %lld
\n
"
,
nkern
,
nstack
,
kern_len
,
kern_wid
,
(
void
*
)(
cuda_get_ptr
(
kern
->
ga
.
data
)
+
kern
->
ga
.
offset
),
(
long
long
)
kern_stride_nkern
,
(
long
long
)
kern_stride_stack
,
(
long
long
)
kern_stride_row
,
(
long
long
)
kern_stride_col
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" out : %llu %llu %i %i %p "
"%lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
0
],
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
1
],
out_len
,
out_wid
,
(
void
*
)(
cuda_get_ptr
(
out
->
ga
.
data
)
+
out
->
ga
.
offset
),
(
long
long
)
out_stride_batch
,
(
long
long
)
out_stride_nkern
,
(
long
long
)
out_stride_row
,
(
long
long
)
out_stride_col
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" launch params: %i %i %i
\n
"
,
outsize
,
n_blocks
[
0
],
threads_per_block
[
0
]);
}
void *kernel_params[] = {
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
...
@@ -1113,15 +981,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
...
@@ -1113,15 +981,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
if (err == GA_NO_ERROR)
if (err == GA_NO_ERROR)
{
{
if
(
verbose
>
1
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
threads_per_block
[
2
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
*
threads_per_block
[
2
],
out_len
,
nb_split
,
version
);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: used 'conv_full_patch_stack_padded'"
"INFO: used 'conv_full_patch_stack_padded'"
...
@@ -1131,15 +990,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
...
@@ -1131,15 +990,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
threads_per_block
[
2
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
*
threads_per_block
[
2
],
out_len
,
nb_split
,
version
);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: impl 'conv_full_patch_stack_padded' %s %s"
"INFO: impl 'conv_full_patch_stack_padded' %s %s"
...
@@ -1179,12 +1029,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
...
@@ -1179,12 +1029,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]);
if (verbose)
if (verbose)
fprintf(stderr,
fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s),"
"INFO: impl 'conv_full_patch' failed (%s),"
...
@@ -1225,12 +1069,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
...
@@ -1225,12 +1069,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]);
if (verbose)
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n",
" failed (%s), trying next implementation\n",
...
@@ -1276,12 +1114,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
...
@@ -1276,12 +1114,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]);
if (verbose)
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
GpuKernel_error(k, err));
GpuKernel_error(k, err));
...
@@ -1298,55 +1130,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
...
@@ -1298,55 +1130,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
(size_t)256),
(size_t)256),
(size_t)1, (size_t)1};
(size_t)1, (size_t)1};
if
(
0
)
{
if
(
verbose
)
fprintf
(
stderr
,
"INFO: launching conv_reference_valid
\n
"
);
if
(
verbose
)
fprintf
(
stderr
,
" img : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
nbatch
,
(
unsigned
long
long
)
stack_len
,
(
unsigned
long
long
)
img_len
,
(
unsigned
long
long
)
img_wid
,
(
void
*
)(
cuda_get_ptr
(
img
->
ga
.
data
)
+
img
->
ga
.
offset
),
(
long
long
)
img_stride_batch
,
(
long
long
)
img_stride_stack
,
(
long
long
)
img_stride_row
,
(
long
long
)
img_stride_col
);
if
(
verbose
)
fprintf
(
stderr
,
" kern: %llu %llu %llu %llu %p "
"%lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
nkern
,
(
unsigned
long
long
)
nstack
,
(
unsigned
long
long
)
kern_len
,
(
unsigned
long
long
)
kern_wid
,
(
void
*
)(
cuda_get_ptr
(
kern
->
ga
.
data
)
+
kern
->
ga
.
offset
),
(
long
long
)
kern_stride_nkern
,
(
long
long
)
kern_stride_stack
,
(
long
long
)
kern_stride_row
,
(
long
long
)
kern_stride_col
);
if
(
verbose
)
fprintf
(
stderr
,
" out : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
0
],
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
1
],
(
unsigned
long
long
)
out_len
,
(
unsigned
long
long
)
out_wid
,
(
void
*
)(
cuda_get_ptr
(
out
->
ga
.
data
)
+
out
->
ga
.
offset
),
(
long
long
)
out_stride_batch
,
(
long
long
)
out_stride_nkern
,
(
long
long
)
out_stride_row
,
(
long
long
)
out_stride_col
);
if
(
verbose
)
fprintf
(
stderr
,
" launch params: %i %i %i
\n
"
,
outsize
,
n_blocks
[
0
],
threads_per_block
[
0
]);
if
(
verbose
)
fprintf
(
stderr
,
" subsample params: %llu %llu
\n
"
,
(
unsigned
long
long
)
subsample_rows
,
(
unsigned
long
long
)
subsample_cols
);
}
void *kernel_params[] = {
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&img_len, (void *)&img_wid,
...
@@ -1377,11 +1160,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
...
@@ -1377,11 +1160,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
}
}
else
else
{
{
if
(
verbose
)
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
1
,
n_blocks
[
0
],
1
,
0
,
threads_per_block
[
0
]);
if (verbose)
if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n",
" trying next implementation\n",
...
@@ -1465,7 +1243,7 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
...
@@ -1465,7 +1243,7 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
rval = pygpu_zeros(4, out_dim,
rval = pygpu_zeros(4, out_dim,
img->ga.typecode, GA_C_ORDER,
img->ga.typecode, GA_C_ORDER,
pygpu_default_context
()
,
Py_None
);
img->context
, Py_None);
//rval might be null
//rval might be null
}
}
if ((rval==NULL)
if ((rval==NULL)
...
@@ -1488,14 +1266,3 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
...
@@ -1488,14 +1266,3 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
}
}
return (PyObject*)rval;
return (PyObject*)rval;
}
}
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
theano/sandbox/gpuarray/conv.py
浏览文件 @
4814cd99
import
copy
import
copy
import
os
import
os
import
theano
from
theano
import
gof
from
theano
import
config
,
gof
try
:
try
:
from
pygpu
import
gpuarray
from
pygpu
import
gpuarray
...
@@ -10,7 +9,8 @@ except ImportError:
...
@@ -10,7 +9,8 @@ except ImportError:
pass
pass
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
from
.basic_ops
import
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
infer_context_name
)
from
theano.gof
import
utils
from
theano.gof
import
utils
...
@@ -58,6 +58,9 @@ class GpuConv(GpuKernelBase, gof.Op):
...
@@ -58,6 +58,9 @@ class GpuConv(GpuKernelBase, gof.Op):
them.
them.
"""
"""
__props__
=
(
'border_mode'
,
'subsample'
,
'logical_img_hw'
,
'logical_kern_hw'
,
'logical_kern_align_top'
,
'version'
,
'verbose'
,
'kshp'
,
'imshp'
,
'max_threads_dim0'
)
@staticmethod
@staticmethod
def
logical_output_shape_2d
(
imshp
,
kshp
,
mode
):
def
logical_output_shape_2d
(
imshp
,
kshp
,
mode
):
...
@@ -67,20 +70,13 @@ class GpuConv(GpuKernelBase, gof.Op):
...
@@ -67,20 +70,13 @@ class GpuConv(GpuKernelBase, gof.Op):
return
imshp
[
0
]
+
kshp
[
0
]
-
1
,
imshp
[
1
]
+
kshp
[
1
]
-
1
return
imshp
[
0
]
+
kshp
[
0
]
-
1
,
imshp
[
1
]
+
kshp
[
1
]
-
1
raise
ValueError
(
mode
)
raise
ValueError
(
mode
)
def
__init__
(
self
,
border_mode
,
def
__init__
(
self
,
border_mode
,
subsample
=
(
1
,
1
),
subsample
=
(
1
,
1
),
logical_img_hw
=
None
,
logical_kern_hw
=
None
,
logical_img_hw
=
None
,
logical_kern_hw
=
None
,
logical_kern_align_top
=
True
,
logical_kern_align_top
=
True
,
version
=-
1
,
version
=-
1
,
direction_hint
=
None
,
direction_hint
=
None
,
verbose
=
0
,
kshp
=
None
,
imshp
=
None
,
verbose
=
0
,
kshp
=
None
,
imshp
=
None
,
max_threads_dim0
=
None
,
max_threads_dim0
=
None
,
nkern
=
None
,
nkern
=
None
,
bsize
=
None
,
fft_opt
=
True
):
bsize
=
None
,
fft_opt
=
True
):
self
.
border_mode
=
border_mode
self
.
border_mode
=
border_mode
self
.
subsample
=
subsample
self
.
subsample
=
subsample
if
logical_img_hw
is
not
None
:
if
logical_img_hw
is
not
None
:
...
@@ -108,19 +104,6 @@ class GpuConv(GpuKernelBase, gof.Op):
...
@@ -108,19 +104,6 @@ class GpuConv(GpuKernelBase, gof.Op):
self
.
bsize
=
bsize
self
.
bsize
=
bsize
self
.
fft_opt
=
fft_opt
self
.
fft_opt
=
fft_opt
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
\
and
self
.
border_mode
==
other
.
border_mode
\
and
self
.
subsample
==
other
.
subsample
\
and
self
.
logical_img_hw
==
other
.
logical_img_hw
\
and
self
.
logical_kern_hw
==
other
.
logical_kern_hw
\
and
self
.
logical_kern_align_top
==
other
.
logical_kern_align_top
\
and
self
.
version
==
other
.
version
\
and
self
.
verbose
==
other
.
verbose
\
and
self
.
kshp
==
other
.
kshp
\
and
self
.
imshp
==
other
.
imshp
\
and
self
.
max_threads_dim0
==
other
.
max_threads_dim0
def
__setstate__
(
self
,
d
):
def
__setstate__
(
self
,
d
):
self
.
__dict__
.
update
(
d
)
self
.
__dict__
.
update
(
d
)
if
not
hasattr
(
self
,
"imshp"
):
if
not
hasattr
(
self
,
"imshp"
):
...
@@ -136,32 +119,6 @@ class GpuConv(GpuKernelBase, gof.Op):
...
@@ -136,32 +119,6 @@ class GpuConv(GpuKernelBase, gof.Op):
if
not
hasattr
(
self
,
"fft_opt"
):
if
not
hasattr
(
self
,
"fft_opt"
):
self
.
fft_opt
=
True
self
.
fft_opt
=
True
def
__hash__
(
self
):
# don't use hash(self.version) as hash(-1)==-2 and
# hash(-2)==-2 in python!
return
hash
(
type
(
self
))
\
^
hash
(
self
.
border_mode
)
\
^
hash
(
self
.
subsample
)
\
^
hash
(
self
.
logical_img_hw
)
\
^
hash
(
self
.
logical_kern_hw
)
\
^
hash
(
self
.
logical_kern_align_top
)
\
^
self
.
version
\
^
hash
(
self
.
verbose
)
\
^
hash
(
self
.
kshp
)
\
^
hash
(
self
.
imshp
)
\
^
hash
(
self
.
max_threads_dim0
)
def
__str__
(
self
):
return
'
%
s{
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s}'
%
(
self
.
__class__
.
__name__
,
self
.
border_mode
,
str
(
self
.
subsample
),
str
(
self
.
logical_img_hw
),
str
(
self
.
logical_kern_hw
),
str
(
self
.
logical_kern_align_top
),
str
(
self
.
imshp
),
str
(
self
.
kshp
))
def
make_node
(
self
,
img
,
kern
):
def
make_node
(
self
,
img
,
kern
):
if
img
.
dtype
!=
"float32"
or
kern
.
dtype
!=
"float32"
:
if
img
.
dtype
!=
"float32"
or
kern
.
dtype
!=
"float32"
:
raise
NotImplementedError
(
"GpuConv currently only work"
raise
NotImplementedError
(
"GpuConv currently only work"
...
@@ -170,13 +127,17 @@ class GpuConv(GpuKernelBase, gof.Op):
...
@@ -170,13 +127,17 @@ class GpuConv(GpuKernelBase, gof.Op):
raise
TypeError
(
'img must be 4D tensor'
)
raise
TypeError
(
'img must be 4D tensor'
)
if
kern
.
type
.
ndim
!=
4
:
if
kern
.
type
.
ndim
!=
4
:
raise
TypeError
(
'kern must be 4D tensor'
)
raise
TypeError
(
'kern must be 4D tensor'
)
img
=
as_gpuarray_variable
(
img
)
ctx_name
=
infer_context_name
(
img
,
kern
)
kern
=
as_gpuarray_variable
(
kern
)
img
=
as_gpuarray_variable
(
img
,
ctx_name
)
kern
=
as_gpuarray_variable
(
kern
,
ctx_name
)
broadcastable
=
[
img
.
type
.
broadcastable
[
0
],
kern
.
type
.
broadcastable
[
0
],
broadcastable
=
[
img
.
type
.
broadcastable
[
0
],
kern
.
type
.
broadcastable
[
0
],
False
,
False
]
False
,
False
]
out
=
GpuArrayType
(
img
.
dtype
,
broadcastable
)()
out
=
GpuArrayType
(
img
.
dtype
,
broadcastable
,
context_name
=
ctx_name
)()
return
gof
.
Apply
(
self
,
[
img
,
kern
],
[
out
])
return
gof
.
Apply
(
self
,
[
img
,
kern
],
[
out
])
def
get_context
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
flops
(
self
,
inputs
,
outputs
):
def
flops
(
self
,
inputs
,
outputs
):
"""
"""
Useful with the hack in profilemode to print the MFlops.
Useful with the hack in profilemode to print the MFlops.
...
@@ -202,22 +163,8 @@ class GpuConv(GpuKernelBase, gof.Op):
...
@@ -202,22 +163,8 @@ class GpuConv(GpuKernelBase, gof.Op):
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
node_
=
copy
.
copy
(
node
)
node_
=
copy
.
copy
(
node
)
assert
node
.
op
is
node_
.
op
assert
node
.
op
is
node_
.
op
if
config
.
gpuarray
.
sync
:
raise
NotImplementedError
(
"GpuConv do not implement gpuarray.sync Theano flag"
)
if
node_
.
op
.
max_threads_dim0
is
None
:
if
node_
.
op
.
max_threads_dim0
is
None
:
cuda
=
theano
.
sandbox
.
cuda
node_
.
op
.
max_threads_dim0
=
node_
.
inputs
[
0
]
.
type
.
context
.
maxlsize
device_id
=
cuda
.
use
.
device_number
if
device_id
is
None
:
cuda
.
use
(
"gpu"
,
force
=
False
,
default_to_move_computation_to_gpu
=
False
,
move_shared_float32_to_gpu
=
False
,
enable_cuda
=
False
,
test_driver
=
True
)
device_id
=
cuda
.
use
.
device_number
cuda_ndarray
=
theano
.
sandbox
.
cuda
.
cuda_ndarray
.
cuda_ndarray
prop
=
cuda_ndarray
.
device_properties
(
device_id
)
node_
.
op
.
max_threads_dim0
=
prop
[
'maxThreadsDim0'
]
return
super
(
GpuConv
,
node_
.
op
)
.
make_thunk
(
node_
,
storage_map
,
return
super
(
GpuConv
,
node_
.
op
)
.
make_thunk
(
node_
,
storage_map
,
compute_map
,
no_recycling
)
compute_map
,
no_recycling
)
...
@@ -232,9 +179,11 @@ class GpuConv(GpuKernelBase, gof.Op):
...
@@ -232,9 +179,11 @@ class GpuConv(GpuKernelBase, gof.Op):
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
# raise this whenever modifying any of the support_code_files
# raise this whenever modifying any of the support_code_files
return
(
0
,
2
2
)
return
(
0
,
2
3
)
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
"cuda"
:
raise
NotImplementedError
(
"GpuConv only works for cuda devices"
)
img
,
kern
=
inp
img
,
kern
=
inp
out
,
=
out_
out
,
=
out_
dx
=
self
.
subsample
[
0
]
dx
=
self
.
subsample
[
0
]
...
@@ -302,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
...
@@ -302,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
"""
%
locals
()
"""
%
locals
()
code
+=
"
\n
"
.
join
([
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
f
))
.
read
()
code
+=
"
\n
"
.
join
([
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
f
))
.
read
()
for
f
in
[
"conv_kernel.cu"
,
"conv_full_kernel.cu"
]])
for
f
in
[
"conv_kernel.cu"
,
"conv_full_kernel.cu"
]])
kname
=
"conv_full_load_everything"
gk
=
gpuarray
.
GpuKernel
(
code
,
k
.
name
,
k
.
params
,
**
k
.
flags
)
gk
=
gpuarray
.
GpuKernel
(
code
,
k
.
name
,
k
.
params
,
**
k
.
flags
)
bin
=
gk
.
_binary
bin
=
gk
.
_binary
bcode
=
','
.
join
(
hex
(
ord
(
c
))
for
c
in
bin
)
bcode
=
','
.
join
(
hex
(
ord
(
c
))
for
c
in
bin
)
...
@@ -313,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
...
@@ -313,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
static const char conv_bcode[] = {
%(bcode)
s};
static const char conv_bcode[] = {
%(bcode)
s};
static const char *conv_code = "
%(code)
s";
static const char *conv_code = "
%(code)
s";
"""
%
locals
()
"""
%
locals
()
for
k
in
kernels
:
return
mod
mod
+=
"static GpuKernel "
+
k
.
name
+
'_'
+
name
+
";
\n
"
mod
+=
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
"conv.cu"
))
.
read
()
def
c_support_code_struct
(
self
,
node
,
name
):
mod
=
GpuKernelBase
.
c_support_code_struct
(
self
,
node
,
name
)
with
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
"conv.cu"
))
as
f
:
mod
+=
f
.
read
()
return
mod
return
mod
@utils.memoize
@utils.memoize
...
...
theano/sandbox/gpuarray/conv_kernel.cu
浏览文件 @
4814cd99
...
@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
...
@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit.
//bit, it is 32 bit.
const
uintptr
_t
COALESCED_ALIGN
=
0xFFFFFFFFFFFFFF00
;
// zero-out the trailing bits of pointers
const
size
_t
COALESCED_ALIGN
=
0xFFFFFFFFFFFFFF00
;
// zero-out the trailing bits of pointers
__device__
void
load_to_shared
(
float
*
dst
,
const
float
*
src
,
const
int
thread_id
,
int
nb_thread
,
const
int
N
,
const
bool
flipped
=
false
){
__device__
void
load_to_shared
(
float
*
dst
,
const
float
*
src
,
const
int
thread_id
,
int
nb_thread
,
const
int
N
,
const
bool
flipped
=
false
){
if
(
nb_thread
<
64
)
if
(
nb_thread
<
64
)
...
@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
...
@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
if
(
thread_id
<
nb_thread
)
if
(
thread_id
<
nb_thread
)
{
{
const
float
*
my_src_ptr
=
(
const
float
*
)(
const
float
*
my_src_ptr
=
(
const
float
*
)(
((
uintptr
_t
)
src
)
&
COALESCED_ALIGN
);
((
size
_t
)
src
)
&
COALESCED_ALIGN
);
my_src_ptr
+=
thread_id
;
my_src_ptr
+=
thread_id
;
while
(
my_src_ptr
<
src
+
N
)
while
(
my_src_ptr
<
src
+
N
)
{
{
...
...
theano/sandbox/gpuarray/dnn.py
浏览文件 @
4814cd99
...
@@ -15,8 +15,9 @@ from theano.tensor.nnet import SoftmaxGrad
...
@@ -15,8 +15,9 @@ from theano.tensor.nnet import SoftmaxGrad
from
theano.tensor.signal.downsample
import
(
from
theano.tensor.signal.downsample
import
(
DownsampleFactorMax
,
MaxPoolGrad
,
AveragePoolGrad
)
DownsampleFactorMax
,
MaxPoolGrad
,
AveragePoolGrad
)
from
.
import
pygpu
,
init_dev
from
.
import
pygpu
from
.basic_ops
import
(
as_gpuarray_variable
,
from
.type
import
get_context
,
gpu_context_type
,
list_contexts
from
.basic_ops
import
(
as_gpuarray_variable
,
infer_context_name
,
gpu_contiguous
,
HostFromGpu
,
gpu_contiguous
,
HostFromGpu
,
GpuAllocEmpty
,
empty_like
)
GpuAllocEmpty
,
empty_like
)
from
.elemwise
import
GpuElemwise
from
.elemwise
import
GpuElemwise
...
@@ -29,28 +30,14 @@ from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
...
@@ -29,28 +30,14 @@ from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from
.opt_util
import
alpha_merge
,
output_merge
,
inplace_allocempty
from
.opt_util
import
alpha_merge
,
output_merge
,
inplace_allocempty
def
dnn_available
():
def
_dnn_check_compile
():
if
dnn_available
.
avail
is
not
None
:
return
dnn_available
.
avail
if
pygpu
is
None
:
dnn_available
.
msg
=
"PyGPU not available"
dnn_available
.
avail
=
False
return
False
if
not
init_dev
.
device
.
startswith
(
'cuda'
):
dnn_available
.
msg
=
"Not on a CUDA device. Got
%
s."
%
init_dev
.
device
dnn_available
.
avail
=
False
return
False
# This is a hack because bin_id is in the from of
# "sm_<major><minor>" for cuda devices.
if
pygpu
.
get_default_context
()
.
bin_id
[:
-
2
]
<
'30'
:
dnn_available
.
msg
=
"Device not supported by cuDNN"
dnn_available
.
avail
=
False
preambule
=
"""
preambule
=
"""
#include <stdio.h>
#include <stdio.h>
#include <cudnn.h>
#include <cudnn.h>
#include <cudnn_helper.h>
#include <cudnn_helper.h>
"""
"""
# No need for the context in here since we won't execute that code
body
=
"""
body
=
"""
cudnnHandle_t _handle = NULL;
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
cudnnStatus_t err;
...
@@ -70,35 +57,71 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
...
@@ -70,35 +57,71 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
# default gpu, not the one selected by the user. If mixed
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
# exclusive mode, this cause bad detection.
comp
,
out
,
err
=
GCC_compiler
.
try_flags
(
avail
,
out
,
err
=
GCC_compiler
.
try_flags
(
params
,
preambule
=
preambule
,
body
=
body
,
params
,
preambule
=
preambule
,
body
=
body
,
try_run
=
False
,
output
=
True
)
try_run
=
False
,
output
=
True
)
dnn_available
.
avail
=
comp
if
not
avail
:
if
not
dnn_available
.
avail
:
return
False
,
(
"Theano cannot compile with cuDNN. "
dnn_available
.
msg
=
(
"We got this error:
\n
"
+
str
(
err
))
"Theano cannot compile with cuDNN. We got this error:
\n
"
+
return
True
,
None
str
(
err
))
else
:
# If we can compile, check that we can import and run.
def
_dnn_check_version
():
v
=
version
()
v
=
version
()
if
v
<
2000
:
if
v
<
2000
:
dnn_available
.
avail
=
False
return
False
,
(
dnn_available
.
msg
=
(
"You have an old release of CuDNN (or a release candidate) "
"You have an old release of CuDNN (or a release candidate) "
"that isn't supported. Please update to at least v2 final "
"that isn't supported. Please update to at least v2 final "
"version."
)
"version."
)
if
v
>=
3000
and
v
<
3007
:
raise
RuntimeError
(
dnn_available
.
msg
)
return
False
,
(
if
v
>=
3000
and
v
<
3007
:
"You have installed a release candidate of CuDNN v3. This "
dnn_available
.
avail
=
False
"isn't supported. Please update to v3 final version."
)
dnn_available
.
msg
=
(
"You have installed a release candidate of CuDNN v3. This "
return
True
,
None
"isn't supported. Please update to v3 final version."
)
raise
RuntimeError
(
dnn_available
.
msg
)
def
dnn_present
():
return
dnn_available
.
avail
if
dnn_present
.
avail
is
not
None
:
return
dnn_present
.
avail
dnn_available
.
avail
=
None
if
pygpu
is
None
:
dnn_present
.
msg
=
"PyGPU not available"
dnn_present
.
avail
=
False
return
False
dnn_present
.
avail
,
dnn_present
.
msg
=
_dnn_check_compile
()
if
dnn_present
.
avail
:
dnn_present
.
avail
,
dnn_present
.
msg
=
_dnn_check_version
()
if
not
dnn_present
.
avail
:
raise
RuntimeError
(
dnn_present
.
msg
)
return
dnn_present
.
avail
dnn_present
.
avail
=
None
dnn_present
.
msg
=
None
def
dnn_available
(
context_name
):
if
not
dnn_present
():
dnn_available
.
msg
=
dnn_present
.
msg
return
False
ctx
=
get_context
(
context_name
)
if
not
ctx
.
kind
==
'cuda'
:
dnn_available
.
msg
=
"Not on a CUDA device."
return
False
# This is a hack because bin_id is in the from of
# "<something>_<major><minor>" for cuda devices.
if
ctx
.
bin_id
[:
-
2
]
<
'30'
:
dnn_available
.
msg
=
"Device not supported by cuDNN"
return
False
return
True
dnn_available
.
msg
=
None
dnn_available
.
msg
=
None
...
@@ -110,6 +133,10 @@ class DnnBase(COp):
...
@@ -110,6 +133,10 @@ class DnnBase(COp):
# dnn does not know about broadcasting, so we do not need to assert
# dnn does not know about broadcasting, so we do not need to assert
# the input broadcasting pattern.
# the input broadcasting pattern.
check_broadcast
=
False
check_broadcast
=
False
context_type
=
gpu_context_type
def
get_context
(
self
,
node
):
return
node
.
outputs
[
0
]
.
type
.
context
def
__init__
(
self
,
files
=
None
,
c_func
=
None
):
def
__init__
(
self
,
files
=
None
,
c_func
=
None
):
if
files
is
None
:
if
files
is
None
:
...
@@ -181,7 +208,7 @@ def version():
...
@@ -181,7 +208,7 @@ def version():
This also does a check that the header version matches the runtime version.
This also does a check that the header version matches the runtime version.
"""
"""
if
not
dnn_
available
():
if
not
dnn_
present
():
raise
Exception
(
raise
Exception
(
"We can't determine the cudnn version as it is not available"
,
"We can't determine the cudnn version as it is not available"
,
dnn_available
.
msg
)
dnn_available
.
msg
)
...
@@ -390,9 +417,10 @@ class GpuDnnConv(DnnBase):
...
@@ -390,9 +417,10 @@ class GpuDnnConv(DnnBase):
return
defs
return
defs
def
make_node
(
self
,
img
,
kern
,
output
,
desc
,
alpha
=
None
,
beta
=
None
):
def
make_node
(
self
,
img
,
kern
,
output
,
desc
,
alpha
=
None
,
beta
=
None
):
img
=
as_gpuarray_variable
(
img
)
ctx_name
=
infer_context_name
(
img
,
kern
,
output
)
kern
=
as_gpuarray_variable
(
kern
)
img
=
as_gpuarray_variable
(
img
,
ctx_name
)
output
=
as_gpuarray_variable
(
output
)
kern
=
as_gpuarray_variable
(
kern
,
ctx_name
)
output
=
as_gpuarray_variable
(
output
,
ctx_name
)
if
img
.
type
.
ndim
not
in
(
4
,
5
):
if
img
.
type
.
ndim
not
in
(
4
,
5
):
raise
TypeError
(
'img must be 4D or 5D tensor'
)
raise
TypeError
(
'img must be 4D or 5D tensor'
)
if
kern
.
type
.
ndim
not
in
(
4
,
5
):
if
kern
.
type
.
ndim
not
in
(
4
,
5
):
...
@@ -574,9 +602,10 @@ class GpuDnnConvGradW(DnnBase):
...
@@ -574,9 +602,10 @@ class GpuDnnConvGradW(DnnBase):
return
defs
return
defs
def
make_node
(
self
,
img
,
topgrad
,
output
,
desc
,
alpha
=
None
,
beta
=
None
):
def
make_node
(
self
,
img
,
topgrad
,
output
,
desc
,
alpha
=
None
,
beta
=
None
):
img
=
as_gpuarray_variable
(
img
)
ctx_name
=
infer_context_name
(
img
,
topgrad
,
output
)
topgrad
=
as_gpuarray_variable
(
topgrad
)
img
=
as_gpuarray_variable
(
img
,
ctx_name
)
output
=
as_gpuarray_variable
(
output
)
topgrad
=
as_gpuarray_variable
(
topgrad
,
ctx_name
)
output
=
as_gpuarray_variable
(
output
,
ctx_name
)
if
img
.
type
.
ndim
not
in
(
4
,
5
):
if
img
.
type
.
ndim
not
in
(
4
,
5
):
raise
TypeError
(
'img must be 4D or 5D tensor'
)
raise
TypeError
(
'img must be 4D or 5D tensor'
)
if
topgrad
.
type
.
ndim
not
in
(
4
,
5
):
if
topgrad
.
type
.
ndim
not
in
(
4
,
5
):
...
@@ -689,9 +718,10 @@ class GpuDnnConvGradI(DnnBase):
...
@@ -689,9 +718,10 @@ class GpuDnnConvGradI(DnnBase):
return
defs
return
defs
def
make_node
(
self
,
kern
,
topgrad
,
output
,
desc
,
alpha
=
None
,
beta
=
None
):
def
make_node
(
self
,
kern
,
topgrad
,
output
,
desc
,
alpha
=
None
,
beta
=
None
):
kern
=
as_gpuarray_variable
(
kern
)
ctx_name
=
infer_context_name
(
kern
,
topgrad
,
output
)
topgrad
=
as_gpuarray_variable
(
topgrad
)
kern
=
as_gpuarray_variable
(
kern
,
ctx_name
)
output
=
as_gpuarray_variable
(
output
)
topgrad
=
as_gpuarray_variable
(
topgrad
,
ctx_name
)
output
=
as_gpuarray_variable
(
output
,
ctx_name
)
if
kern
.
type
.
ndim
not
in
(
4
,
5
):
if
kern
.
type
.
ndim
not
in
(
4
,
5
):
raise
TypeError
(
'kern must be 4D or 5D tensor'
)
raise
TypeError
(
'kern must be 4D or 5D tensor'
)
if
topgrad
.
type
.
ndim
not
in
(
4
,
5
):
if
topgrad
.
type
.
ndim
not
in
(
4
,
5
):
...
@@ -770,6 +800,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
...
@@ -770,6 +800,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
warnings
.
warn
(
"workmem is deprecated, use algo instead"
,
stacklevel
=
2
)
warnings
.
warn
(
"workmem is deprecated, use algo instead"
,
stacklevel
=
2
)
algo
=
workmem
algo
=
workmem
fgraph
=
getattr
(
img
,
'fgraph'
,
None
)
or
getattr
(
kerns
,
'fgraph'
,
None
)
fgraph
=
getattr
(
img
,
'fgraph'
,
None
)
or
getattr
(
kerns
,
'fgraph'
,
None
)
ctx_name
=
infer_context_name
(
img
,
kerns
)
if
(
border_mode
==
'valid'
and
subsample
==
(
1
,
1
)
and
if
(
border_mode
==
'valid'
and
subsample
==
(
1
,
1
)
and
direction_hint
==
'bprop weights'
):
direction_hint
==
'bprop weights'
):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# Special case: We are asked to use GpuDnnConvGradW. We need to set
...
@@ -782,12 +813,13 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
...
@@ -782,12 +813,13 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns
=
gpu_contiguous
(
kerns
.
dimshuffle
(
1
,
0
,
2
,
3
))
kerns
=
gpu_contiguous
(
kerns
.
dimshuffle
(
1
,
0
,
2
,
3
))
shape2
=
shape_i
(
img
,
2
,
fgraph
)
-
shape_i
(
kerns
,
2
,
fgraph
)
+
1
shape2
=
shape_i
(
img
,
2
,
fgraph
)
-
shape_i
(
kerns
,
2
,
fgraph
)
+
1
shape3
=
shape_i
(
img
,
3
,
fgraph
)
-
shape_i
(
kerns
,
3
,
fgraph
)
+
1
shape3
=
shape_i
(
img
,
3
,
fgraph
)
-
shape_i
(
kerns
,
3
,
fgraph
)
+
1
out
=
GpuAllocEmpty
(
img
.
dtype
)(
shape_i
(
kerns
,
1
,
fgraph
),
out
=
GpuAllocEmpty
(
img
.
dtype
,
ctx_name
)(
shape_i
(
img
,
1
,
fgraph
),
shape2
,
shape3
)
shape_i
(
kerns
,
1
,
fgraph
),
shape_i
(
img
,
1
,
fgraph
),
shape2
,
shape3
)
desc
=
GpuDnnConvDesc
(
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
desc
=
GpuDnnConvDesc
(
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
conv_mode
=
'cross'
)(
out
.
shape
)
conv_mode
=
'cross'
)(
out
.
shape
)
conv
=
GpuDnnConvGradW
()(
img
,
kerns
,
out
,
desc
)
conv
=
GpuDnnConvGradW
()(
img
,
kerns
,
out
,
desc
)
return
as_gpuarray_variable
(
conv
.
dimshuffle
(
1
,
0
,
2
,
3
))
return
as_gpuarray_variable
(
conv
.
dimshuffle
(
1
,
0
,
2
,
3
)
,
ctx_name
)
elif
(
border_mode
==
'full'
and
subsample
==
(
1
,
1
)
and
elif
(
border_mode
==
'full'
and
subsample
==
(
1
,
1
)
and
direction_hint
!=
'forward!'
):
direction_hint
!=
'forward!'
):
...
@@ -799,9 +831,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
...
@@ -799,9 +831,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode
=
'cross'
if
conv_mode
==
'conv'
else
'conv'
conv_mode
=
'cross'
if
conv_mode
==
'conv'
else
'conv'
shape2
=
shape_i
(
img
,
2
,
fgraph
)
+
shape_i
(
kerns
,
2
,
fgraph
)
-
1
shape2
=
shape_i
(
img
,
2
,
fgraph
)
+
shape_i
(
kerns
,
2
,
fgraph
)
-
1
shape3
=
shape_i
(
img
,
3
,
fgraph
)
+
shape_i
(
kerns
,
3
,
fgraph
)
-
1
shape3
=
shape_i
(
img
,
3
,
fgraph
)
+
shape_i
(
kerns
,
3
,
fgraph
)
-
1
out
=
GpuAllocEmpty
(
img
.
dtype
)(
shape_i
(
img
,
0
,
fgraph
),
out
=
GpuAllocEmpty
(
img
.
dtype
,
ctx_name
)(
shape_i
(
img
,
0
,
fgraph
),
shape_i
(
kerns
,
1
,
fgraph
),
shape_i
(
kerns
,
1
,
fgraph
),
shape2
,
shape3
)
shape2
,
shape3
)
desc
=
GpuDnnConvDesc
(
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
desc
=
GpuDnnConvDesc
(
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
conv_mode
=
conv_mode
)(
kerns
.
shape
)
conv_mode
=
conv_mode
)(
kerns
.
shape
)
return
GpuDnnConvGradI
()(
kerns
,
img
,
out
,
desc
)
return
GpuDnnConvGradI
()(
kerns
,
img
,
out
,
desc
)
...
@@ -817,7 +849,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
...
@@ -817,7 +849,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp
=
GpuDnnConv
.
get_out_shape
(
img
.
shape
,
kerns
.
shape
,
out_shp
=
GpuDnnConv
.
get_out_shape
(
img
.
shape
,
kerns
.
shape
,
desc_op
.
border_mode
,
desc_op
.
border_mode
,
desc_op
.
subsample
)
desc_op
.
subsample
)
out
=
GpuAllocEmpty
(
img
.
dtype
)(
*
out_shp
)
out
=
GpuAllocEmpty
(
img
.
dtype
,
ctx_name
)(
*
out_shp
)
return
GpuDnnConv
(
algo
=
algo
)(
img
,
kerns
,
out
,
desc
)
return
GpuDnnConv
(
algo
=
algo
)(
img
,
kerns
,
out
,
desc
)
...
@@ -948,7 +980,7 @@ class GpuDnnPool(DnnBase):
...
@@ -948,7 +980,7 @@ class GpuDnnPool(DnnBase):
DnnBase
.
__init__
(
self
,
[
"dnn_pool.c"
],
"APPLY_SPECIFIC(dnn_pool)"
)
DnnBase
.
__init__
(
self
,
[
"dnn_pool.c"
],
"APPLY_SPECIFIC(dnn_pool)"
)
def
make_node
(
self
,
img
,
desc
):
def
make_node
(
self
,
img
,
desc
):
img
=
as_gpuarray_variable
(
img
)
img
=
as_gpuarray_variable
(
img
,
infer_context_name
(
img
)
)
if
desc
.
owner
is
not
None
:
if
desc
.
owner
is
not
None
:
e_ndim
=
desc
.
owner
.
op
.
get_ndim
()
+
2
e_ndim
=
desc
.
owner
.
op
.
get_ndim
()
+
2
...
@@ -1002,7 +1034,7 @@ class GpuDnnPoolGrad(DnnBase):
...
@@ -1002,7 +1034,7 @@ class GpuDnnPoolGrad(DnnBase):
The input of the pooling.
The input of the pooling.
out
out
The output of the pooling in the forward.
The output of the pooling in the forward.
inp
_grad
out
_grad
Same size as out, but is the corresponding gradient information.
Same size as out, but is the corresponding gradient information.
desc
desc
The pooling descriptor.
The pooling descriptor.
...
@@ -1016,9 +1048,10 @@ class GpuDnnPoolGrad(DnnBase):
...
@@ -1016,9 +1048,10 @@ class GpuDnnPoolGrad(DnnBase):
"APPLY_SPECIFIC(dnn_pool_grad)"
)
"APPLY_SPECIFIC(dnn_pool_grad)"
)
def
make_node
(
self
,
inp
,
out
,
out_grad
,
desc
):
def
make_node
(
self
,
inp
,
out
,
out_grad
,
desc
):
inp
=
as_gpuarray_variable
(
inp
)
ctx_name
=
infer_context_name
(
inp
,
out
,
out_grad
)
out_grad
=
as_gpuarray_variable
(
out_grad
)
inp
=
as_gpuarray_variable
(
inp
,
ctx_name
)
out
=
as_gpuarray_variable
(
out
)
out_grad
=
as_gpuarray_variable
(
out_grad
,
ctx_name
)
out
=
as_gpuarray_variable
(
out
,
ctx_name
)
if
desc
.
owner
is
not
None
:
if
desc
.
owner
is
not
None
:
nd
=
desc
.
owner
.
op
.
get_ndim
()
+
2
nd
=
desc
.
owner
.
op
.
get_ndim
()
+
2
...
@@ -1147,7 +1180,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
...
@@ -1147,7 +1180,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
c_func
=
"APPLY_SPECIFIC(softmax)"
c_func
=
"APPLY_SPECIFIC(softmax)"
def
make_node
(
self
,
x
):
def
make_node
(
self
,
x
):
x
=
as_gpuarray_variable
(
x
)
x
=
as_gpuarray_variable
(
x
,
infer_context_name
(
x
)
)
assert
x
.
ndim
==
4
assert
x
.
ndim
==
4
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
...
@@ -1181,8 +1214,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
...
@@ -1181,8 +1214,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
c_func
=
"APPLY_SPECIFIC(softmax_grad)"
c_func
=
"APPLY_SPECIFIC(softmax_grad)"
def
make_node
(
self
,
dy
,
sm
):
def
make_node
(
self
,
dy
,
sm
):
dy
=
as_gpuarray_variable
(
dy
)
ctx_name
=
infer_context_name
(
dy
,
sm
)
sm
=
as_gpuarray_variable
(
sm
)
dy
=
as_gpuarray_variable
(
dy
,
ctx_name
)
sm
=
as_gpuarray_variable
(
sm
,
ctx_name
)
assert
dy
.
ndim
==
4
assert
dy
.
ndim
==
4
assert
sm
.
ndim
==
4
assert
sm
.
ndim
==
4
return
Apply
(
self
,
[
dy
,
sm
],
[
sm
.
type
()])
return
Apply
(
self
,
[
dy
,
sm
],
[
sm
.
type
()])
...
@@ -1191,9 +1225,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
...
@@ -1191,9 +1225,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
# @register_opt('cudnn') # this optimizer is registered in opt.py instead.
# @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer
([
GpuConv
])
@local_optimizer
([
GpuConv
])
def
local_conv_dnn
(
node
):
def
local_conv_dnn
(
node
):
if
not
dnn_available
():
return
if
isinstance
(
node
.
op
,
GpuConv
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
not
dnn_available
(
node
.
outputs
[
0
]
.
type
.
context_name
):
return
if
node
.
op
.
border_mode
not
in
[
'full'
,
'valid'
]:
if
node
.
op
.
border_mode
not
in
[
'full'
,
'valid'
]:
return
return
img
,
kern
=
node
.
inputs
img
,
kern
=
node
.
inputs
...
@@ -1211,9 +1245,9 @@ def local_conv_dnn(node):
...
@@ -1211,9 +1245,9 @@ def local_conv_dnn(node):
# because for some input/kernel shape configurations, this is faster.
# because for some input/kernel shape configurations, this is faster.
@local_optimizer
([
GpuConv
])
@local_optimizer
([
GpuConv
])
def
local_conv_dnn_alternative
(
node
):
def
local_conv_dnn_alternative
(
node
):
if
not
dnn_available
():
return
if
isinstance
(
node
.
op
,
GpuConv
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
not
dnn_available
(
node
.
outputs
[
0
]
.
type
.
context_name
):
return
border_mode
=
node
.
op
.
border_mode
border_mode
=
node
.
op
.
border_mode
subsample
=
node
.
op
.
subsample
subsample
=
node
.
op
.
subsample
if
border_mode
not
in
[
'full'
,
'valid'
]
or
subsample
!=
(
1
,
1
):
if
border_mode
not
in
[
'full'
,
'valid'
]
or
subsample
!=
(
1
,
1
):
...
@@ -1304,8 +1338,8 @@ def local_dnn_convi_output_merge(node, *inputs):
...
@@ -1304,8 +1338,8 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@op_lifter
([
DownsampleFactorMax
])
@op_lifter
([
DownsampleFactorMax
])
def
local_pool_dnn_alternative
(
node
):
def
local_pool_dnn_alternative
(
node
,
ctx_name
):
if
not
dnn_available
():
if
not
dnn_available
(
ctx_name
):
return
return
if
not
node
.
op
.
ignore_border
:
if
not
node
.
op
.
ignore_border
:
return
return
...
@@ -1320,8 +1354,8 @@ def local_pool_dnn_alternative(node):
...
@@ -1320,8 +1354,8 @@ def local_pool_dnn_alternative(node):
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@op_lifter
([
MaxPoolGrad
])
@op_lifter
([
MaxPoolGrad
])
def
local_pool_dnn_grad_stride
(
node
):
def
local_pool_dnn_grad_stride
(
node
,
ctx_name
):
if
not
dnn_available
():
if
not
dnn_available
(
ctx_name
):
return
return
if
not
node
.
op
.
ignore_border
:
if
not
node
.
op
.
ignore_border
:
return
return
...
@@ -1340,8 +1374,8 @@ def local_pool_dnn_grad_stride(node):
...
@@ -1340,8 +1374,8 @@ def local_pool_dnn_grad_stride(node):
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@op_lifter
([
AveragePoolGrad
])
@op_lifter
([
AveragePoolGrad
])
def
local_avg_pool_dnn_grad_stride
(
node
):
def
local_avg_pool_dnn_grad_stride
(
node
,
ctx_name
):
if
not
dnn_available
():
if
not
dnn_available
(
ctx_name
):
return
return
if
not
node
.
op
.
ignore_border
:
if
not
node
.
op
.
ignore_border
:
return
return
...
@@ -1363,22 +1397,23 @@ def local_avg_pool_dnn_grad_stride(node):
...
@@ -1363,22 +1397,23 @@ def local_avg_pool_dnn_grad_stride(node):
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@local_optimizer
([
GpuSoftmax
])
@local_optimizer
([
GpuSoftmax
])
def
local_softmax_dnn
(
node
):
def
local_softmax_dnn
(
node
):
if
not
dnn_available
():
return
if
isinstance
(
node
.
op
,
GpuSoftmax
):
if
isinstance
(
node
.
op
,
GpuSoftmax
):
if
not
dnn_available
(
node
.
outputs
[
0
]
.
type
.
context_name
):
return
ins
=
node
.
inputs
[
0
]
.
dimshuffle
(
0
,
1
,
'x'
,
'x'
)
ins
=
node
.
inputs
[
0
]
.
dimshuffle
(
0
,
1
,
'x'
,
'x'
)
ins
=
gpu_contiguous
(
ins
)
ins
=
gpu_contiguous
(
ins
)
out
=
GpuDnnSoftmax
(
'accurate'
,
'channel'
)(
ins
)
out
=
GpuDnnSoftmax
(
'accurate'
,
'channel'
)(
ins
)
out
=
as_gpuarray_variable
(
out
.
dimshuffle
(
0
,
1
))
out
=
as_gpuarray_variable
(
out
.
dimshuffle
(
0
,
1
)
,
out
.
type
.
context_name
)
return
[
out
]
return
[
out
]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@local_optimizer
([
GpuElemwise
])
@local_optimizer
([
GpuElemwise
])
def
local_log_softmax_dnn
(
node
):
def
local_log_softmax_dnn
(
node
):
if
not
dnn_available
()
or
version
()
<
3000
:
if
version
()
<
3000
:
# No log-softmax before cudnn v3
# No log-softmax before cudnn v3
return
return
# This looks for GpuDnnSoftmax so we know that we have cudnn.
if
(
isinstance
(
node
.
op
,
GpuElemwise
)
and
if
(
isinstance
(
node
.
op
,
GpuElemwise
)
and
isinstance
(
node
.
op
.
scalar_op
,
Log
)
and
isinstance
(
node
.
op
.
scalar_op
,
Log
)
and
node
.
inputs
[
0
]
.
owner
and
node
.
inputs
[
0
]
.
owner
and
...
@@ -1392,24 +1427,25 @@ def local_log_softmax_dnn(node):
...
@@ -1392,24 +1427,25 @@ def local_log_softmax_dnn(node):
class
NoCuDNNRaise
(
Optimizer
):
class
NoCuDNNRaise
(
Optimizer
):
def
apply
(
self
,
fgraph
):
def
apply
(
self
,
fgraph
):
"""
"""
Raise a
RuntimeE
rror if cudnn can't be used.
Raise a
e
rror if cudnn can't be used.
"""
"""
if
not
dnn_available
():
for
c
in
list_contexts
():
# Make an assert error as we want Theano to fail, not
if
not
dnn_available
(
c
):
# just skip this optimization.
# Make an assert error as we want Theano to fail, not
raise
AssertionError
(
# just skip this optimization.
"cuDNN optimization was enabled, but Theano was not able"
raise
AssertionError
(
" to use it. We got this error:
\n
"
+
"cuDNN optimization was enabled, but Theano was not able "
dnn_available
.
msg
)
"to use it for context "
+
c
+
". We got this error:
\n
"
+
dnn_available
.
msg
)
gpu_seqopt
.
register
(
"NoCuDNNRaise"
,
NoCuDNNRaise
(),
0
,
'cudnn'
)
gpu_seqopt
.
register
(
"NoCuDNNRaise"
,
NoCuDNNRaise
(),
0
,
'cudnn'
)
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@op_lifter
([
SoftmaxGrad
])
@op_lifter
([
SoftmaxGrad
])
def
local_softmax_dnn_grad
(
node
):
def
local_softmax_dnn_grad
(
node
,
ctx_name
):
if
not
dnn_available
():
if
not
dnn_available
(
ctx_name
):
return
return
ins
=
[]
ins
=
[]
for
n
in
node
.
inputs
:
for
n
in
node
.
inputs
:
...
...
theano/sandbox/gpuarray/dnn_base.c
浏览文件 @
4814cd99
...
@@ -107,14 +107,14 @@ cudnnHandle_t APPLY_SPECIFIC(_handle);
...
@@ -107,14 +107,14 @@ cudnnHandle_t APPLY_SPECIFIC(_handle);
#section init_code_struct
#section init_code_struct
{
{
cuda_enter
(
pygpu_default_context
()
->
ctx
);
cuda_enter
(
CONTEXT
->
ctx
);
cudnnStatus_t
err
;
cudnnStatus_t
err
;
APPLY_SPECIFIC
(
_handle
)
=
NULL
;
APPLY_SPECIFIC
(
_handle
)
=
NULL
;
if
((
err
=
cudnnCreate
(
&
APPLY_SPECIFIC
(
_handle
)))
!=
CUDNN_STATUS_SUCCESS
)
{
if
((
err
=
cudnnCreate
(
&
APPLY_SPECIFIC
(
_handle
)))
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"could not create cuDNN handle: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"could not create cuDNN handle: %s"
,
cudnnGetErrorString
(
err
));
cudnnGetErrorString
(
err
));
cuda_exit
(
pygpu_default_context
()
->
ctx
);
cuda_exit
(
CONTEXT
->
ctx
);
FAIL
;
FAIL
;
}
}
cuda_exit
(
pygpu_default_context
()
->
ctx
);
cuda_exit
(
CONTEXT
->
ctx
);
}
}
theano/sandbox/gpuarray/dnn_fwd.c
浏览文件 @
4814cd99
...
@@ -5,12 +5,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -5,12 +5,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArrayObject
*
om
,
PyGpuArrayObject
*
om
,
cudnnConvolutionDescriptor_t
desc
,
cudnnConvolutionDescriptor_t
desc
,
double
alpha
,
double
beta
,
double
alpha
,
double
beta
,
PyGpuArrayObject
**
output
)
{
PyGpuArrayObject
**
output
,
PyGpuContextObject
*
c
)
{
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
float
af
=
alpha
,
bf
=
beta
;
float
af
=
alpha
,
bf
=
beta
;
void
*
alpha_p
;
void
*
alpha_p
;
void
*
beta_p
;
void
*
beta_p
;
PyGpuContextObject
*
c
=
pygpu_default_context
();
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
])
{
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
])
{
PyErr_SetString
(
PyExc_ValueError
,
PyErr_SetString
(
PyExc_ValueError
,
...
...
theano/sandbox/gpuarray/dnn_gi.c
浏览文件 @
4814cd99
...
@@ -4,12 +4,12 @@ int
...
@@ -4,12 +4,12 @@ int
APPLY_SPECIFIC
(
conv_gi
)(
PyGpuArrayObject
*
kerns
,
PyGpuArrayObject
*
output
,
APPLY_SPECIFIC
(
conv_gi
)(
PyGpuArrayObject
*
kerns
,
PyGpuArrayObject
*
output
,
PyGpuArrayObject
*
im
,
PyGpuArrayObject
*
im
,
cudnnConvolutionDescriptor_t
desc
,
cudnnConvolutionDescriptor_t
desc
,
double
alpha
,
double
beta
,
PyGpuArrayObject
**
input
)
{
double
alpha
,
double
beta
,
PyGpuArrayObject
**
input
,
PyGpuContextObject
*
c
)
{
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
float
af
=
alpha
,
bf
=
beta
;
float
af
=
alpha
,
bf
=
beta
;
void
*
alpha_p
;
void
*
alpha_p
;
void
*
beta_p
;
void
*
beta_p
;
PyGpuContextObject
*
c
=
pygpu_default_context
();
if
(
PyGpuArray_DIMS
(
im
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
])
{
if
(
PyGpuArray_DIMS
(
im
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
])
{
PyErr_SetString
(
PyExc_ValueError
,
"images and kernel must have the same "
PyErr_SetString
(
PyExc_ValueError
,
"images and kernel must have the same "
...
...
theano/sandbox/gpuarray/dnn_gw.c
浏览文件 @
4814cd99
...
@@ -4,12 +4,12 @@ int
...
@@ -4,12 +4,12 @@ int
APPLY_SPECIFIC
(
conv_gw
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
output
,
APPLY_SPECIFIC
(
conv_gw
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
output
,
PyGpuArrayObject
*
km
,
PyGpuArrayObject
*
km
,
cudnnConvolutionDescriptor_t
desc
,
cudnnConvolutionDescriptor_t
desc
,
double
alpha
,
double
beta
,
PyGpuArrayObject
**
kerns
)
{
double
alpha
,
double
beta
,
PyGpuArrayObject
**
kerns
,
PyGpuContextObject
*
c
)
{
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
float
af
=
alpha
,
bf
=
beta
;
float
af
=
alpha
,
bf
=
beta
;
void
*
alpha_p
;
void
*
alpha_p
;
void
*
beta_p
;
void
*
beta_p
;
PyGpuContextObject
*
c
=
pygpu_default_context
();
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
km
)[
1
])
{
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
km
)[
1
])
{
PyErr_SetString
(
PyExc_ValueError
,
PyErr_SetString
(
PyExc_ValueError
,
...
...
theano/sandbox/gpuarray/dnn_pool.c
浏览文件 @
4814cd99
...
@@ -29,10 +29,10 @@ if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFI
...
@@ -29,10 +29,10 @@ if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFI
int
APPLY_SPECIFIC
(
dnn_pool
)(
PyGpuArrayObject
*
img
,
int
APPLY_SPECIFIC
(
dnn_pool
)(
PyGpuArrayObject
*
img
,
cudnnPoolingDescriptor_t
desc
,
cudnnPoolingDescriptor_t
desc
,
PyGpuArrayObject
**
out
)
{
PyGpuArrayObject
**
out
,
PyGpuContextObject
*
c
)
{
cudnnStatus_t
err
;
cudnnStatus_t
err
;
size_t
dims
[
5
];
size_t
dims
[
5
];
PyGpuContextObject
*
c
=
pygpu_default_context
();
if
(
!
GpuArray_IS_C_CONTIGUOUS
(
&
img
->
ga
))
{
if
(
!
GpuArray_IS_C_CONTIGUOUS
(
&
img
->
ga
))
{
PyErr_SetString
(
PyExc_ValueError
,
"Only contiguous inputs are supported."
);
PyErr_SetString
(
PyExc_ValueError
,
"Only contiguous inputs are supported."
);
...
...
theano/sandbox/gpuarray/dnn_pool_grad.c
浏览文件 @
4814cd99
...
@@ -53,9 +53,9 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
...
@@ -53,9 +53,9 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyGpuArrayObject
*
out
,
PyGpuArrayObject
*
out
,
PyGpuArrayObject
*
out_grad
,
PyGpuArrayObject
*
out_grad
,
cudnnPoolingDescriptor_t
desc
,
cudnnPoolingDescriptor_t
desc
,
PyGpuArrayObject
**
inp_grad
)
{
PyGpuArrayObject
**
inp_grad
,
PyGpuContextObject
*
c
)
{
cudnnStatus_t
err
;
cudnnStatus_t
err
;
PyGpuContextObject
*
c
=
pygpu_default_context
();
if
(
!
GpuArray_IS_C_CONTIGUOUS
(
&
inp
->
ga
))
{
if
(
!
GpuArray_IS_C_CONTIGUOUS
(
&
inp
->
ga
))
{
PyErr_SetString
(
PyExc_ValueError
,
"Only contiguous inputs are supported."
);
PyErr_SetString
(
PyExc_ValueError
,
"Only contiguous inputs are supported."
);
...
@@ -81,7 +81,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
...
@@ -81,7 +81,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
if
(
theano_prep_output
(
inp_grad
,
PyGpuArray_NDIM
(
inp
),
if
(
theano_prep_output
(
inp_grad
,
PyGpuArray_NDIM
(
inp
),
PyGpuArray_DIMS
(
inp
),
inp
->
ga
.
typecode
,
PyGpuArray_DIMS
(
inp
),
inp
->
ga
.
typecode
,
GA_C_ORDER
,
pygpu_default_context
()
)
!=
0
)
{
GA_C_ORDER
,
c
)
!=
0
)
{
return
1
;
return
1
;
}
}
...
...
theano/sandbox/gpuarray/dnn_softmax.c
浏览文件 @
4814cd99
...
@@ -34,9 +34,9 @@ if (APPLY_SPECIFIC(output) != NULL)
...
@@ -34,9 +34,9 @@ if (APPLY_SPECIFIC(output) != NULL)
#section support_code_struct
#section support_code_struct
int
APPLY_SPECIFIC
(
softmax
)(
PyGpuArrayObject
*
x
,
int
APPLY_SPECIFIC
(
softmax
)(
PyGpuArrayObject
*
x
,
PyGpuArrayObject
**
out
)
{
PyGpuArrayObject
**
out
,
PyGpuContextObject
*
c
)
{
cudnnStatus_t
err
;
cudnnStatus_t
err
;
PyGpuContextObject
*
c
=
pygpu_default_context
();
if
(
c_set_tensorNd
(
x
,
APPLY_SPECIFIC
(
input
))
!=
0
)
if
(
c_set_tensorNd
(
x
,
APPLY_SPECIFIC
(
input
))
!=
0
)
return
1
;
return
1
;
...
...
theano/sandbox/gpuarray/dnn_softmax_grad.c
浏览文件 @
4814cd99
...
@@ -45,9 +45,9 @@ if (APPLY_SPECIFIC(dx) != NULL)
...
@@ -45,9 +45,9 @@ if (APPLY_SPECIFIC(dx) != NULL)
int
APPLY_SPECIFIC
(
softmax_grad
)(
PyGpuArrayObject
*
dy
,
int
APPLY_SPECIFIC
(
softmax_grad
)(
PyGpuArrayObject
*
dy
,
PyGpuArrayObject
*
sm
,
PyGpuArrayObject
*
sm
,
PyGpuArrayObject
**
dx
)
{
PyGpuArrayObject
**
dx
,
PyGpuContextObject
*
c
)
{
cudnnStatus_t
err
;
cudnnStatus_t
err
;
PyGpuContextObject
*
c
=
pygpu_default_context
();
if
(
c_set_tensorNd
(
dy
,
APPLY_SPECIFIC
(
dy
))
!=
0
)
if
(
c_set_tensorNd
(
dy
,
APPLY_SPECIFIC
(
dy
))
!=
0
)
return
1
;
return
1
;
...
...
theano/sandbox/gpuarray/elemwise.py
浏览文件 @
4814cd99
...
@@ -20,8 +20,8 @@ try:
...
@@ -20,8 +20,8 @@ try:
except
ImportError
:
except
ImportError
:
pass
pass
from
.basic_ops
import
(
as_gpuarray_variable
,
HideC
,
from
.basic_ops
import
(
as_gpuarray_variable
,
HideC
,
GpuKernelBase
,
Kernel
,
GpuKernelBase
,
Kernel
)
infer_context_name
)
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
from
.fp16_help
import
load_w
,
write_w
from
.fp16_help
import
load_w
,
write_w
...
@@ -37,7 +37,7 @@ def make_argument(v, name):
...
@@ -37,7 +37,7 @@ def make_argument(v, name):
return
ArrayArg
(
numpy
.
dtype
(
v
.
type
.
dtype
),
name
)
return
ArrayArg
(
numpy
.
dtype
(
v
.
type
.
dtype
),
name
)
def
ensure_allocated
(
storage
,
shape
,
dtype
):
def
ensure_allocated
(
storage
,
shape
,
dtype
,
ctx
):
odat
=
storage
[
0
]
odat
=
storage
[
0
]
if
odat
is
not
None
:
if
odat
is
not
None
:
if
odat
.
shape
!=
shape
:
if
odat
.
shape
!=
shape
:
...
@@ -45,7 +45,7 @@ def ensure_allocated(storage, shape, dtype):
...
@@ -45,7 +45,7 @@ def ensure_allocated(storage, shape, dtype):
# we have to allocate output storage.
# we have to allocate output storage.
odat
=
None
odat
=
None
if
odat
is
None
:
if
odat
is
None
:
odat
=
pygpu
.
empty
(
shape
,
dtype
=
dtype
)
odat
=
pygpu
.
empty
(
shape
,
dtype
=
dtype
,
context
=
ctx
)
storage
[
0
]
=
odat
storage
[
0
]
=
odat
return
odat
return
odat
...
@@ -67,12 +67,14 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -67,12 +67,14 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return
"GpuElemwise{
%
s}
%
s<gpuarray>"
%
(
self
.
scalar_op
,
items
)
return
"GpuElemwise{
%
s}
%
s<gpuarray>"
%
(
self
.
scalar_op
,
items
)
def
make_node
(
self
,
*
inputs
):
def
make_node
(
self
,
*
inputs
):
ctx_name
=
infer_context_name
(
*
inputs
)
res
=
Elemwise
.
make_node
(
self
,
*
inputs
)
res
=
Elemwise
.
make_node
(
self
,
*
inputs
)
outputs
=
[
GpuArrayType
(
broadcastable
=
o
.
type
.
broadcastable
,
outputs
=
[
GpuArrayType
(
broadcastable
=
o
.
type
.
broadcastable
,
context_name
=
ctx_name
,
dtype
=
o
.
type
.
dtype
)()
for
o
in
res
.
outputs
]
dtype
=
o
.
type
.
dtype
)()
for
o
in
res
.
outputs
]
if
len
(
outputs
)
>
1
:
if
len
(
outputs
)
>
1
:
raise
NotImplementedError
()
raise
NotImplementedError
()
inputs
=
[
as_gpuarray_variable
(
i
)
for
i
in
inputs
]
inputs
=
[
as_gpuarray_variable
(
i
,
ctx_name
)
for
i
in
inputs
]
node
=
Apply
(
self
,
inputs
,
outputs
)
node
=
Apply
(
self
,
inputs
,
outputs
)
# Try to generate the kernel to catch SupportCodeErrors
# Try to generate the kernel to catch SupportCodeErrors
...
@@ -99,6 +101,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -99,6 +101,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return
node
return
node
def
get_context
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
generate_kernel
(
self
,
node
,
nodename
):
def
generate_kernel
(
self
,
node
,
nodename
):
inps
=
[
make_argument
(
i
,
'i
%
d'
%
(
n
,))
for
n
,
i
in
inps
=
[
make_argument
(
i
,
'i
%
d'
%
(
n
,))
for
n
,
i
in
enumerate
(
node
.
inputs
)]
enumerate
(
node
.
inputs
)]
...
@@ -168,7 +173,8 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -168,7 +173,8 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
(
"npy_float64"
,
"ga_double"
),
(
"npy_float64"
,
"ga_double"
),
]:
]:
kop
=
kop
.
replace
(
npy
,
ga
)
kop
=
kop
.
replace
(
npy
,
ga
)
return
ElemwiseKernel
(
None
,
inps
+
outs
,
kop
,
preamble
=
support_code
)
return
ElemwiseKernel
(
self
.
get_context
(
node
),
inps
+
outs
,
kop
,
preamble
=
support_code
)
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
...
@@ -177,8 +183,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -177,8 +183,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return
self
.
scalar_op
.
c_support_code
()
return
self
.
scalar_op
.
c_support_code
()
def
_gpu_kernel_code
(
self
,
node
,
nodename
):
def
_gpu_kernel_code
(
self
,
node
,
nodename
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
# This is useless by itself, but will serve an eventual c_code
# This is useless by itself, but will serve an eventual c_code
# implementation
# implementation
k
=
self
.
generate_kernel
(
node
,
nodename
)
k
=
self
.
generate_kernel
(
node
,
nodename
)
...
@@ -191,8 +195,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -191,8 +195,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return
'
\n
'
.
join
(
res
)
return
'
\n
'
.
join
(
res
)
def
gpu_kernels
(
self
,
node
,
nodename
):
def
gpu_kernels
(
self
,
node
,
nodename
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
src
=
self
.
_gpu_kernel_code
(
node
,
nodename
)
src
=
self
.
_gpu_kernel_code
(
node
,
nodename
)
nd
=
node
.
outputs
[
0
]
.
ndim
nd
=
node
.
outputs
[
0
]
.
ndim
params
=
[
'uintp'
]
params
=
[
'uintp'
]
...
@@ -214,12 +216,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -214,12 +216,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
objvar
=
'elem_
%
d_
%
s'
%
(
nd
,
nodename
))]
objvar
=
'elem_
%
d_
%
s'
%
(
nd
,
nodename
))]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl
'
:
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
'cuda
'
:
raise
MethodNotDefined
(
'cuda only'
)
raise
MethodNotDefined
(
'cuda only'
)
nd
=
node
.
outputs
[
0
]
.
ndim
nd
=
node
.
outputs
[
0
]
.
ndim
fail
=
sub
[
"fail"
]
fail
=
sub
[
"fail"
]
initial_dims
=
','
.
join
(
'1'
for
i
in
xrange
(
nd
))
initial_dims
=
','
.
join
(
'1'
for
i
in
xrange
(
nd
))
opname
=
str
(
self
.
scalar_op
)
opname
=
str
(
self
.
scalar_op
)
ctx
=
sub
[
'context'
]
# check that all inputs have valid dimensions
# check that all inputs have valid dimensions
emitted_inames
=
{}
emitted_inames
=
{}
...
@@ -264,11 +267,10 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -264,11 +267,10 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
if
iname
in
emitted_inames
:
if
iname
in
emitted_inames
:
continue
continue
code
+=
"""
code
+=
"""
//std::cerr << "C_CODE
%(opname)
s checking input
%(iname)
s
\\
n";
if (
%(nd)
s != PyGpuArray_NDIM(
%(iname)
s))
if (
%(nd)
s != PyGpuArray_NDIM(
%(iname)
s))
{
{
PyErr_Format(PyExc_TypeError,
PyErr_Format(PyExc_TypeError,
"need
%(nd)
s dims, not
%%
i
",
"need
%(nd)
s dims, not
%%
u
",
PyGpuArray_NDIM(
%(iname)
s));
PyGpuArray_NDIM(
%(iname)
s));
%(fail)
s;
%(fail)
s;
}
}
...
@@ -279,14 +281,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -279,14 +281,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
PyGpuArray_DIMS(
%(iname)
s)[i] == 1)) &&
PyGpuArray_DIMS(
%(iname)
s)[i] == 1)) &&
(dims[i] != PyGpuArray_DIMS(
%(iname)
s)[i]))
(dims[i] != PyGpuArray_DIMS(
%(iname)
s)[i]))
{
{
//std::cerr << "C_CODE
%(opname)
s checking input
%(iname)
s failed
\\
n";
PyErr_Format(PyExc_ValueError,
PyErr_Format(PyExc_ValueError,
"GpuElemwise. Input dimension mis-match. Input"
"GpuElemwise. Input dimension mis-match. Input"
"
%(idx)
d (indices start at 0) has shape[
%%
i] ==
%%
i
"
"
%(idx)
d (indices start at 0) has shape[
%%
d] ==
%%
llu
"
", but the output's size on that axis is
%%
i
.",
", but the output's size on that axis is
%%
llu
.",
i,
i,
PyGpuArray_DIMS(
%(iname)
s)[i],
(unsigned long long)
PyGpuArray_DIMS(
%(iname)
s)[i],
dims[i]
(unsigned long long)
dims[i]
);
);
%(fail)
s;
%(fail)
s;
}
}
...
@@ -314,15 +315,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -314,15 +315,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
{
{
%(oname)
s = pygpu_empty(
%(nd)
d, dims,
%(oname)
s = pygpu_empty(
%(nd)
d, dims,
%(typecode)
s, GA_C_ORDER,
%(typecode)
s, GA_C_ORDER,
pygpu_default_context()
, Py_None);
%(ctx)
s
, Py_None);
if (!
%(oname)
s) {
if (!
%(oname)
s) {
//TODO, this check don't seam good.
%(fail)
s
//TODO, set exception?
%(fail)
s
}
}
}
}
//std::cerr << "ELEMWISE NEW
%(oname)
s nd" << PyGpuArray_NDIM(
%(oname)
s) << "
\\
n";
//std::cerr << "ELEMWISE NEW
%(oname)
s data" <<
%(oname)
s->devdata << "
\\
n";
"""
%
locals
()
"""
%
locals
()
else
:
else
:
input_idx
=
self
.
inplace_pattern
[
idx
]
input_idx
=
self
.
inplace_pattern
[
idx
]
...
@@ -337,19 +334,17 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -337,19 +334,17 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
PyErr_Format(PyExc_ValueError,
PyErr_Format(PyExc_ValueError,
"GpuElemwise. Output dimension mis-match. Output"
"GpuElemwise. Output dimension mis-match. Output"
"
%(idx)
d (indices start at 0), working inplace"
"
%(idx)
d (indices start at 0), working inplace"
" on input
%(input_idx)
s, has shape[
%%
i] ==
%%
i
"
" on input
%(input_idx)
s, has shape[
%%
i] ==
%%
llu
"
", but the output's size on that axis is
%%
i
.",
", but the output's size on that axis is
%%
llu
.",
i,
i,
PyGpuArray_DIMS(
%(oname)
s)[i],
(unsigned long long)
PyGpuArray_DIMS(
%(oname)
s)[i],
dims[i]
(unsigned long long)
dims[i]
);
);
Py_DECREF(
%(oname)
s);
Py_DECREF(
%(oname)
s);
%(oname)
s = NULL;
%(oname)
s = NULL;
%(fail)
s;
%(fail)
s;
}
}
}
}
//std::cerr << "ELEMWISE NEW
%(oname)
s nd" << PyGpuArray_NDIM(
%(oname)
s) << "
\\
n";
//std::cerr << "ELEMWISE NEW
%(oname)
s data" <<
%(oname)
s->devdata << "
\\
n";
"""
%
locals
()
"""
%
locals
()
z
=
outputs
[
0
]
z
=
outputs
[
0
]
code
+=
"""numEls = PyGpuArray_SIZE(
%(z)
s);
code
+=
"""numEls = PyGpuArray_SIZE(
%(z)
s);
...
@@ -367,7 +362,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -367,7 +362,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
if (threads_per_block * n_blocks < numEls)
if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
//std::cerr << "calling callkernel returned
\\
n";
"""
%
locals
()
"""
%
locals
()
kname
=
'elem_
%
d_
%
s'
%
(
nd
,
name
)
kname
=
'elem_
%
d_
%
s'
%
(
nd
,
name
)
...
@@ -407,7 +401,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -407,7 +401,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
"""
%
locals
()
"""
%
locals
()
return
str
(
code
)
return
str
(
code
)
def
perform
(
self
,
node
,
inputs
,
output_storage
):
def
perform
(
self
,
node
,
inputs
,
output_storage
,
ctx
):
# Try to reuse the kernel from a previous call to hopefully
# Try to reuse the kernel from a previous call to hopefully
# avoid recompiling
# avoid recompiling
if
not
hasattr
(
node
,
'_cache_elemwise_k'
):
if
not
hasattr
(
node
,
'_cache_elemwise_k'
):
...
@@ -428,7 +422,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -428,7 +422,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
if
n
in
self
.
inplace_pattern
:
if
n
in
self
.
inplace_pattern
:
stor
[
0
]
=
inputs
[
self
.
inplace_pattern
[
n
]]
stor
[
0
]
=
inputs
[
self
.
inplace_pattern
[
n
]]
else
:
else
:
args
.
append
(
ensure_allocated
(
stor
,
out_shape
,
out
.
type
.
dtype
))
args
.
append
(
ensure_allocated
(
stor
,
out_shape
,
out
.
type
.
dtype
,
ctx
))
node
.
_cache_elemwise_k
(
*
args
,
broadcast
=
True
)
node
.
_cache_elemwise_k
(
*
args
,
broadcast
=
True
)
if
config
.
gpuarray
.
sync
:
if
config
.
gpuarray
.
sync
:
...
@@ -453,10 +447,12 @@ class GpuDimShuffle(HideC, DimShuffle):
...
@@ -453,10 +447,12 @@ class GpuDimShuffle(HideC, DimShuffle):
_f16_ok
=
True
_f16_ok
=
True
def
make_node
(
self
,
input
):
def
make_node
(
self
,
input
):
ctx_name
=
infer_context_name
(
input
)
res
=
DimShuffle
.
make_node
(
self
,
input
)
res
=
DimShuffle
.
make_node
(
self
,
input
)
otype
=
GpuArrayType
(
dtype
=
res
.
outputs
[
0
]
.
type
.
dtype
,
otype
=
GpuArrayType
(
dtype
=
res
.
outputs
[
0
]
.
type
.
dtype
,
broadcastable
=
res
.
outputs
[
0
]
.
type
.
broadcastable
)
broadcastable
=
res
.
outputs
[
0
]
.
type
.
broadcastable
,
input
=
as_gpuarray_variable
(
input
)
context_name
=
ctx_name
)
input
=
as_gpuarray_variable
(
input
,
ctx_name
)
return
Apply
(
self
,
[
input
],
[
otype
()])
return
Apply
(
self
,
[
input
],
[
otype
()])
def
__str__
(
self
):
def
__str__
(
self
):
...
@@ -588,7 +584,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -588,7 +584,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
quite possible that the GPU might be slower for some cases.
quite possible that the GPU might be slower for some cases.
"""
"""
__props__
=
(
'axis'
,
'reduce_mask'
,
'dtype'
,
'acc_dtype'
,
'scalar_op'
,
'pre_scalar_op'
)
_f16_ok
=
True
_f16_ok
=
True
def
__init__
(
self
,
scalar_op
,
axis
=
None
,
def
__init__
(
self
,
scalar_op
,
axis
=
None
,
...
@@ -607,24 +604,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -607,24 +604,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if
pre_scalar_op
:
if
pre_scalar_op
:
assert
pre_scalar_op
.
nin
==
1
assert
pre_scalar_op
.
nin
==
1
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
and
self
.
axis
==
other
.
axis
and
self
.
reduce_mask
==
other
.
reduce_mask
and
self
.
dtype
==
other
.
dtype
and
self
.
acc_dtype
==
other
.
acc_dtype
and
self
.
scalar_op
==
other
.
scalar_op
and
self
.
pre_scalar_op
==
other
.
pre_scalar_op
)
def
__hash__
(
self
):
return
(
hash
(
type
(
self
))
^
hash
(
self
.
axis
)
^
hash
(
self
.
reduce_mask
)
^
hash
(
self
.
dtype
)
^
hash
(
self
.
acc_dtype
)
^
hash
(
type
(
self
.
scalar_op
))
^
hash
(
type
(
self
.
pre_scalar_op
)))
def
__str__
(
self
):
def
__str__
(
self
):
pre
=
""
pre
=
""
if
self
.
pre_scalar_op
:
if
self
.
pre_scalar_op
:
...
@@ -641,7 +620,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -641,7 +620,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
self
.
pre_scalar_op
=
None
self
.
pre_scalar_op
=
None
def
make_node
(
self
,
x
):
def
make_node
(
self
,
x
):
x
=
as_gpuarray_variable
(
x
)
x
=
as_gpuarray_variable
(
x
,
infer_context_name
(
x
))
if
x
.
type
.
context
.
kind
!=
'cuda'
:
raise
TypeError
(
"GpuCAReduceCuda doesn't work for non-cuda devices"
)
ret
=
super
(
GpuCAReduceCuda
,
self
)
.
make_node
(
x
)
ret
=
super
(
GpuCAReduceCuda
,
self
)
.
make_node
(
x
)
self
=
copy
.
copy
(
self
)
self
=
copy
.
copy
(
self
)
self
.
axis
=
ret
.
op
.
axis
self
.
axis
=
ret
.
op
.
axis
...
@@ -666,9 +647,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -666,9 +647,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"complex"
in
self
.
_acc_dtype
(
x
.
dtype
)):
"complex"
in
self
.
_acc_dtype
(
x
.
dtype
)):
raise
NotImplementedError
(
"We don't support complex in gpu reduction"
)
raise
NotImplementedError
(
"We don't support complex in gpu reduction"
)
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
ret
.
outputs
[
0
]
.
dtype
,
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
ret
.
outputs
[
0
]
.
dtype
,
ret
.
outputs
[
0
]
.
type
.
broadcastable
)()])
ret
.
outputs
[
0
]
.
type
.
broadcastable
,
context_name
=
x
.
type
.
context_name
)()])
def
perform
(
self
,
node
,
inp
,
out
):
def
get_context
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
perform
(
self
,
node
,
inp
,
out
,
ctx
):
raise
MethodNotDefined
(
""
)
raise
MethodNotDefined
(
""
)
def
supports_c_code
(
self
,
inputs
):
def
supports_c_code
(
self
,
inputs
):
...
@@ -698,7 +683,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -698,7 +683,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
inp
=
[
'fake_input_name_
%
d'
%
i
for
i
in
xrange
(
len
(
inputs
))]
inp
=
[
'fake_input_name_
%
d'
%
i
for
i
in
xrange
(
len
(
inputs
))]
out
=
[
'fake_output_name_
%
d'
%
i
for
i
in
xrange
(
len
(
node
.
outputs
))]
out
=
[
'fake_output_name_
%
d'
%
i
for
i
in
xrange
(
len
(
node
.
outputs
))]
sub
=
{
'fail'
:
'fake failure code'
}
sub
=
{
'fail'
:
'fake failure code'
,
'context'
:
'fake context'
}
try
:
try
:
self
.
c_code
(
node
,
name
,
inp
,
out
,
sub
)
self
.
c_code
(
node
,
name
,
inp
,
out
,
sub
)
...
@@ -733,7 +718,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -733,7 +718,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if (PyGpuArray_NDIM(
%(x)
s) !=
%(nd_in)
s)
if (PyGpuArray_NDIM(
%(x)
s) !=
%(nd_in)
s)
{
{
PyErr_Format(PyExc_TypeError,
PyErr_Format(PyExc_TypeError,
"required nd=
%(nd_in)
s, got nd=
%%
i
", PyGpuArray_NDIM(
%(x)
s));
"required nd=
%(nd_in)
s, got nd=
%%
u
", PyGpuArray_NDIM(
%(x)
s));
%(fail)
s;
%(fail)
s;
}
}
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
...
@@ -791,7 +776,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -791,7 +776,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Py_XDECREF(
%(z)
s);
Py_XDECREF(
%(z)
s);
%(z)
s = pygpu_empty(
%(nd_out)
s, new_dims,
%(z)
s = pygpu_empty(
%(nd_out)
s, new_dims,
%(out_typecode)
s, GA_C_ORDER,
%(out_typecode)
s, GA_C_ORDER,
pygpu_default_context()
, Py_None);
%(ctx)
s
, Py_None);
if (NULL ==
%(z)
s)
if (NULL ==
%(z)
s)
{
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
...
@@ -1338,8 +1323,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1338,8 +1323,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)
%(z)
s->ga.data,
(void *)
%(z)
s->ga.data,
(void *)&
%(z)
s->ga.offset};
(void *)&
%(z)
s->ga.offset};
if (verbose) printf("running kernel_reduce_ccontig_
%(name)
s"
if (verbose) printf("running kernel_reduce_ccontig_
%(name)
s"
" n_threads=
%%
l
u, size=
%%
lu, ndim=
%%
d
\\
n",
" n_threads=
%%
l
lu, size=
%%
llu, ndim=
%%
u
\\
n",
n_threads,numEls,
n_threads,
numEls,
PyGpuArray_NDIM(
%(x)
s));
PyGpuArray_NDIM(
%(x)
s));
size_t n_shared = sizeof(
%(acc_dtype)
s) * n_threads;
size_t n_shared = sizeof(
%(acc_dtype)
s) * n_threads;
int err = GpuKernel_call(&
%(k_var)
s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
int err = GpuKernel_call(&
%(k_var)
s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
...
@@ -1521,9 +1506,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1521,9 +1506,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(
%(x)
s)[1], (size_t) 4096), 1};
size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(
%(x)
s)[1], (size_t) 4096), 1};
if (verbose) {
if (verbose) {
fprintf(stderr,
fprintf(stderr,
"running kernel_reduce_10_
%(name)
s n_blocks=(
%%
i,
%%
i
)
\\
n",
"running kernel_reduce_10_
%(name)
s n_blocks=(
%%
llu,
%%
llu
)
\\
n",
n_blocks[0],
(unsigned long long)
n_blocks[0],
n_blocks[1]);
(unsigned long long)
n_blocks[1]);
}
}
assert(PyGpuArray_DIMS(
%(x)
s)[1] == PyGpuArray_DIMS(
%(z)
s)[0]);
assert(PyGpuArray_DIMS(
%(x)
s)[1] == PyGpuArray_DIMS(
%(z)
s)[0]);
size_t n_shared = sizeof(
%(acc_dtype)
s) * n_threads[0];
size_t n_shared = sizeof(
%(acc_dtype)
s) * n_threads[0];
...
@@ -1911,12 +1896,17 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1911,12 +1896,17 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
def
c_code_cache_version_apply
(
self
,
node
):
def
c_code_cache_version_apply
(
self
,
node
):
version
=
[
1
7
]
# the version corresponding to the c code in this Op
version
=
[
1
8
]
# the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
# now we insert versions for the ops on which we depend...
version
.
extend
(
self
.
scalar_op
.
c_code_cache_version
())
scalar_node
=
Apply
(
self
.
scalar_op
,
[
Scalar
(
dtype
=
input
.
type
.
dtype
)()
for
input
in
node
.
inputs
],
[
Scalar
(
dtype
=
output
.
type
.
dtype
)()
for
output
in
node
.
outputs
])
version
.
extend
(
self
.
scalar_op
.
c_code_cache_version_apply
(
scalar_node
))
for
i
in
node
.
inputs
+
node
.
outputs
:
for
i
in
node
.
inputs
+
node
.
outputs
:
version
.
extend
(
Scalar
(
dtype
=
i
.
type
.
dtype
)
.
c_code_cache_version
())
version
.
extend
(
Scalar
(
dtype
=
i
.
type
.
dtype
)
.
c_code_cache_version
())
version
.
extend
(
self
.
kernel_version
(
node
))
if
all
(
version
):
if
all
(
version
):
return
tuple
(
version
)
return
tuple
(
version
)
else
:
else
:
...
@@ -2644,7 +2634,6 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2644,7 +2634,6 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
Too slow for now as it only have a python interface.
Too slow for now as it only have a python interface.
"""
"""
def
__init__
(
self
,
scalar_op
,
axis
=
None
,
dtype
=
None
,
acc_dtype
=
None
):
def
__init__
(
self
,
scalar_op
,
axis
=
None
,
dtype
=
None
,
acc_dtype
=
None
):
if
not
hasattr
(
scalar_op
,
'identity'
):
if
not
hasattr
(
scalar_op
,
'identity'
):
raise
ValueError
(
"No identity on scalar op"
)
raise
ValueError
(
"No identity on scalar op"
)
...
@@ -2658,10 +2647,12 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2658,10 +2647,12 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
return
"GpuReduce{
%
s}
%
s"
%
(
self
.
scalar_op
,
ax
)
return
"GpuReduce{
%
s}
%
s"
%
(
self
.
scalar_op
,
ax
)
def
make_node
(
self
,
input
):
def
make_node
(
self
,
input
):
ctx_name
=
infer_context_name
(
input
)
res
=
CAReduceDtype
.
make_node
(
self
,
input
)
res
=
CAReduceDtype
.
make_node
(
self
,
input
)
input
=
as_gpuarray_variable
(
input
)
input
=
as_gpuarray_variable
(
input
,
ctx_name
)
otype
=
GpuArrayType
(
dtype
=
res
.
outputs
[
0
]
.
dtype
,
otype
=
GpuArrayType
(
dtype
=
res
.
outputs
[
0
]
.
dtype
,
broadcastable
=
res
.
outputs
[
0
]
.
broadcastable
)
broadcastable
=
res
.
outputs
[
0
]
.
broadcastable
,
context_name
=
ctx_name
)
if
res
.
op
.
axis
is
not
None
:
if
res
.
op
.
axis
is
not
None
:
redux
=
[]
redux
=
[]
...
@@ -2673,11 +2664,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2673,11 +2664,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
return
Apply
(
res
.
op
,
[
input
],
[
otype
()])
return
Apply
(
res
.
op
,
[
input
],
[
otype
()])
def
get_context
(
self
,
node
):
return
node
.
outputs
[
0
]
.
type
.
context
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
# cache the kernel object
# cache the kernel object
self
.
get_kernel_cache
(
node
)
self
.
get_kernel_cache
(
node
)
return
super
(
GpuCAReduceCPY
,
self
)
.
make_thunk
(
node
,
storage_map
,
return
super
(
GpuCAReduceCPY
,
self
)
.
make_thunk
(
compute_map
,
no_recycling
)
node
,
storage_map
,
compute_map
,
no_recycling
)
def
get_kernel_cache
(
self
,
node
):
def
get_kernel_cache
(
self
,
node
):
attr
=
'@cache_reduction_k'
attr
=
'@cache_reduction_k'
...
@@ -2776,33 +2770,33 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2776,33 +2770,33 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
j
+=
1
j
+=
1
code
+=
"""
code
+=
"""
if (need_out) {
if (need_out) {
%(output)
s = pygpu_empty(
%(nd_out)
s, out_dims,
%(out_type)
s, GA_C_ORDER,
pygpu_default_context()
, Py_None);
%(output)
s = pygpu_empty(
%(nd_out)
s, out_dims,
%(out_type)
s, GA_C_ORDER,
%(ctx)
s
, Py_None);
if (!
%(output)
s) {
if (!
%(output)
s) {
%(fail)
s
%(fail)
s
}
}
}
}
"""
%
dict
(
output
=
output
,
nd_out
=
nd_out
,
fail
=
sub
[
'fail'
],
"""
%
dict
(
output
=
output
,
nd_out
=
nd_out
,
fail
=
sub
[
'fail'
],
ctx
=
sub
[
'context'
],
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
else
:
else
:
code
+=
"""
code
+=
"""
if (
%(output)
s == NULL ||
%(output)
s->ga.nd != 0) {
if (
%(output)
s == NULL ||
%(output)
s->ga.nd != 0) {
Py_XDECREF(
%(output)
s);
Py_XDECREF(
%(output)
s);
%(output)
s = pygpu_empty(0, NULL,
%(out_type)
s, GA_C_ORDER,
%(output)
s = pygpu_empty(0, NULL,
%(out_type)
s, GA_C_ORDER,
pygpu_default_context()
, Py_None);
%(ctx)
s
, Py_None);
if (!
%(output)
s) {
if (!
%(output)
s) {
%(fail)
s
%(fail)
s
}
}
}
}
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
ctx
=
sub
[
'context'
],
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
if
acc_dtype
!=
node
.
outputs
[
0
]
.
type
.
dtype
:
if
acc_dtype
!=
node
.
outputs
[
0
]
.
type
.
dtype
:
code
+=
"""
code
+=
"""
tmp = pygpu_empty(
%(output)
s->ga.nd,
%(output)
s->ga.dimensions,
tmp = pygpu_empty(
%(output)
s->ga.nd,
%(output)
s->ga.dimensions,
%(acc_type)
s, GA_C_ORDER, pygpu_default_context(),
%(acc_type)
s, GA_C_ORDER,
%(ctx)
s, Py_None);
Py_None);
if (!tmp)
%(fail)
s
if (!tmp)
%(fail)
s
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
ctx
=
sub
[
'context'
],
acc_type
=
dtype_to_typecode
(
acc_dtype
))
acc_type
=
dtype_to_typecode
(
acc_dtype
))
else
:
else
:
code
+=
"""
code
+=
"""
...
@@ -2893,12 +2887,12 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2893,12 +2887,12 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
reduce_expr
=
"a * b"
reduce_expr
=
"a * b"
else
:
else
:
raise
NotImplementedError
()
raise
NotImplementedError
()
return
ReductionKernel
(
pygpu
.
get_default_context
()
,
odtype
,
return
ReductionKernel
(
node
.
inputs
[
0
]
.
type
.
context
,
odtype
,
self
.
scalar_op
.
identity
,
reduce_expr
,
redux
,
self
.
scalar_op
.
identity
,
reduce_expr
,
redux
,
arguments
=
[
make_argument
(
node
.
inputs
[
0
],
'a'
)],
arguments
=
[
make_argument
(
node
.
inputs
[
0
],
'a'
)],
init_nd
=
node
.
inputs
[
0
]
.
ndim
)
init_nd
=
node
.
inputs
[
0
]
.
ndim
)
def
perform
(
self
,
node
,
inp
,
out
):
def
perform
(
self
,
node
,
inp
,
out
,
ctx
):
input
,
=
inp
input
,
=
inp
output
,
=
out
output
,
=
out
...
@@ -2912,6 +2906,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2912,6 +2906,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
copy
=
False
,
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
copy
=
False
,
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
else
:
else
:
output
[
0
]
=
pygpu
.
gpuarray
.
array
(
input
,
copy
=
True
,
output
[
0
]
=
pygpu
.
gpuarray
.
array
(
input
,
copy
=
True
,
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
,
context
=
ctx
)
# To allow reloading old pickled files
# To allow reloading old pickled files
GpuCAReduce
=
GpuCAReduceCPY
GpuCAReduce
=
GpuCAReduceCPY
theano/sandbox/gpuarray/gemm16.c
浏览文件 @
4814cd99
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
/* Why do we need this? */
/* Why do we need this? */
size_t
dim
=
2048
*
32
;
size_t
dim
=
2048
*
32
;
rand_buf
=
pygpu_empty
(
1
,
&
dim
,
GA_UINT
,
GA_C_ORDER
,
pygpu_default_context
()
,
rand_buf
=
pygpu_empty
(
1
,
&
dim
,
GA_UINT
,
GA_C_ORDER
,
CONTEXT
,
Py_None
);
Py_None
);
if
(
rand_buf
==
NULL
)
{
if
(
rand_buf
==
NULL
)
{
FAIL
;
FAIL
;
...
@@ -14,7 +14,8 @@ PyGpuArrayObject *rand_buf;
...
@@ -14,7 +14,8 @@ PyGpuArrayObject *rand_buf;
int
gemm16
(
PyGpuArrayObject
*
C
,
float
alpha
,
int
gemm16
(
PyGpuArrayObject
*
C
,
float
alpha
,
PyGpuArrayObject
*
A
,
PyGpuArrayObject
*
B
,
PyGpuArrayObject
*
A
,
PyGpuArrayObject
*
B
,
float
beta
,
PyGpuArrayObject
**
out
)
{
float
beta
,
PyGpuArrayObject
**
out
,
PyGpuContextObject
*
c
)
{
PyGpuArrayObject
*
_A
=
NULL
;
PyGpuArrayObject
*
_A
=
NULL
;
PyGpuArrayObject
*
_B
=
NULL
;
PyGpuArrayObject
*
_B
=
NULL
;
GpuKernel
*
gk
;
GpuKernel
*
gk
;
...
...
theano/sandbox/gpuarray/neighbours.py
浏览文件 @
4814cd99
...
@@ -10,7 +10,8 @@ try:
...
@@ -10,7 +10,8 @@ try:
except
ImportError
:
except
ImportError
:
pass
pass
from
.basic_ops
import
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
infer_context_name
)
from
.opt
import
register_opt
as
register_gpu_opt
,
op_lifter
from
.opt
import
register_opt
as
register_gpu_opt
,
op_lifter
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
...
@@ -25,7 +26,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -25,7 +26,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
self
.
mode
=
mode
self
.
mode
=
mode
def
make_node
(
self
,
ten4
,
neib_shape
,
neib_step
):
def
make_node
(
self
,
ten4
,
neib_shape
,
neib_step
):
ten4
=
as_gpuarray_variable
(
ten4
)
ten4
=
as_gpuarray_variable
(
ten4
,
infer_context_name
(
ten4
)
)
neib_shape
=
T
.
as_tensor_variable
(
neib_shape
)
neib_shape
=
T
.
as_tensor_variable
(
neib_shape
)
neib_step
=
T
.
as_tensor_variable
(
neib_step
)
neib_step
=
T
.
as_tensor_variable
(
neib_step
)
...
@@ -37,7 +38,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -37,7 +38,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
return
Apply
(
self
,
[
ten4
,
neib_shape
,
neib_step
],
return
Apply
(
self
,
[
ten4
,
neib_shape
,
neib_step
],
[
GpuArrayType
(
broadcastable
=
(
False
,
False
),
[
GpuArrayType
(
broadcastable
=
(
False
,
False
),
dtype
=
ten4
.
type
.
dtype
)()])
dtype
=
ten4
.
type
.
dtype
,
context_name
=
ten4
.
type
.
context_name
)()])
def
get_context
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
11
,)
return
(
11
,)
...
@@ -56,7 +61,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -56,7 +61,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname
=
"k_multi_warp_less"
kname
=
"k_multi_warp_less"
k_var
=
"k_multi_warp_less_"
+
nodename
k_var
=
"k_multi_warp_less_"
+
nodename
code
=
"""
code
=
"""
//
a version that use less register but don't work in all case
.
//
a version that uses less registers but doesn't work in all cases
.
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
const int nb_batch,
const int nb_batch,
const int nb_stack,
const int nb_stack,
...
@@ -233,6 +238,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -233,6 +238,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
return
kernels
return
kernels
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
'cuda'
:
raise
NotImplementedError
(
"cuda only"
)
dtype_ten4
=
node
.
inputs
[
0
]
.
dtype
dtype_ten4
=
node
.
inputs
[
0
]
.
dtype
dtype_neib_shape
=
node
.
inputs
[
1
]
.
dtype
dtype_neib_shape
=
node
.
inputs
[
1
]
.
dtype
dtype_neib_step
=
node
.
inputs
[
2
]
.
dtype
dtype_neib_step
=
node
.
inputs
[
2
]
.
dtype
...
@@ -243,6 +250,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -243,6 +250,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
ten4
,
neib_shape
,
neib_step
=
inp
ten4
,
neib_shape
,
neib_step
=
inp
z
,
=
out
z
,
=
out
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'context'
]
mode
=
self
.
mode
mode
=
self
.
mode
err_check
=
"""
err_check
=
"""
if (err != GA_NO_ERROR) {
if (err != GA_NO_ERROR) {
...
@@ -369,8 +377,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -369,8 +377,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
dims[0] = z_dim0;
dims[0] = z_dim0;
dims[1] = z_dim1;
dims[1] = z_dim1;
%(z)
s = pygpu_empty(2, dims,
%(typecode_z)
s,
%(z)
s = pygpu_empty(2, dims,
%(typecode_z)
s,
GA_C_ORDER, pygpu_default_context(),
GA_C_ORDER,
%(ctx)
s, Py_None);
Py_None);
if (!
%(z)
s)
if (!
%(z)
s)
{
{
PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
...
@@ -453,7 +460,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -453,7 +460,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
@op_lifter
([
Images2Neibs
])
@op_lifter
([
Images2Neibs
])
def
use_gpu_images2neibs
(
node
):
def
use_gpu_images2neibs
(
node
,
context_name
):
if
node
.
op
.
mode
in
[
'valid'
,
'ignore_borders'
,
'wrap_centered'
]:
if
node
.
op
.
mode
in
[
'valid'
,
'ignore_borders'
,
'wrap_centered'
]:
return
GpuImages2Neibs
(
node
.
op
.
mode
)
return
GpuImages2Neibs
(
node
.
op
.
mode
)
...
...
theano/sandbox/gpuarray/nerv.py
浏览文件 @
4814cd99
...
@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp
...
@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp
from
theano.scalar
import
as_scalar
,
constant
from
theano.scalar
import
as_scalar
,
constant
from
.
import
opt
from
.
import
opt
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuAllocEmpty
)
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuAllocEmpty
,
infer_context_name
)
from
.type
import
gpu_context_type
from
.opt_util
import
alpha_merge
,
output_merge
from
.opt_util
import
alpha_merge
,
output_merge
from
.pycuda_helper
import
ensure_pycuda_context
try
:
try
:
from
nervanagpu.nervanagpu
import
GPUTensor
,
NervanaGPU
from
nervanagpu.nervanagpu
import
GPUTensor
,
NervanaGPU
...
@@ -43,6 +43,7 @@ def ensure_float(val, name):
...
@@ -43,6 +43,7 @@ def ensure_float(val, name):
class
Gemm16
(
COp
):
class
Gemm16
(
COp
):
__props__
=
(
'relu'
,
'inplace'
)
__props__
=
(
'relu'
,
'inplace'
)
_f16_ok
=
True
_f16_ok
=
True
context_type
=
gpu_context_type
KERN_NAMES
=
(
'nn_128x128'
,
'nn_128x64'
,
'nn_128x32'
,
KERN_NAMES
=
(
'nn_128x128'
,
'nn_128x64'
,
'nn_128x32'
,
'nn_vec_128x128'
,
'nn_vec_128x64'
,
'nn_vec_128x32'
,
'nn_vec_128x128'
,
'nn_vec_128x64'
,
'nn_vec_128x32'
,
'tn_128x128'
,
'tn_128x64'
,
'tn_128x32'
,
'tn_128x128'
,
'tn_128x64'
,
'tn_128x32'
,
...
@@ -61,10 +62,11 @@ class Gemm16(COp):
...
@@ -61,10 +62,11 @@ class Gemm16(COp):
def
make_node
(
self
,
C
,
alpha
,
A
,
B
,
beta
):
def
make_node
(
self
,
C
,
alpha
,
A
,
B
,
beta
):
if
GPUTensor
is
None
:
if
GPUTensor
is
None
:
raise
RuntimeError
(
"Can't use Gemm16: nervanagpu not found"
)
raise
RuntimeError
(
"Can't use Gemm16: nervanagpu not found"
)
ctx_name
=
infer_context_name
(
C
,
A
,
B
)
A
=
as_gpuarray_variable
(
A
)
A
=
as_gpuarray_variable
(
A
,
ctx_name
)
B
=
as_gpuarray_variable
(
B
)
B
=
as_gpuarray_variable
(
B
,
ctx_name
)
C
=
as_gpuarray_variable
(
C
)
C
=
as_gpuarray_variable
(
C
,
ctx_name
)
alpha
=
ensure_float
(
alpha
,
'alpha'
)
alpha
=
ensure_float
(
alpha
,
'alpha'
)
beta
=
ensure_float
(
beta
,
'beta'
)
beta
=
ensure_float
(
beta
,
'beta'
)
...
@@ -73,27 +75,8 @@ class Gemm16(COp):
...
@@ -73,27 +75,8 @@ class Gemm16(COp):
return
Apply
(
self
,
[
C
,
alpha
,
A
,
B
,
beta
],
[
C
.
type
()])
return
Apply
(
self
,
[
C
,
alpha
,
A
,
B
,
beta
],
[
C
.
type
()])
def
perform
(
self
,
node
,
inputs
,
outputs
):
def
get_context
(
self
,
node
):
ensure_pycuda_context
()
return
node
.
inputs
[
0
]
.
type
.
context
C
,
alpha
,
A
,
B
,
beta
=
inputs
# The nervana code does not support the case where both inputs
# are trans, so we need to copy one if them if that is the
# case. We copy the smaller one.
if
A
.
flags
.
f_contiguous
and
B
.
flags
.
f_contiguous
:
if
A
.
size
<
B
.
size
:
A
=
A
.
copy
()
else
:
B
=
B
.
copy
()
inplace
=
self
.
inplace
if
inplace
and
not
C
.
flags
.
c_contiguous
:
inplace
=
False
if
not
inplace
:
C
=
C
.
copy
()
At
=
to_gputensor
(
A
)
Bt
=
to_gputensor
(
B
)
Ct
=
to_gputensor
(
C
)
nerv
.
dot
(
At
,
Bt
,
Ct
,
alpha
=
alpha
,
beta
=
beta
,
relu
=
False
)
outputs
[
0
][
0
]
=
C
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'gpuarray/types.h'
,
'numpy_compat.h'
,
'gpuarray_helper.h'
,
return
[
'gpuarray/types.h'
,
'numpy_compat.h'
,
'gpuarray_helper.h'
,
...
@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
...
@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
codel
.
append
(
"memset(&k_{0}, 0, sizeof(GpuKernel));"
.
format
(
name
))
codel
.
append
(
"memset(&k_{0}, 0, sizeof(GpuKernel));"
.
format
(
name
))
codel
.
append
(
"const char *bcode;"
)
codel
.
append
(
"const char *bcode;"
)
codel
.
append
(
"size_t sz;"
)
codel
.
append
(
"size_t sz;"
)
codel
.
append
(
"PyGpuContextObject *c =
pygpu_default_context();"
)
codel
.
append
(
"PyGpuContextObject *c =
%
s;"
%
(
sub
[
'context'
],)
)
codel
.
append
(
"int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
codel
.
append
(
"int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
"GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
"GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
"GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};"
)
"GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};"
)
...
@@ -162,7 +145,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
...
@@ -162,7 +145,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
@opt.register_opt
()
@opt.register_opt
()
@opt.op_lifter
([
tensor
.
Dot
])
@opt.op_lifter
([
tensor
.
Dot
])
def
local_dot_to_gemm16
(
node
):
def
local_dot_to_gemm16
(
node
,
ctx_name
):
if
nerv
is
None
:
if
nerv
is
None
:
return
return
A
=
node
.
inputs
[
0
]
A
=
node
.
inputs
[
0
]
...
@@ -170,7 +153,7 @@ def local_dot_to_gemm16(node):
...
@@ -170,7 +153,7 @@ def local_dot_to_gemm16(node):
if
(
A
.
ndim
==
2
and
B
.
ndim
==
2
and
if
(
A
.
ndim
==
2
and
B
.
ndim
==
2
and
A
.
dtype
==
'float16'
and
B
.
dtype
==
'float16'
):
A
.
dtype
==
'float16'
and
B
.
dtype
==
'float16'
):
fgraph
=
node
.
inputs
[
0
]
.
fgraph
fgraph
=
node
.
inputs
[
0
]
.
fgraph
C
=
GpuAllocEmpty
(
dtype
=
'float16'
)(
C
=
GpuAllocEmpty
(
dtype
=
'float16'
,
context_name
=
ctx_name
)(
shape_i
(
A
,
0
,
fgraph
),
shape_i
(
B
,
1
,
fgraph
))
shape_i
(
A
,
0
,
fgraph
),
shape_i
(
B
,
1
,
fgraph
))
return
Gemm16
()(
C
,
1.0
,
A
,
B
,
0.0
)
return
Gemm16
()(
C
,
1.0
,
A
,
B
,
0.0
)
...
...
theano/sandbox/gpuarray/nnet.py
浏览文件 @
4814cd99
...
@@ -10,7 +10,8 @@ try:
...
@@ -10,7 +10,8 @@ try:
except
ImportError
:
except
ImportError
:
pass
pass
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
)
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
infer_context_name
)
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
from
.kernel_codegen
import
(
nvcc_kernel
,
from
.kernel_codegen
import
(
nvcc_kernel
,
inline_softmax
,
inline_softmax
,
...
@@ -23,23 +24,26 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
...
@@ -23,23 +24,26 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
"""
"""
nin
=
3
nin
=
3
nout
=
3
nout
=
3
__props__
=
()
__props__
=
()
_f16_ok
=
True
_f16_ok
=
True
def
make_node
(
self
,
x
,
b
,
y_idx
):
def
make_node
(
self
,
x
,
b
,
y_idx
):
# N.B. won't work when we don't cast y_idx to float anymore
ctx_name
=
infer_context_name
(
x
,
b
,
y_idx
)
x
=
as_gpuarray_variable
(
x
)
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
b
=
as_gpuarray_variable
(
b
)
b
=
as_gpuarray_variable
(
b
,
ctx_name
)
y_idx
=
as_gpuarray_variable
(
y_idx
)
y_idx
=
as_gpuarray_variable
(
y_idx
,
ctx_name
)
nll
=
GpuArrayType
(
x
.
type
.
dtype
,
nll
=
GpuArrayType
(
x
.
type
.
dtype
,
y_idx
.
type
.
broadcastable
)()
y_idx
.
type
.
broadcastable
,
context_name
=
ctx_name
)()
sm
=
x
.
type
()
sm
=
x
.
type
()
am
=
y_idx
.
type
()
am
=
y_idx
.
type
()
return
Apply
(
self
,
[
x
,
b
,
y_idx
],
[
nll
,
sm
,
am
])
return
Apply
(
self
,
[
x
,
b
,
y_idx
],
[
nll
,
sm
,
am
])
def
get_context
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
...
@@ -144,6 +148,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
...
@@ -144,6 +148,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
flags
=
flags
,
objvar
=
k_var
)]
flags
=
flags
,
objvar
=
k_var
)]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
'cuda'
:
raise
NotImplementedError
(
'cuda only'
)
typecode_x
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
0
]
.
dtype
)
typecode_x
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
0
]
.
dtype
)
typecode_b
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
1
]
.
dtype
)
typecode_b
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
1
]
.
dtype
)
typecode_y_idx
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
2
]
.
dtype
)
typecode_y_idx
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
2
]
.
dtype
)
...
@@ -163,6 +169,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
...
@@ -163,6 +169,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
dtype_am
=
node
.
outputs
[
2
]
.
dtype
dtype_am
=
node
.
outputs
[
2
]
.
dtype
classname
=
self
.
__class__
.
__name__
classname
=
self
.
__class__
.
__name__
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'context'
]
k_var
=
"k_xent_sm_1hot_bias_
%(nodename)
s"
%
locals
()
k_var
=
"k_xent_sm_1hot_bias_
%(nodename)
s"
%
locals
()
err_check
=
"""
err_check
=
"""
if (err != GA_NO_ERROR) {
if (err != GA_NO_ERROR) {
...
@@ -214,9 +221,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
...
@@ -214,9 +221,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{
{
Py_XDECREF(
%(nll)
s);
Py_XDECREF(
%(nll)
s);
%(nll)
s = pygpu_empty(1, PyGpuArray_DIMS(
%(y_idx)
s),
%(nll)
s = pygpu_empty(1, PyGpuArray_DIMS(
%(y_idx)
s),
%(typecode_x)
s,
%(typecode_x)
s, GA_C_ORDER,
%(ctx)
s,
GA_C_ORDER,
Py_None);
pygpu_default_context(), Py_None);
if (!
%(nll)
s) {
if (!
%(nll)
s) {
%(fail)
s
%(fail)
s
}
}
...
@@ -229,9 +235,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
...
@@ -229,9 +235,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{
{
Py_XDECREF(
%(sm)
s);
Py_XDECREF(
%(sm)
s);
%(sm)
s = pygpu_empty(2, PyGpuArray_DIMS(
%(x)
s),
%(sm)
s = pygpu_empty(2, PyGpuArray_DIMS(
%(x)
s),
%(typecode_b)
s,
%(typecode_b)
s, GA_C_ORDER,
GA_C_ORDER,
%(ctx)
s, Py_None);
pygpu_default_context(), Py_None);
if(!
%(sm)
s)
if(!
%(sm)
s)
{
{
PyErr_SetString(PyExc_MemoryError,
PyErr_SetString(PyExc_MemoryError,
...
@@ -246,9 +251,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
...
@@ -246,9 +251,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{
{
Py_XDECREF(
%(am)
s);
Py_XDECREF(
%(am)
s);
%(am)
s = pygpu_empty(1, PyGpuArray_DIMS(
%(y_idx)
s),
%(am)
s = pygpu_empty(1, PyGpuArray_DIMS(
%(y_idx)
s),
%(typecode_y_idx)
s,
%(typecode_y_idx)
s, GA_C_ORDER,
GA_C_ORDER,
%(ctx)
s, Py_None);
pygpu_default_context(), Py_None);
if(!
%(am)
s)
if(!
%(am)
s)
{
{
PyErr_SetString(PyExc_MemoryError,
PyErr_SetString(PyExc_MemoryError,
...
@@ -306,18 +310,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
...
@@ -306,18 +310,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
Gradient wrt x of the CrossentropySoftmax1Hot Op.
Gradient wrt x of the CrossentropySoftmax1Hot Op.
"""
"""
nin
=
3
nin
=
3
nout
=
1
nout
=
1
__props__
=
()
__props__
=
()
_f16_ok
=
True
_f16_ok
=
True
def
make_node
(
self
,
dnll
,
sm
,
y_idx
):
def
make_node
(
self
,
dnll
,
sm
,
y_idx
):
dnll
=
as_gpuarray_variable
(
dnll
)
ctx_name
=
infer_context_name
(
dnll
,
sm
,
y_idx
)
sm
=
as_gpuarray_variable
(
sm
)
dnll
=
as_gpuarray_variable
(
dnll
,
ctx_name
)
y_idx
=
as_gpuarray_variable
(
y_idx
)
sm
=
as_gpuarray_variable
(
sm
,
ctx_name
)
y_idx
=
as_gpuarray_variable
(
y_idx
,
ctx_name
)
return
Apply
(
self
,
[
dnll
,
sm
,
y_idx
],
[
sm
.
type
()])
return
Apply
(
self
,
[
dnll
,
sm
,
y_idx
],
[
sm
.
type
()])
def
get_context
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
11
,)
return
(
11
,)
...
@@ -325,6 +332,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
...
@@ -325,6 +332,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
'cuda'
:
raise
NotImplementedError
(
"cuda only"
)
typecode_dx
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
typecode_dx
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
itemsize_dnll
=
numpy
.
dtype
(
node
.
inputs
[
0
]
.
dtype
)
.
itemsize
itemsize_dnll
=
numpy
.
dtype
(
node
.
inputs
[
0
]
.
dtype
)
.
itemsize
itemsize_sm
=
numpy
.
dtype
(
node
.
inputs
[
1
]
.
dtype
)
.
itemsize
itemsize_sm
=
numpy
.
dtype
(
node
.
inputs
[
1
]
.
dtype
)
.
itemsize
...
@@ -338,6 +347,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
...
@@ -338,6 +347,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
dnll
,
sm
,
y_idx
=
inp
dnll
,
sm
,
y_idx
=
inp
dx
,
=
out
dx
,
=
out
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'context'
]
k_var
=
"kCrossEntropySoftmax1HotWithBiasDx_"
+
nodename
k_var
=
"kCrossEntropySoftmax1HotWithBiasDx_"
+
nodename
err_check
=
"""
err_check
=
"""
if (err != GA_NO_ERROR) {
if (err != GA_NO_ERROR) {
...
@@ -403,9 +413,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
...
@@ -403,9 +413,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
{
{
Py_XDECREF(
%(dx)
s);
Py_XDECREF(
%(dx)
s);
%(dx)
s = pygpu_empty(2, PyGpuArray_DIMS(
%(sm)
s),
%(dx)
s = pygpu_empty(2, PyGpuArray_DIMS(
%(sm)
s),
%(typecode_dx)
s,
%(typecode_dx)
s, GA_C_ORDER,
GA_C_ORDER,
%(ctx)
s, Py_None);
pygpu_default_context(), Py_None);
if (!
%(dx)
s) {
if (!
%(dx)
s) {
%(fail)
s
%(fail)
s
}
}
...
@@ -512,14 +521,16 @@ class GpuSoftmax(GpuKernelBase, Op):
...
@@ -512,14 +521,16 @@ class GpuSoftmax(GpuKernelBase, Op):
Implement Softmax on the gpu.
Implement Softmax on the gpu.
"""
"""
__props__
=
()
__props__
=
()
_f16_ok
=
True
_f16_ok
=
True
def
make_node
(
self
,
x
):
def
make_node
(
self
,
x
):
x
=
as_gpuarray_variable
(
x
)
x
=
as_gpuarray_variable
(
x
,
infer_context_name
(
x
)
)
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
def
get_context
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
infer_shape
(
self
,
node
,
shape
):
def
infer_shape
(
self
,
node
,
shape
):
return
shape
return
shape
...
@@ -530,6 +541,8 @@ class GpuSoftmax(GpuKernelBase, Op):
...
@@ -530,6 +541,8 @@ class GpuSoftmax(GpuKernelBase, Op):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
'cuda'
:
raise
NotImplementedError
(
"cuda only"
)
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
work_x
=
work_dtype
(
dtype_x
)
work_x
=
work_dtype
(
dtype_x
)
dtype_z
=
node
.
outputs
[
0
]
.
dtype
dtype_z
=
node
.
outputs
[
0
]
.
dtype
...
@@ -539,6 +552,7 @@ class GpuSoftmax(GpuKernelBase, Op):
...
@@ -539,6 +552,7 @@ class GpuSoftmax(GpuKernelBase, Op):
x
,
=
inp
x
,
=
inp
z
,
=
out
z
,
=
out
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'context'
]
err_check
=
"""
err_check
=
"""
if (err != GA_NO_ERROR) {
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
...
@@ -568,9 +582,8 @@ class GpuSoftmax(GpuKernelBase, Op):
...
@@ -568,9 +582,8 @@ class GpuSoftmax(GpuKernelBase, Op):
{
{
Py_XDECREF(
%(z)
s);
Py_XDECREF(
%(z)
s);
%(z)
s = pygpu_empty(2, PyGpuArray_DIMS(
%(x)
s),
%(z)
s = pygpu_empty(2, PyGpuArray_DIMS(
%(x)
s),
%(typecode)
s,
%(typecode)
s, GA_C_ORDER,
GA_C_ORDER,
%(ctx)
s, Py_None);
pygpu_default_context(), Py_None);
if (!
%(z)
s) {
if (!
%(z)
s) {
%(fail)
s
%(fail)
s
}
}
...
@@ -698,22 +711,25 @@ class GpuSoftmax(GpuKernelBase, Op):
...
@@ -698,22 +711,25 @@ class GpuSoftmax(GpuKernelBase, Op):
gpu_softmax
=
GpuSoftmax
()
gpu_softmax
=
GpuSoftmax
()
class
GpuSoftmaxWithBias
(
GpuKernelBase
,
Op
):
class
GpuSoftmaxWithBias
(
GpuKernelBase
,
Op
):
"""
"""
Implement SoftmaxWithBias on the gpu.
Implement SoftmaxWithBias on the gpu.
"""
"""
nin
=
2
nin
=
2
nout
=
1
nout
=
1
__props__
=
()
__props__
=
()
_f16_ok
=
True
_f16_ok
=
True
def
make_node
(
self
,
x
,
b
):
def
make_node
(
self
,
x
,
b
):
x
=
as_gpuarray_variable
(
x
)
ctx_name
=
infer_context_name
(
x
,
b
)
b
=
as_gpuarray_variable
(
b
)
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
b
=
as_gpuarray_variable
(
b
,
ctx_name
)
return
Apply
(
self
,
[
x
,
b
],
[
x
.
type
()])
return
Apply
(
self
,
[
x
,
b
],
[
x
.
type
()])
def
get_context
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
infer_shape
(
self
,
node
,
shape
):
def
infer_shape
(
self
,
node
,
shape
):
return
[
shape
[
0
]]
return
[
shape
[
0
]]
...
@@ -724,6 +740,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
...
@@ -724,6 +740,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
'cuda'
:
raise
NotImplementedError
(
'cuda only'
)
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_b
=
node
.
inputs
[
1
]
.
dtype
dtype_b
=
node
.
inputs
[
1
]
.
dtype
dtype_z
=
node
.
outputs
[
0
]
.
dtype
dtype_z
=
node
.
outputs
[
0
]
.
dtype
...
@@ -735,6 +753,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
...
@@ -735,6 +753,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
x
,
b
=
inp
x
,
b
=
inp
z
,
=
out
z
,
=
out
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'context'
]
err_check
=
"""
err_check
=
"""
if (err != GA_NO_ERROR) {
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
...
@@ -777,9 +796,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
...
@@ -777,9 +796,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
{
{
Py_XDECREF(
%(z)
s);
Py_XDECREF(
%(z)
s);
%(z)
s = pygpu_empty(2, PyGpuArray_DIMS(
%(x)
s),
%(z)
s = pygpu_empty(2, PyGpuArray_DIMS(
%(x)
s),
%(typecode)
s,
%(typecode)
s, GA_C_ORDER,
GA_C_ORDER,
%(ctx)
s, Py_None);
pygpu_default_context(), Py_None);
if (!
%(z)
s) {
if (!
%(z)
s) {
%(fail)
s
%(fail)
s
}
}
...
...
theano/sandbox/gpuarray/opt.py
浏览文件 @
4814cd99
...
@@ -3,11 +3,6 @@ import numpy
...
@@ -3,11 +3,6 @@ import numpy
import
logging
import
logging
from
six.moves
import
xrange
from
six.moves
import
xrange
try
:
import
pygpu
except
ImportError
:
pass
import
theano
import
theano
from
theano
import
tensor
,
scalar
,
gof
from
theano
import
tensor
,
scalar
,
gof
from
theano.compile
import
optdb
from
theano.compile
import
optdb
...
@@ -22,12 +17,12 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
...
@@ -22,12 +17,12 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from
theano.tensor.nnet.conv
import
ConvOp
from
theano.tensor.nnet.conv
import
ConvOp
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests.breakpoint
import
PdbBreakpoint
from
.type
import
GpuArrayType
,
GpuArrayConstant
from
.type
import
GpuArrayType
,
GpuArrayConstant
,
get_context
from
.basic_ops
import
(
as_gpuarray_variable
,
from
.basic_ops
import
(
as_gpuarray_variable
,
infer_context_name
,
host_from_gpu
,
gpu_from_host
,
host_from_gpu
,
GpuToGpu
,
HostFromGpu
,
GpuFromHost
,
HostFromGpu
,
GpuFromHost
,
GpuSplit
,
GpuContiguous
,
GpuSplit
,
GpuContiguous
,
gpu_alloc
,
GpuAlloc
,
GpuAllocEmpty
,
GpuReshape
,
GpuAlloc
,
GpuAllocEmpty
,
GpuReshape
,
GpuEye
,
gpu_join
,
GpuJoin
)
GpuEye
,
gpu_join
,
GpuJoin
)
from
.blas
import
(
gpu_dot22
,
GpuGemv
,
GpuGemm
,
GpuGer
,
from
.blas
import
(
gpu_dot22
,
GpuGemv
,
GpuGemm
,
GpuGer
,
gpugemm_no_inplace
)
gpugemm_no_inplace
)
...
@@ -79,9 +74,9 @@ gpu_optimizer.register('local_remove_all_assert',
...
@@ -79,9 +74,9 @@ gpu_optimizer.register('local_remove_all_assert',
'unsafe'
)
'unsafe'
)
def
safe_to_gpu
(
x
):
def
safe_to_gpu
(
x
,
ctx_name
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
return
gpu_from_host
(
x
)
return
GpuFromHost
(
ctx_name
)
(
x
)
else
:
else
:
return
x
return
x
...
@@ -102,28 +97,53 @@ def op_lifter(OP, cuda_only=False):
...
@@ -102,28 +97,53 @@ def op_lifter(OP, cuda_only=False):
"""
"""
def
f
(
maker
):
def
f
(
maker
):
def
local_opt
(
node
):
def
local_opt
(
node
):
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
if
cuda_only
and
not
dev
.
startswith
(
'cuda'
):
return
if
type
(
node
.
op
)
in
OP
:
if
type
(
node
.
op
)
in
OP
:
# Either one of our inputs is on the gpu or
# Either one of our inputs is on the gpu or
# all of our client are on the gpu
# all of our clients are on the gpu
if
(
any
([
i
.
owner
and
i
.
owner
.
op
==
host_from_gpu
replace
=
False
for
i
in
node
.
inputs
])
or
# TODO: Maybe set context_name with infer_context_name()?
all
([
c
!=
'output'
and
c
.
op
==
gpu_from_host
context_name
=
None
for
c
,
idx
in
node
.
outputs
[
0
]
.
clients
])):
# We replace if any input is a host_from_gpu
new_op
=
maker
(
node
)
for
i
in
node
.
inputs
:
# This is needed as sometimes new_op inherit from OP.
if
i
.
owner
and
i
.
owner
.
op
==
host_from_gpu
:
if
new_op
and
new_op
!=
node
.
op
:
context_name
=
i
.
owner
.
inputs
[
0
]
.
type
.
context_name
if
isinstance
(
new_op
,
theano
.
Op
):
replace
=
True
return
[
safe_to_cpu
(
o
)
for
o
in
break
new_op
(
*
node
.
inputs
,
return_list
=
True
)]
if
not
replace
:
elif
isinstance
(
new_op
,
(
tuple
,
list
)):
# We replace if *all* clients are on the GPU
return
[
safe_to_cpu
(
o
)
for
o
in
new_op
]
clients
=
[
c
for
o
in
node
.
outputs
for
c
in
o
.
clients
]
else
:
# suppose it is a variable on the GPU
replace
=
len
(
clients
)
!=
0
return
[
host_from_gpu
(
new_op
)]
for
c
,
idx
in
clients
:
if
(
c
==
'output'
or
not
isinstance
(
c
.
op
,
GpuFromHost
)):
replace
=
False
# TODO: check that the clients want the same context?
if
replace
:
# All clients are GpuFromHost and we have at least one
context_name
=
clients
[
0
][
0
]
.
op
.
context_name
# Check if we should replace
if
(
not
replace
or
(
cuda_only
and
get_context
(
context_name
)
.
kind
!=
'cuda'
)):
return
False
new_op
=
maker
(
node
,
context_name
)
# This is needed as sometimes new_op inherits from OP.
if
new_op
and
new_op
!=
node
.
op
:
if
isinstance
(
new_op
,
theano
.
Op
):
# tag the inputs with the context in case
# the context was derived from the outputs
def
tag
(
i
,
ctx
):
i
.
tag
.
context_name
=
ctx
return
i
inputs
=
[
tag
(
i
,
context_name
)
for
i
in
node
.
inputs
]
return
[
safe_to_cpu
(
o
)
for
o
in
new_op
(
*
inputs
,
return_list
=
True
)]
elif
isinstance
(
new_op
,
(
tuple
,
list
)):
return
[
safe_to_cpu
(
o
)
for
o
in
new_op
]
else
:
# suppose it is a variable on the GPU
return
[
host_from_gpu
(
new_op
)]
return
False
return
False
local_opt
.
__name__
=
maker
.
__name__
local_opt
.
__name__
=
maker
.
__name__
return
local_optimizer
(
OP
)(
local_opt
)
return
local_optimizer
(
OP
)(
local_opt
)
...
@@ -146,35 +166,81 @@ class InputToGpuOptimizer(Optimizer):
...
@@ -146,35 +166,81 @@ class InputToGpuOptimizer(Optimizer):
if
(
len
(
input
.
clients
)
==
1
and
if
(
len
(
input
.
clients
)
==
1
and
(
input
.
clients
[
0
][
0
]
==
'output'
or
(
input
.
clients
[
0
][
0
]
==
'output'
or
i
nput
.
clients
[
0
][
0
]
.
op
==
gpu_from_host
)):
i
sinstance
(
input
.
clients
[
0
][
0
]
.
op
,
GpuFromHost
)
)):
continue
continue
ctx_name
=
getattr
(
input
.
tag
,
'context_name'
,
None
)
try
:
try
:
new_input
=
host_from_gpu
(
gpu_from_host
(
input
))
new_input
=
host_from_gpu
(
GpuFromHost
(
ctx_name
)
(
input
))
fgraph
.
replace_validate
(
input
,
new_input
,
fgraph
.
replace_validate
(
input
,
new_input
,
"InputToGpuOptimizer"
)
"InputToGpuOptimizer"
)
except
TypeError
:
except
TypeError
:
# This could fail if the inputs are not TensorTypes
# This could fail if the inputs are not TensorTypes
pass
pass
except
ValueError
:
# If there is no context tag and no default context
# then it stays on the CPU
if
not
hasattr
(
input
.
tag
,
'context_name'
):
raise
pass
gpu_seqopt
.
register
(
'InputToGpuArrayOptimizer'
,
InputToGpuOptimizer
(),
gpu_seqopt
.
register
(
'InputToGpuArrayOptimizer'
,
InputToGpuOptimizer
(),
0
,
'fast_run'
,
'fast_compile'
,
'merge'
)
0
,
'fast_run'
,
'fast_compile'
,
'merge'
)
@local_optimizer
([
gpu_from_host
,
host_from_gpu
])
@local_optimizer
([
GpuFromHost
,
GpuToGpu
,
host_from_gpu
])
def
local_cut_gpu_host_gpu
(
node
):
def
local_cut_gpu_transfers
(
node
):
if
tensor
.
opt
.
opt
.
check_chain
(
node
,
gpu_from_host
,
host_from_gpu
):
# gpu[ab] -> host -> gpub
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
if
(
isinstance
(
node
.
op
,
GpuFromHost
)
and
if
tensor
.
opt
.
opt
.
check_chain
(
node
,
host_from_gpu
,
gpu_from_host
):
node
.
inputs
[
0
]
.
owner
and
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
node
.
inputs
[
0
]
.
owner
.
op
==
host_from_gpu
):
return
False
other
=
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]
gpu_cut_copies
.
register
(
'cut_gpua_host_transfers'
,
local_cut_gpu_host_gpu
,
if
node
.
op
.
context_name
==
other
.
type
.
context_name
:
return
[
other
]
else
:
return
[
GpuToGpu
(
node
.
op
.
context_name
)(
other
)]
# ? -> gpua -> host
elif
(
node
.
op
==
host_from_gpu
and
node
.
inputs
[
0
]
.
owner
):
n2
=
node
.
inputs
[
0
]
.
owner
# host ->
if
isinstance
(
n2
.
op
,
GpuFromHost
):
return
[
n2
.
inputs
[
0
]]
# gpub ->
if
isinstance
(
n2
.
op
,
GpuToGpu
):
return
[
host_from_gpu
(
n2
.
inputs
[
0
])]
# ? -> gpua -> gpub
elif
isinstance
(
node
.
op
,
GpuToGpu
):
# Transfer within same context
if
node
.
inputs
[
0
]
.
type
.
context_name
==
node
.
op
.
context_name
:
return
[
node
.
inputs
[
0
]]
if
node
.
inputs
[
0
]
.
owner
:
n2
=
node
.
inputs
[
0
]
.
owner
# host ->
if
isinstance
(
n2
.
op
,
GpuFromHost
):
return
[
GpuFromHost
(
node
.
op
.
context_name
)(
n2
.
inputs
[
0
])]
# gpuc ->
if
isinstance
(
n2
.
op
,
GpuToGpu
):
if
node
.
op
.
context_name
==
n2
.
inputs
[
0
]
.
type
.
context_name
:
return
[
n2
.
inputs
[
0
]]
else
:
return
[
node
.
op
(
n2
.
inputs
[
0
])]
gpu_cut_copies
.
register
(
'cut_gpua_host_transfers'
,
local_cut_gpu_transfers
,
'fast_compile'
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
'fast_compile'
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
gpu_cut_copies
.
register
(
'cut_gpua_constant_transfers'
,
gpu_cut_copies
.
register
(
'cut_gpua_constant_transfers'
,
tensor
.
opt
.
constant_folding
,
tensor
.
opt
.
constant_folding
,
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpua_host_gpua'
,
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpua_host_gpua'
,
local_cut_gpu_
host_gpu
,
local_cut_gpu_
transfers
,
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
...
@@ -187,6 +253,11 @@ def local_gpuaalloc2(node):
...
@@ -187,6 +253,11 @@ def local_gpuaalloc2(node):
Moves an alloc that is an input to join to the gpu.
Moves an alloc that is an input to join to the gpu.
"""
"""
try
:
get_context
(
None
)
except
ValueError
:
# If there is no default context then we do not perform the move here.
return
if
(
isinstance
(
node
.
op
,
tensor
.
Alloc
)
and
if
(
isinstance
(
node
.
op
,
tensor
.
Alloc
)
and
all
(
c
!=
'output'
and
all
(
c
!=
'output'
and
c
.
op
==
tensor
.
join
and
c
.
op
==
tensor
.
join
and
...
@@ -194,23 +265,13 @@ def local_gpuaalloc2(node):
...
@@ -194,23 +265,13 @@ def local_gpuaalloc2(node):
i
.
owner
.
op
in
[
host_from_gpu
,
tensor
.
alloc
]
i
.
owner
.
op
in
[
host_from_gpu
,
tensor
.
alloc
]
for
i
in
c
.
inputs
[
1
:])
for
i
in
c
.
inputs
[
1
:])
for
c
,
idx
in
node
.
outputs
[
0
]
.
clients
)):
for
c
,
idx
in
node
.
outputs
[
0
]
.
clients
)):
return
[
host_from_gpu
(
gpu_alloc
(
*
node
.
inputs
))]
return
[
host_from_gpu
(
GpuAlloc
(
None
)
(
*
node
.
inputs
))]
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Alloc
])
@op_lifter
([
tensor
.
Alloc
])
def
local_gpuaalloc
(
node
):
def
local_gpuaalloc
(
node
,
context_name
):
new_out
=
gpu_alloc
(
*
node
.
inputs
)
return
GpuAlloc
(
context_name
)(
*
node
.
inputs
)
# We need to hide new broadcastable dimensions because
# ReplaceValidate doesn't like when they change.
if
new_out
.
broadcastable
!=
node
.
outputs
[
0
]
.
broadcastable
:
# but if a dim is suddenly not broadcastable anymore then that's a bug
for
b_old
,
b_new
in
zip
(
node
.
outputs
[
0
]
.
broadcastable
,
new_out
.
broadcastable
):
assert
b_new
or
(
not
b_old
)
new_out
=
tensor
.
patternbroadcast
(
new_out
,
node
.
outputs
[
0
]
.
broadcastable
)
return
(
new_out
,)
@register_opt
()
@register_opt
()
...
@@ -221,8 +282,8 @@ def local_gpualloc_memset_0(node):
...
@@ -221,8 +282,8 @@ def local_gpualloc_memset_0(node):
if
(
isinstance
(
inp
,
GpuArrayConstant
)
and
if
(
isinstance
(
inp
,
GpuArrayConstant
)
and
inp
.
data
.
size
==
1
and
inp
.
data
.
size
==
1
and
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
new_o
ut
=
GpuAlloc
(
memset_0
=
True
)(
*
node
.
inputs
)
new_o
p
=
GpuAlloc
(
node
.
op
.
context_name
,
memset_0
=
True
)
return
[
new_o
ut
]
return
[
new_o
p
(
*
node
.
inputs
)
]
@register_opt
()
@register_opt
()
...
@@ -240,7 +301,7 @@ def local_gpu_contiguous_gpu_contiguous(node):
...
@@ -240,7 +301,7 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Reshape
])
@op_lifter
([
tensor
.
Reshape
])
def
local_gpureshape
(
node
):
def
local_gpureshape
(
node
,
context_name
):
op
=
node
.
op
op
=
node
.
op
name
=
op
.
name
name
=
op
.
name
if
name
:
if
name
:
...
@@ -251,14 +312,14 @@ def local_gpureshape(node):
...
@@ -251,14 +312,14 @@ def local_gpureshape(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Rebroadcast
])
@op_lifter
([
tensor
.
Rebroadcast
])
def
local_gpu_rebroadcast
(
node
):
def
local_gpu_rebroadcast
(
node
,
context_name
):
if
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
):
if
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
):
return
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
])
return
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
])
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Flatten
])
@op_lifter
([
tensor
.
Flatten
])
def
local_gpuflatten
(
node
):
def
local_gpuflatten
(
node
,
context_name
):
op
=
node
.
op
op
=
node
.
op
shp
=
[]
shp
=
[]
if
op
.
outdim
!=
1
:
if
op
.
outdim
!=
1
:
...
@@ -271,7 +332,7 @@ def local_gpuflatten(node):
...
@@ -271,7 +332,7 @@ def local_gpuflatten(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Elemwise
])
@op_lifter
([
tensor
.
Elemwise
])
def
local_gpu_elemwise
(
node
):
def
local_gpu_elemwise
(
node
,
context_name
):
op
=
node
.
op
op
=
node
.
op
scal_op
=
op
.
scalar_op
scal_op
=
op
.
scalar_op
name
=
op
.
name
name
=
op
.
name
...
@@ -344,28 +405,28 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
...
@@ -344,28 +405,28 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
DimShuffle
])
@op_lifter
([
tensor
.
DimShuffle
])
def
local_gpua_dimshuffle
(
node
):
def
local_gpua_dimshuffle
(
node
,
context_name
):
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
node
.
op
.
new_order
)
node
.
op
.
new_order
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
SpecifyShape
])
@op_lifter
([
tensor
.
SpecifyShape
])
def
local_gpua_specifyShape
(
node
):
def
local_gpua_specifyShape
(
node
,
context_name
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
inp
=
[
gpu_from_host
(
node
.
inputs
[
0
])]
+
node
.
inputs
[
1
:]
inp
=
[
GpuFromHost
(
context_name
)
(
node
.
inputs
[
0
])]
+
node
.
inputs
[
1
:]
return
tensor
.
specify_shape
(
*
inp
)
return
tensor
.
specify_shape
(
*
inp
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
compile
.
ops
.
Shape
])
@op_lifter
([
theano
.
compile
.
ops
.
Shape
])
def
local_gpua_shape
(
node
):
def
local_gpua_shape
(
node
,
context_name
):
# op_lifter will call this opt too frequently as the output is
# op_lifter will call this opt too frequently as the output is
# always on the CPU.
# always on the CPU.
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
return
[
gpu_from_host
(
node
.
inputs
[
0
])
.
shape
]
return
[
GpuFromHost
(
context_name
)
(
node
.
inputs
[
0
])
.
shape
]
def
gpu_print_wrapper
(
op
,
cnda
):
def
gpu_print_wrapper
(
op
,
cnda
):
...
@@ -374,7 +435,7 @@ def gpu_print_wrapper(op, cnda):
...
@@ -374,7 +435,7 @@ def gpu_print_wrapper(op, cnda):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
printing
.
Print
])
@op_lifter
([
tensor
.
printing
.
Print
])
def
local_gpu_print_op
(
node
):
def
local_gpu_print_op
(
node
,
context_name
):
x
,
=
node
.
inputs
x
,
=
node
.
inputs
gpu_x
,
=
x
.
owner
.
inputs
gpu_x
,
=
x
.
owner
.
inputs
new_op
=
node
.
op
.
__class__
(
global_fn
=
gpu_print_wrapper
)
new_op
=
node
.
op
.
__class__
(
global_fn
=
gpu_print_wrapper
)
...
@@ -404,9 +465,14 @@ def local_gpu_pdbbreakpoint_op(node):
...
@@ -404,9 +465,14 @@ def local_gpu_pdbbreakpoint_op(node):
input_is_from_gpu
=
(
inp
.
owner
and
input_is_from_gpu
=
(
inp
.
owner
and
isinstance
(
inp
.
owner
.
op
,
HostFromGpu
))
isinstance
(
inp
.
owner
.
op
,
HostFromGpu
))
output_goes_to_gpu
=
any
([
c
[
0
]
!=
"output"
and
output_goes_to_gpu
=
False
isinstance
(
c
[
0
]
.
op
,
GpuFromHost
)
for
c
in
out
.
clients
:
for
c
in
out
.
clients
])
if
c
==
'output'
:
continue
if
isinstance
(
c
[
0
]
.
op
,
GpuFromHost
):
output_goes_to_gpu
=
True
context_name
=
c
[
0
]
.
op
.
context_name
break
if
input_is_from_gpu
:
if
input_is_from_gpu
:
# The op should be applied on the GPU version of the input
# The op should be applied on the GPU version of the input
...
@@ -415,7 +481,7 @@ def local_gpu_pdbbreakpoint_op(node):
...
@@ -415,7 +481,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif
output_goes_to_gpu
:
elif
output_goes_to_gpu
:
# The input should be transfered to the gpu
# The input should be transfered to the gpu
new_inputs
.
append
(
gpu_from_host
(
inp
))
new_inputs
.
append
(
GpuFromHost
(
context_name
)
(
inp
))
input_transfered
.
append
(
True
)
input_transfered
.
append
(
True
)
else
:
else
:
...
@@ -447,7 +513,7 @@ def local_gpu_pdbbreakpoint_op(node):
...
@@ -447,7 +513,7 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Join
])
@op_lifter
([
tensor
.
Join
])
def
local_gpua_join
(
node
):
def
local_gpua_join
(
node
,
context_name
):
return
gpu_join
return
gpu_join
...
@@ -462,13 +528,13 @@ def local_gpuajoin_1(node):
...
@@ -462,13 +528,13 @@ def local_gpuajoin_1(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Split
])
@op_lifter
([
tensor
.
Split
])
def
local_gpua_split
(
node
):
def
local_gpua_split
(
node
,
context_name
):
return
GpuSplit
(
node
.
op
.
len_splits
)
return
GpuSplit
(
node
.
op
.
len_splits
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Subtensor
])
@op_lifter
([
tensor
.
Subtensor
])
def
local_gpua_subtensor
(
node
):
def
local_gpua_subtensor
(
node
,
context_name
):
x
=
node
.
inputs
[
0
]
x
=
node
.
inputs
[
0
]
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
gpu_x
=
x
.
owner
.
inputs
[
0
]
gpu_x
=
x
.
owner
.
inputs
[
0
]
...
@@ -482,14 +548,14 @@ def local_gpua_subtensor(node):
...
@@ -482,14 +548,14 @@ def local_gpua_subtensor(node):
for
n
,
_
in
node
.
outputs
[
0
]
.
clients
]):
for
n
,
_
in
node
.
outputs
[
0
]
.
clients
]):
return
return
else
:
else
:
return
[
host_from_gpu
(
gpu_
from_host
(
node
.
outputs
[
0
]))]
return
[
host_from_gpu
(
gpu_
x
.
owner
.
op
(
node
.
outputs
[
0
]))]
return
GpuSubtensor
(
node
.
op
.
idx_list
)
return
GpuSubtensor
(
node
.
op
.
idx_list
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
IncSubtensor
])
@op_lifter
([
tensor
.
IncSubtensor
])
def
local_gpua_incsubtensor
(
node
):
def
local_gpua_incsubtensor
(
node
,
context_name
):
return
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
return
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
node
.
op
.
set_instead_of_inc
,
node
.
op
.
set_instead_of_inc
,
node
.
op
.
destroyhandler_tolerate_aliased
)
node
.
op
.
destroyhandler_tolerate_aliased
)
...
@@ -497,16 +563,16 @@ def local_gpua_incsubtensor(node):
...
@@ -497,16 +563,16 @@ def local_gpua_incsubtensor(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AdvancedSubtensor1
])
@op_lifter
([
tensor
.
AdvancedSubtensor1
])
def
local_gpua_advanced_subtensor
(
node
):
def
local_gpua_advanced_subtensor
(
node
,
context_name
):
return
GpuAdvancedSubtensor1
()
return
GpuAdvancedSubtensor1
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
def
local_gpua_advanced_incsubtensor
(
node
):
def
local_gpua_advanced_incsubtensor
(
node
,
context_name
):
# This
optimization is disabled if cuda is not active
# This
is disabled on non-cuda contexts
if
pygpu
.
get_default_context
()
.
kind
!=
"cuda"
:
if
get_context
(
context_name
)
.
kind
!=
'cuda'
:
return
None
return
None
x
,
y
,
ilist
=
node
.
inputs
x
,
y
,
ilist
=
node
.
inputs
...
@@ -535,17 +601,19 @@ def local_gpua_advanced_incsubtensor(node):
...
@@ -535,17 +601,19 @@ def local_gpua_advanced_incsubtensor(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
def
local_gpua_careduce
(
node
):
def
local_gpua_careduce
(
node
,
context_name
):
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
scalar
.
Maximum
,
scalar
.
Minimum
)):
scalar
.
Maximum
,
scalar
.
Minimum
)):
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
ctx
=
get_context
(
context_name
)
if
dev
.
startswith
(
'opencl'
)
:
if
ctx
.
kind
==
'opencl'
:
op
=
GpuCAReduceCPY
op
=
GpuCAReduceCPY
if
node
.
op
.
scalar_op
not
in
[
scalar
.
add
,
scalar
.
mul
]:
if
node
.
op
.
scalar_op
not
in
[
scalar
.
add
,
scalar
.
mul
]:
# We don't support yet all reduction with cpy code.
# We don't support yet all reduction with cpy code.
return
return
el
se
:
el
if
ctx
.
kind
==
'cuda'
:
op
=
GpuCAReduceCuda
op
=
GpuCAReduceCuda
else
:
return
False
x
,
=
node
.
inputs
x
,
=
node
.
inputs
greduce
=
op
(
greduce
=
op
(
...
@@ -556,7 +624,7 @@ def local_gpua_careduce(node):
...
@@ -556,7 +624,7 @@ def local_gpua_careduce(node):
# We need to have the make node called, otherwise the mask can
# We need to have the make node called, otherwise the mask can
# be None
# be None
if
(
op
is
GpuCAReduceCPY
or
if
(
op
is
GpuCAReduceCPY
or
gvar
.
owner
.
op
.
supports_c_code
([
gpu_from_host
(
x
)])):
gvar
.
owner
.
op
.
supports_c_code
([
GpuFromHost
(
context_name
)
(
x
)])):
return
greduce
return
greduce
else
:
else
:
# Try to make a simpler pattern based on reshaping
# Try to make a simpler pattern based on reshaping
...
@@ -596,7 +664,7 @@ def local_gpua_careduce(node):
...
@@ -596,7 +664,7 @@ def local_gpua_careduce(node):
acc_dtype
=
getattr
(
node
.
op
,
'acc_dtype'
,
None
))
acc_dtype
=
getattr
(
node
.
op
,
'acc_dtype'
,
None
))
reshaped_x
=
x
.
reshape
(
tensor
.
stack
(
new_in_shp
))
reshaped_x
=
x
.
reshape
(
tensor
.
stack
(
new_in_shp
))
gpu_reshaped_x
=
gpu_from_host
(
reshaped_x
)
gpu_reshaped_x
=
GpuFromHost
(
context_name
)
(
reshaped_x
)
gvar
=
greduce
(
gpu_reshaped_x
)
gvar
=
greduce
(
gpu_reshaped_x
)
# We need to have the make node called, otherwise the mask can
# We need to have the make node called, otherwise the mask can
# be None
# be None
...
@@ -615,19 +683,19 @@ def local_gpua_careduce(node):
...
@@ -615,19 +683,19 @@ def local_gpua_careduce(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
def
local_gpua_gemv
(
node
):
def
local_gpua_gemv
(
node
,
context_name
):
return
GpuGemv
(
inplace
=
node
.
op
.
inplace
)
return
GpuGemv
(
inplace
=
node
.
op
.
inplace
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemm
])
@op_lifter
([
tensor
.
blas
.
Gemm
])
def
local_gpua_gemm
(
node
):
def
local_gpua_gemm
(
node
,
context_name
):
return
GpuGemm
(
inplace
=
node
.
op
.
inplace
)
return
GpuGemm
(
inplace
=
node
.
op
.
inplace
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
basic
.
Dot
])
@op_lifter
([
tensor
.
basic
.
Dot
])
def
local_gpua_hgemm
(
node
):
def
local_gpua_hgemm
(
node
,
context_name
):
from
theano.sandbox.cuda
import
nvcc_compiler
from
theano.sandbox.cuda
import
nvcc_compiler
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
_logger
.
warning
(
"Not performing dot of float16 on the GPU since "
_logger
.
warning
(
"Not performing dot of float16 on the GPU since "
...
@@ -639,8 +707,9 @@ def local_gpua_hgemm(node):
...
@@ -639,8 +707,9 @@ def local_gpua_hgemm(node):
if
(
A
.
ndim
==
2
and
B
.
ndim
==
2
and
if
(
A
.
ndim
==
2
and
B
.
ndim
==
2
and
A
.
dtype
==
'float16'
and
B
.
dtype
==
'float16'
):
A
.
dtype
==
'float16'
and
B
.
dtype
==
'float16'
):
fgraph
=
node
.
inputs
[
0
]
.
fgraph
fgraph
=
node
.
inputs
[
0
]
.
fgraph
C
=
GpuAllocEmpty
(
dtype
=
'float16'
)(
shape_i
(
A
,
0
,
fgraph
),
C
=
GpuAllocEmpty
(
dtype
=
'float16'
,
context_name
=
context_name
)(
shape_i
(
B
,
1
,
fgraph
))
shape_i
(
A
,
0
,
fgraph
),
shape_i
(
B
,
1
,
fgraph
))
return
gpugemm_no_inplace
(
C
,
1.0
,
A
,
B
,
0.0
)
return
gpugemm_no_inplace
(
C
,
1.0
,
A
,
B
,
0.0
)
...
@@ -658,49 +727,49 @@ def local_gpuagemm_output_merge(node, *inputs):
...
@@ -658,49 +727,49 @@ def local_gpuagemm_output_merge(node, *inputs):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
])
@op_lifter
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
])
def
local_gpua_ger
(
node
):
def
local_gpua_ger
(
node
,
context_name
):
return
GpuGer
(
destructiv
e
=
node
.
op
.
destructive
)
return
GpuGer
(
inplac
e
=
node
.
op
.
destructive
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Dot22
])
@op_lifter
([
tensor
.
blas
.
Dot22
])
def
local_gpua_dot22
(
node
):
def
local_gpua_dot22
(
node
,
context_name
):
return
gpu_dot22
return
gpu_dot22
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
basic
.
Eye
])
@op_lifter
([
tensor
.
basic
.
Eye
])
def
local_gpua_eye
(
node
):
def
local_gpua_eye
(
node
,
context_name
):
return
GpuEye
(
dtype
=
node
.
op
.
dtype
)
return
GpuEye
(
dtype
=
node
.
op
.
dtype
,
context_name
=
context_name
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
],
cuda_only
=
True
)
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
):
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
,
context_name
):
return
GpuCrossentropySoftmaxArgmax1HotWithBias
()
return
GpuCrossentropySoftmaxArgmax1HotWithBias
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
],
cuda_only
=
True
)
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
):
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
,
context_name
):
return
GpuCrossentropySoftmax1HotWithBiasDx
()
return
GpuCrossentropySoftmax1HotWithBiasDx
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
Softmax
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
Softmax
],
cuda_only
=
True
)
def
local_gpua_softmax
(
node
):
def
local_gpua_softmax
(
node
,
context_name
):
return
GpuSoftmax
()
return
GpuSoftmax
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
],
cuda_only
=
True
)
def
local_gpua_softmaxwithbias
(
node
):
def
local_gpua_softmaxwithbias
(
node
,
context_name
):
return
GpuSoftmaxWithBias
()
return
GpuSoftmaxWithBias
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
tensor
.
opt
.
Assert
])
@op_lifter
([
theano
.
tensor
.
opt
.
Assert
])
def
local_assert
(
node
):
def
local_assert
(
node
,
context_name
):
if
(
node
.
inputs
[
0
]
.
owner
and
if
(
node
.
inputs
[
0
]
.
owner
and
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
)):
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
)):
return
[
host_from_gpu
(
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
],
return
[
host_from_gpu
(
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
],
...
@@ -708,21 +777,14 @@ def local_assert(node):
...
@@ -708,21 +777,14 @@ def local_assert(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
gpu_from_host
,
ConvOp
])
@op_lifter
([
ConvOp
])
def
local_gpu_conv
(
node
):
def
local_gpu_conv
(
node
,
context_name
):
"""
gpu_from_host(conv) -> gpu_conv(gpu_from_host)
conv(host_from_gpu) -> host_from_gpu(gpu_conv)
"""
def
GpuConvOp_from_ConvOp
(
op
):
def
GpuConvOp_from_ConvOp
(
op
):
logical_img_hw
=
None
logical_img_hw
=
None
if
op
.
kshp_logical
is
not
None
and
op
.
kshp_logical
!=
op
.
kshp
:
if
op
.
kshp_logical
is
not
None
and
op
.
kshp_logical
!=
op
.
kshp
:
return
None
return
None
# print op.kshp, op.imshp[1:3]
# print op.kshp_logical, logical_img_hw
ret
=
GpuConv
(
border_mode
=
op
.
out_mode
,
ret
=
GpuConv
(
border_mode
=
op
.
out_mode
,
subsample
=
(
op
.
dx
,
op
.
dy
),
subsample
=
(
op
.
dx
,
op
.
dy
),
logical_img_hw
=
logical_img_hw
,
logical_img_hw
=
logical_img_hw
,
...
@@ -735,13 +797,10 @@ def local_gpu_conv(node):
...
@@ -735,13 +797,10 @@ def local_gpu_conv(node):
imshp
=
op
.
imshp
,
imshp
=
op
.
imshp
,
nkern
=
op
.
nkern
,
nkern
=
op
.
nkern
,
bsize
=
op
.
bsize
,
bsize
=
op
.
bsize
,
fft_opt
=
op
.
fft_opt
fft_opt
=
op
.
fft_opt
)
)
if
op
.
imshp_logical
is
not
None
:
if
op
.
imshp_logical
is
not
None
:
logical_img_hw
=
op
.
imshp_logical
[
1
:
3
]
logical_img_hw
=
op
.
imshp_logical
[
1
:
3
]
if
logical_img_hw
!=
op
.
imshp
[
1
:
3
]:
if
logical_img_hw
!=
op
.
imshp
[
1
:
3
]:
# this case is not implemented
# return None
rstride
=
int
(
numpy
.
ceil
(
op
.
imshp_logical
[
1
]
/
rstride
=
int
(
numpy
.
ceil
(
op
.
imshp_logical
[
1
]
/
float
(
op
.
imshp
[
1
])))
float
(
op
.
imshp
[
1
])))
cstride
=
int
(
numpy
.
ceil
(
op
.
imshp_logical
[
2
]
/
cstride
=
int
(
numpy
.
ceil
(
op
.
imshp_logical
[
2
]
/
...
@@ -752,7 +811,7 @@ def local_gpu_conv(node):
...
@@ -752,7 +811,7 @@ def local_gpu_conv(node):
img
.
shape
[
0
],
*
op
.
imshp_logical
)
img
.
shape
[
0
],
*
op
.
imshp_logical
)
img
=
tensor
.
set_subtensor
(
buf
[:,
:,
::
rstride
,
::
cstride
],
img
=
tensor
.
set_subtensor
(
buf
[:,
:,
::
rstride
,
::
cstride
],
img
)
img
)
img
=
gpu_from_host
(
img
)
img
=
GpuFromHost
(
context_name
)
(
img
)
return
ret
(
img
,
kern
)
return
ret
(
img
,
kern
)
return
make_graph
return
make_graph
...
@@ -779,15 +838,13 @@ def local_gpu_conv(node):
...
@@ -779,15 +838,13 @@ def local_gpu_conv(node):
gpu_conv
=
GpuConvOp_from_ConvOp
(
node
.
op
)
gpu_conv
=
GpuConvOp_from_ConvOp
(
node
.
op
)
if
gpu_conv
is
None
:
if
gpu_conv
is
None
:
return
return
out
=
gpu_conv
(
gpu_from_host
(
img
),
out
=
gpu_conv
(
GpuFromHost
(
context_name
)(
img
),
gpu_from_host
(
kern
))
GpuFromHost
(
context_name
)(
kern
))
# in some case the ConvOp broadcast the last 2 dimensions
assert
isinstance
(
out
.
type
,
GpuArrayType
)
# differently then the gpu ConvOp
# Make sure to keep the broadcastable pattern of the original
out
=
tensor
.
patternbroadcast
(
# convolution even if we might gain or lose some due to different
host_from_gpu
(
out
),
# information at the node level.
node
.
outputs
[
0
]
.
broadcastable
)
out
=
tensor
.
patternbroadcast
(
out
,
node
.
outputs
[
0
]
.
broadcastable
)
# op_lifter want the output on the GPU.
out
=
gpu_from_host
(
out
)
out
.
values_eq_approx
=
values_eq_approx
out
.
values_eq_approx
=
values_eq_approx
return
[
out
]
return
[
out
]
...
@@ -818,9 +875,10 @@ def local_gpu_elemwise_careduce(node):
...
@@ -818,9 +875,10 @@ def local_gpu_elemwise_careduce(node):
pre_scalar_op
=
scalar
.
basic
.
sqr
)(
inp
)]
pre_scalar_op
=
scalar
.
basic
.
sqr
)(
inp
)]
def
tensor_to_gpu
(
x
):
def
tensor_to_gpu
(
x
,
context_name
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
y
=
GpuArrayType
(
broadcastable
=
x
.
type
.
broadcastable
,
y
=
GpuArrayType
(
broadcastable
=
x
.
type
.
broadcastable
,
context_name
=
context_name
,
dtype
=
x
.
type
.
dtype
)()
dtype
=
x
.
type
.
dtype
)()
if
x
.
name
:
if
x
.
name
:
y
.
name
=
x
.
name
+
'[Gpua]'
y
.
name
=
x
.
name
+
'[Gpua]'
...
@@ -842,6 +900,7 @@ def gpu_safe_new(x, tag=''):
...
@@ -842,6 +900,7 @@ def gpu_safe_new(x, tag=''):
nw_name
=
x
.
name
+
tag
nw_name
=
x
.
name
+
tag
else
:
else
:
nw_name
=
None
nw_name
=
None
if
isinstance
(
x
,
theano
.
Constant
):
if
isinstance
(
x
,
theano
.
Constant
):
return
x
.
clone
()
return
x
.
clone
()
...
@@ -870,7 +929,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
...
@@ -870,7 +929,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt
(
'scan'
,
'fast_compile'
)
@register_opt
(
'scan'
,
'fast_compile'
)
@op_lifter
([
scan_op
.
Scan
])
@op_lifter
([
scan_op
.
Scan
])
def
local_scan_to_gpua
(
node
):
def
local_scan_to_gpua
(
node
,
context_name
):
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
if
info
.
get
(
'gpua'
,
False
):
if
info
.
get
(
'gpua'
,
False
):
return
return
...
@@ -882,20 +941,20 @@ def local_scan_to_gpua(node):
...
@@ -882,20 +941,20 @@ def local_scan_to_gpua(node):
node
.
op
.
n_mit_sot
+
node
.
op
.
n_mit_sot
+
node
.
op
.
n_sit_sot
+
node
.
op
.
n_sit_sot
+
node
.
op
.
n_shared_outs
)
node
.
op
.
n_shared_outs
)
nw_ins
+=
[
safe_to_gpu
(
x
)
for
x
in
node
.
inputs
[
1
:
e
]]
nw_ins
+=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
inputs
[
1
:
e
]]
b
=
e
b
=
e
e
=
e
+
node
.
op
.
n_nit_sot
e
=
e
+
node
.
op
.
n_nit_sot
nw_ins
+=
node
.
inputs
[
b
:
e
]
nw_ins
+=
node
.
inputs
[
b
:
e
]
nw_ins
+=
[
safe_to_gpu
(
x
)
for
x
in
node
.
inputs
[
e
:]]
nw_ins
+=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
inputs
[
e
:]]
scan_ins
=
[
tensor_to_gpu
(
x
)
for
x
in
node
.
op
.
inputs
]
scan_ins
=
[
tensor_to_gpu
(
x
,
context_name
)
for
x
in
node
.
op
.
inputs
]
# The inner output corresponding to the looping condition should not be
# The inner output corresponding to the looping condition should not be
# moved to the gpu
# moved to the gpu
if
node
.
op
.
info
[
'as_while'
]:
if
node
.
op
.
info
[
'as_while'
]:
scan_outs
=
[
safe_to_gpu
(
x
)
for
x
in
node
.
op
.
outputs
[:
-
1
]]
scan_outs
=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
op
.
outputs
[:
-
1
]]
scan_outs
+=
[
node
.
op
.
outputs
[
-
1
]]
scan_outs
+=
[
node
.
op
.
outputs
[
-
1
]]
else
:
else
:
scan_outs
=
[
safe_to_gpu
(
x
)
for
x
in
node
.
op
.
outputs
]
scan_outs
=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
op
.
outputs
]
scan_outs
=
scan_utils
.
clone
(
scan_outs
=
scan_utils
.
clone
(
scan_outs
,
scan_outs
,
replace
=
list
(
zip
(
node
.
op
.
inputs
,
replace
=
list
(
zip
(
node
.
op
.
inputs
,
...
@@ -909,12 +968,25 @@ def local_scan_to_gpua(node):
...
@@ -909,12 +968,25 @@ def local_scan_to_gpua(node):
_cmodule_key
=
gof
.
CLinker
()
.
cmodule_key_
(
local_fgraph
,
[])
_cmodule_key
=
gof
.
CLinker
()
.
cmodule_key_
(
local_fgraph
,
[])
info
[
'gpu_hash'
]
=
hash
(
_cmodule_key
)
info
[
'gpu_hash'
]
=
hash
(
_cmodule_key
)
def
typebuild
(
dtype
,
broadcastable
,
context_name
=
context_name
):
return
GpuArrayType
(
dtype
=
dtype
,
broadcastable
=
broadcastable
,
context_name
=
context_name
)
nw_op
=
scan_op
.
Scan
(
scan_ins
,
scan_outs
,
info
,
nw_op
=
scan_op
.
Scan
(
scan_ins
,
scan_outs
,
info
,
typeConstructor
=
GpuArrayType
)
.
make_node
(
*
nw_ins
)
typeConstructor
=
typebuild
)
.
make_node
(
*
nw_ins
)
return
nw_op
.
outputs
return
nw_op
.
outputs
def
_scan_type_infer
(
node
):
context_name
=
infer_context_name
(
*
node
.
inputs
)
def
typebuild
(
dtype
,
broadcastable
,
context_name
=
context_name
):
return
GpuArrayType
(
dtype
=
dtype
,
broadcastable
=
broadcastable
,
context_name
=
context_name
)
return
typebuild
optdb
.
register
(
'gpua_scanOp_make_inplace'
,
optdb
.
register
(
'gpua_scanOp_make_inplace'
,
scan_opt
.
ScanInplaceOptimizer
(
type
Constructor
=
GpuArrayType
,
scan_opt
.
ScanInplaceOptimizer
(
type
Infer
=
_scan_type_infer
,
gpua_flag
=
True
),
gpua_flag
=
True
),
75
,
75
,
'gpuarray'
,
'gpuarray'
,
...
...
theano/sandbox/gpuarray/opt_util.py
浏览文件 @
4814cd99
...
@@ -294,7 +294,7 @@ def inplace_allocempty(op, idx):
...
@@ -294,7 +294,7 @@ def inplace_allocempty(op, idx):
function can be as simple as:
function can be as simple as:
def maker(node, inputs):
def maker(node, inputs):
return
node.op.__class__(inplace=True)(*inputs)
return
[node.op.__class__(inplace=True)(*inputs)]
Parameters
Parameters
----------
----------
...
@@ -320,7 +320,8 @@ def inplace_allocempty(op, idx):
...
@@ -320,7 +320,8 @@ def inplace_allocempty(op, idx):
if
(
alloc
.
owner
and
if
(
alloc
.
owner
and
isinstance
(
alloc
.
owner
.
op
,
GpuAllocEmpty
)
and
isinstance
(
alloc
.
owner
.
op
,
GpuAllocEmpty
)
and
len
(
alloc
.
clients
)
>
1
):
len
(
alloc
.
clients
)
>
1
):
alloc_op
=
GpuAllocEmpty
(
alloc
.
owner
.
op
.
dtype
)
alloc_op
=
GpuAllocEmpty
(
alloc
.
owner
.
op
.
dtype
,
alloc
.
owner
.
op
.
context_name
)
inputs
[
idx
]
=
alloc_op
(
*
alloc
.
owner
.
inputs
)
inputs
[
idx
]
=
alloc_op
(
*
alloc
.
owner
.
inputs
)
return
maker
(
node
,
inputs
)
return
maker
(
node
,
inputs
)
return
opt
return
opt
...
...
theano/sandbox/gpuarray/pycuda_helper.py
deleted
100644 → 0
浏览文件 @
6ca7b2b6
# Optional PyCUDA support: ``Context`` is PyCUDA's context class, or None when
# PyCUDA cannot be imported or its ``Context`` has no ``attach`` method.
try:
    from pycuda.driver import Context
except ImportError:
    Context = None
else:
    if not hasattr(Context, 'attach'):
        # Treated like a missing installation ("too old").
        Context = None

# Module-level cache filled by ensure_pycuda_context().
pycuda_context = None
pycuda_initialized = False


def ensure_pycuda_context():
    """Return the cached PyCUDA context, creating it on first use.

    The first successful call obtains a context through ``Context.attach()``,
    registers its ``detach`` method with ``atexit`` and stores it in the
    module-level ``pycuda_context``.  Later calls return the stored object.

    Raises
    ------
    RuntimeError
        If no usable PyCUDA ``Context`` class is available.
    """
    global pycuda_context, pycuda_initialized
    if pycuda_initialized:
        return pycuda_context
    if Context is None:
        raise RuntimeError("PyCUDA not found or too old.")
    attached = Context.attach()
    pycuda_context = attached
    import atexit
    # Release the context when the interpreter exits.
    atexit.register(attached.detach)
    pycuda_initialized = True
    return pycuda_context
theano/sandbox/gpuarray/subtensor.py
浏览文件 @
4814cd99
from
__future__
import
print_function
from
__future__
import
print_function
import
copy
import
os
import
os
import
copy
import
numpy
import
numpy
import
theano
import
theano
from
theano
import
tensor
,
gof
,
config
from
theano
import
tensor
,
gof
from
theano.gof.utils
import
MethodNotDefined
from
six.moves
import
StringIO
from
six.moves
import
StringIO
from
theano.tensor.subtensor
import
IncSubtensor
,
Subtensor
,
get_idx_list
from
theano.tensor.subtensor
import
IncSubtensor
,
Subtensor
,
get_idx_list
import
theano.tensor.inplace
import
theano.tensor.inplace
...
@@ -19,7 +18,8 @@ except ImportError:
...
@@ -19,7 +18,8 @@ except ImportError:
pass
pass
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
from
.basic_ops
import
(
as_gpuarray_variable
,
HideC
,
GpuKernelBase
,
Kernel
)
from
.basic_ops
import
(
as_gpuarray_variable
,
HideC
,
GpuKernelBase
,
Kernel
,
infer_context_name
)
from
.elemwise
import
GpuElemwise
from
.elemwise
import
GpuElemwise
...
@@ -27,10 +27,12 @@ class GpuSubtensor(HideC, Subtensor):
...
@@ -27,10 +27,12 @@ class GpuSubtensor(HideC, Subtensor):
_f16_ok
=
True
_f16_ok
=
True
def
make_node
(
self
,
x
,
*
inputs
):
def
make_node
(
self
,
x
,
*
inputs
):
ctx_name
=
infer_context_name
(
x
)
rval
=
tensor
.
Subtensor
.
make_node
(
self
,
x
,
*
inputs
)
rval
=
tensor
.
Subtensor
.
make_node
(
self
,
x
,
*
inputs
)
otype
=
GpuArrayType
(
dtype
=
rval
.
outputs
[
0
]
.
type
.
dtype
,
otype
=
GpuArrayType
(
dtype
=
rval
.
outputs
[
0
]
.
type
.
dtype
,
broadcastable
=
rval
.
outputs
[
0
]
.
type
.
broadcastable
)
broadcastable
=
rval
.
outputs
[
0
]
.
type
.
broadcastable
,
x
=
as_gpuarray_variable
(
x
)
context_name
=
ctx_name
)
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
return
gof
.
Apply
(
self
,
[
x
]
+
rval
.
inputs
[
1
:],
[
otype
()])
return
gof
.
Apply
(
self
,
[
x
]
+
rval
.
inputs
[
1
:],
[
otype
()])
def
perform
(
self
,
node
,
inputs
,
out_
):
def
perform
(
self
,
node
,
inputs
,
out_
):
...
@@ -191,14 +193,18 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
...
@@ -191,14 +193,18 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
return
self
.
iadd_node
.
op
.
gpu_kernels
(
self
.
iadd_node
,
subname
)
return
self
.
iadd_node
.
op
.
gpu_kernels
(
self
.
iadd_node
,
subname
)
def
make_node
(
self
,
x
,
y
,
*
inputs
):
def
make_node
(
self
,
x
,
y
,
*
inputs
):
x
=
as_gpuarray_variable
(
x
)
ctx_name
=
infer_context_name
(
x
,
y
)
y
=
as_gpuarray_variable
(
y
)
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
y
=
as_gpuarray_variable
(
y
,
ctx_name
)
rval
=
tensor
.
IncSubtensor
.
make_node
(
self
,
x
,
y
,
*
inputs
)
rval
=
tensor
.
IncSubtensor
.
make_node
(
self
,
x
,
y
,
*
inputs
)
op
=
copy
.
copy
(
self
)
op
=
copy
.
copy
(
self
)
ret
=
gof
.
Apply
(
op
,
[
x
,
y
]
+
rval
.
inputs
[
2
:],
[
x
.
type
()])
ret
=
gof
.
Apply
(
op
,
[
x
,
y
]
+
rval
.
inputs
[
2
:],
[
x
.
type
()])
op
.
create_iadd_node
(
ret
)
op
.
create_iadd_node
(
ret
)
return
ret
return
ret
def get_context(self, node):
    """Return the ``context`` attribute of the type of ``node``'s first output."""
    first_output = node.outputs[0]
    return first_output.type.context
def
create_iadd_node
(
self
,
node
):
def
create_iadd_node
(
self
,
node
):
# We store a iadd_node in the op that contain the info needed
# We store a iadd_node in the op that contain the info needed
# for the inplace add.
# for the inplace add.
...
@@ -210,7 +216,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
...
@@ -210,7 +216,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
iadd_node
=
gop
(
xview
,
y
)
.
owner
iadd_node
=
gop
(
xview
,
y
)
.
owner
self
.
iadd_node
=
iadd_node
self
.
iadd_node
=
iadd_node
def
perform
(
self
,
node
,
inputs
,
out_
):
def
perform
(
self
,
node
,
inputs
,
out_
,
ctx
):
out
,
=
out_
out
,
=
out_
x
,
y
=
inputs
[:
2
]
x
,
y
=
inputs
[:
2
]
indices
=
list
(
reversed
(
inputs
[
2
:]))
indices
=
list
(
reversed
(
inputs
[
2
:]))
...
@@ -321,7 +327,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
...
@@ -321,7 +327,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
%(view_ndim)
s,
%(view_ndim)
s,
dims,
dims,
xview_strides,
xview_strides,
pygpu_default_context()
,
%(x)
s->context
,
1,
1,
(PyObject *)
%(x)
s,
(PyObject *)
%(x)
s,
(PyObject *)&PyGpuArrayType);
(PyObject *)&PyGpuArrayType);
...
@@ -355,10 +361,10 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
...
@@ -355,10 +361,10 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
"""
"""
return
"""GpuArray_setarray(&
%(view)
s->ga, &
%(source)
s->ga)"""
%
locals
()
return
"""GpuArray_setarray(&
%(view)
s->ga, &
%(source)
s->ga)"""
%
locals
()
def
c_support_code_
apply
(
self
,
node
,
nodename
):
def
c_support_code_
struct
(
self
,
node
,
nodename
):
gop
=
self
.
iadd_node
.
op
gop
=
self
.
iadd_node
.
op
sub_name
=
nodename
+
"_add_to_zview"
sub_name
=
nodename
+
"_add_to_zview"
ret
=
gop
.
c_support_code_
apply
(
self
.
iadd_node
,
sub_name
)
ret
=
gop
.
c_support_code_
struct
(
self
.
iadd_node
,
sub_name
)
ret
+=
"""
ret
+=
"""
PyGpuArrayObject* inc_sub_iadd_
%(nodename)
s(PyGpuArrayObject* dst,
PyGpuArrayObject* inc_sub_iadd_
%(nodename)
s(PyGpuArrayObject* dst,
PyGpuArrayObject* src){
PyGpuArrayObject* src){
...
@@ -366,10 +372,11 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
...
@@ -366,10 +372,11 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
"""
%
locals
()
"""
%
locals
()
inputs
=
[
"dst"
,
"src"
]
inputs
=
[
"dst"
,
"src"
]
outputs
=
[
"ret"
]
outputs
=
[
"ret"
]
sub
=
{
"fail"
:
"return NULL;"
}
sub
=
{
"fail"
:
"return NULL;"
,
"context"
:
"dst->context"
}
ret
+=
gop
.
c_code
(
self
.
iadd_node
,
sub_name
,
inputs
,
outputs
,
sub
)
ret
+=
gop
.
c_code
(
self
.
iadd_node
,
sub_name
,
inputs
,
outputs
,
sub
)
ret
+=
"""
ret
+=
"""
return dst;
return ret;
}
}
"""
"""
return
ret
return
ret
...
@@ -399,7 +406,8 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
...
@@ -399,7 +406,8 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
class
GpuAdvancedSubtensor1
(
HideC
,
tensor
.
AdvancedSubtensor1
):
class
GpuAdvancedSubtensor1
(
HideC
,
tensor
.
AdvancedSubtensor1
):
def
make_node
(
self
,
x
,
ilist
):
def
make_node
(
self
,
x
,
ilist
):
x_
=
as_gpuarray_variable
(
x
)
ctx_name
=
infer_context_name
(
x
,
ilist
)
x_
=
as_gpuarray_variable
(
x
,
ctx_name
)
ilist__
=
tensor
.
as_tensor_variable
(
ilist
)
ilist__
=
tensor
.
as_tensor_variable
(
ilist
)
if
ilist__
.
type
.
dtype
[:
3
]
not
in
(
'int'
,
'uin'
):
if
ilist__
.
type
.
dtype
[:
3
]
not
in
(
'int'
,
'uin'
):
...
@@ -407,7 +415,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
...
@@ -407,7 +415,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
if
ilist__
.
type
.
dtype
!=
'int64'
:
if
ilist__
.
type
.
dtype
!=
'int64'
:
ilist__
=
tensor
.
cast
(
ilist__
,
'int64'
)
ilist__
=
tensor
.
cast
(
ilist__
,
'int64'
)
ilist_
=
as_gpuarray_variable
(
ilist__
)
ilist_
=
as_gpuarray_variable
(
ilist__
,
ctx_name
)
if
ilist_
.
type
.
dtype
!=
'int64'
:
if
ilist_
.
type
.
dtype
!=
'int64'
:
raise
TypeError
(
'index must be int64'
)
raise
TypeError
(
'index must be int64'
)
...
@@ -419,6 +427,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
...
@@ -419,6 +427,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
bcast
=
ilist_
.
broadcastable
+
x_
.
broadcastable
[
1
:]
bcast
=
ilist_
.
broadcastable
+
x_
.
broadcastable
[
1
:]
return
gof
.
Apply
(
self
,
[
x_
,
ilist_
],
return
gof
.
Apply
(
self
,
[
x_
,
ilist_
],
[
GpuArrayType
(
dtype
=
x
.
dtype
,
[
GpuArrayType
(
dtype
=
x
.
dtype
,
context_name
=
ctx_name
,
broadcastable
=
bcast
)()])
broadcastable
=
bcast
)()])
def
perform
(
self
,
node
,
inp
,
out_
):
def
perform
(
self
,
node
,
inp
,
out_
):
...
@@ -475,8 +484,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
...
@@ -475,8 +484,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
"""
"""
def
make_node
(
self
,
x
,
y
,
ilist
):
def
make_node
(
self
,
x
,
y
,
ilist
):
x_
=
as_gpuarray_variable
(
x
)
ctx_name
=
infer_context_name
(
x
,
y
)
y_
=
as_gpuarray_variable
(
y
)
x_
=
as_gpuarray_variable
(
x
,
ctx_name
)
y_
=
as_gpuarray_variable
(
y
,
ctx_name
)
ilist_
=
tensor
.
as_tensor_variable
(
ilist
)
ilist_
=
tensor
.
as_tensor_variable
(
ilist
)
assert
x_
.
type
.
dtype
==
y_
.
type
.
dtype
assert
x_
.
type
.
dtype
==
y_
.
type
.
dtype
...
@@ -567,16 +577,16 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
...
@@ -567,16 +577,16 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
only avail on compute capability 2.0 and more recent.
only avail on compute capability 2.0 and more recent.
"""
"""
_f16_ok
=
True
_f16_ok
=
True
def
make_node
(
self
,
x
,
y
,
ilist
):
def
make_node
(
self
,
x
,
y
,
ilist
):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
the index are of type long.
the index are of type long.
"""
"""
x_
=
as_gpuarray_variable
(
x
)
ctx_name
=
infer_context_name
(
x
,
y
,
ilist
)
y_
=
as_gpuarray_variable
(
y
)
x_
=
as_gpuarray_variable
(
x
,
ctx_name
)
ilist_
=
as_gpuarray_variable
(
ilist
)
y_
=
as_gpuarray_variable
(
y
,
ctx_name
)
ilist_
=
as_gpuarray_variable
(
ilist
,
ctx_name
)
assert
x_
.
type
.
dtype
==
y_
.
type
.
dtype
assert
x_
.
type
.
dtype
==
y_
.
type
.
dtype
assert
x_
.
type
.
ndim
>=
y_
.
type
.
ndim
assert
x_
.
type
.
ndim
>=
y_
.
type
.
ndim
...
@@ -599,32 +609,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
...
@@ -599,32 +609,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
return
gof
.
Apply
(
self
,
[
x_
,
y_
,
ilist_
],
[
x_
.
type
()])
return
gof
.
Apply
(
self
,
[
x_
,
y_
,
ilist_
],
[
x_
.
type
()])
def get_context(self, node):
    """Return the ``context`` attribute of the type of ``node``'s first output."""
    output_type = node.outputs[0].type
    return output_type.context
def perform(self, node, inp, out, ctx):
    """Run the parent class's ``perform``; ``ctx`` is accepted but not forwarded."""
    parent = super(GpuAdvancedIncSubtensor1_dev20, self)
    return parent.perform(node, inp, out)
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
6
,)
return
(
6
,)
def
c_headers
(
self
):
def
c_headers
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
return
[
'<numpy_compat.h>'
,
'<gpuarray_helper.h>'
,
raise
MethodNotDefined
(
'cuda only'
)
return
[
'cuda.h'
,
'<numpy_compat.h>'
,
'<gpuarray_helper.h>'
,
'<gpuarray/types.h>'
]
'<gpuarray/types.h>'
]
def
c_header_dirs
(
self
):
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
return
[
os
.
path
.
dirname
(
__file__
)]
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
res
=
[
os
.
path
.
dirname
(
__file__
)]
if
cuda_root
:
res
.
append
(
os
.
path
.
join
(
cuda_root
,
'include'
))
return
res
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
(
)
ctx
=
self
.
get_context
(
node
)
device_properties
=
theano
.
sandbox
.
cuda
.
device_properties
if
ctx
.
kind
!=
'cuda'
:
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
raise
NotImplementedError
(
"cuda only"
)
if
(
(
self
.
set_instead_of_inc
)
or
if
(
self
.
set_instead_of_inc
or
(
node
.
inputs
[
0
]
.
ndim
!=
node
.
inputs
[
1
]
.
ndim
)
or
node
.
inputs
[
0
]
.
ndim
!=
node
.
inputs
[
1
]
.
ndim
or
(
node
.
inputs
[
0
]
.
ndim
!=
2
)
or
node
.
inputs
[
0
]
.
ndim
!=
2
or
(
compute_capability
<
2
)
):
ctx
.
bin_id
[
-
2
]
<
'2'
):
raise
NotImplementedError
(
"This case does not have C code yet."
)
raise
NotImplementedError
(
"This case does not have C code yet."
)
x
=
inputs
[
0
]
x
=
inputs
[
0
]
...
@@ -754,7 +762,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
...
@@ -754,7 +762,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
return
[
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
return
[
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
)]
flags
=
flags
,
objvar
=
k_var
)]
def
c_support_code_
apply
(
self
,
node
,
nodename
):
def
c_support_code_
struct
(
self
,
node
,
nodename
):
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_y
=
node
.
inputs
[
1
]
.
dtype
dtype_y
=
node
.
inputs
[
1
]
.
dtype
dtype_ind
=
node
.
inputs
[
2
]
.
dtype
dtype_ind
=
node
.
inputs
[
2
]
.
dtype
...
@@ -765,7 +773,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
...
@@ -765,7 +773,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
itemsize_out
=
numpy
.
dtype
(
dtype_out
)
.
itemsize
itemsize_out
=
numpy
.
dtype
(
dtype_out
)
.
itemsize
k_var
=
"k_vector_add_fast_"
+
nodename
k_var
=
"k_vector_add_fast_"
+
nodename
return
super
(
GpuAdvancedIncSubtensor1_dev20
,
self
)
.
c_support_code_
apply
(
node
,
nodename
)
+
"""
return
super
(
GpuAdvancedIncSubtensor1_dev20
,
self
)
.
c_support_code_
struct
(
node
,
nodename
)
+
"""
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other,
PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr)
PyGpuArrayObject *indices_arr)
...
...
theano/sandbox/gpuarray/tests/config.py
0 → 100644
浏览文件 @
4814cd99
from
nose.plugins.skip
import
SkipTest
import
theano.sandbox.gpuarray
# Shared set-up for the gpuarray tests.
# Skip when theano.sandbox.gpuarray.pygpu is None (reported to the test
# runner as "pygpu not installed").
if
theano
.
sandbox
.
gpuarray
.
pygpu
is
None
:
raise
SkipTest
(
"pygpu not installed"
)
# pygpu is available but not activated yet: import theano.sandbox.cuda and,
# when it reports cuda_available, call use('gpu', ...) with all three keyword
# flags set to False; init_dev('cuda') is then called on the gpuarray package.
# NOTE(review): indentation was lost in this rendering, so it is unclear
# whether init_dev('cuda') is nested under the cuda_available check or only
# under the outer "not pygpu_activated" check -- confirm against the real file.
if
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
:
cuda_ndarray
.
use
(
'gpu'
,
default_to_move_computation_to_gpu
=
False
,
move_shared_float32_to_gpu
=
False
,
enable_cuda
=
False
)
theano
.
sandbox
.
gpuarray
.
init_dev
(
'cuda'
)
# Still not activated after the attempt above: skip with "pygpu disabled".
if
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
raise
SkipTest
(
"pygpu disabled"
)
# Context name imported by the test modules; set to None here.
# NOTE(review): presumably None selects the default context -- confirm.
test_ctx_name
=
None
# Build the two compilation modes exported to the tests.  When the configured
# mode is FAST_COMPILE they are derived from get_mode('FAST_RUN'); otherwise
# from get_default_mode().
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
# mode_with_gpu: includes the 'gpuarray' tag and excludes the 'gpu' tag.
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
# mode_without_gpu: excludes the 'gpuarray' tag.
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpuarray'
)
else
:
# Same include/exclude tags as above, applied to the default mode.
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpuarray'
)
theano/sandbox/gpuarray/tests/test_basic_ops.py
浏览文件 @
4814cd99
...
@@ -13,53 +13,22 @@ from theano.tensor.basic import alloc
...
@@ -13,53 +13,22 @@ from theano.tensor.basic import alloc
from
theano.tensor.tests
import
test_basic
from
theano.tensor.tests
import
test_basic
from
theano.tensor.tests.test_basic
import
rand
,
safe_make_node
from
theano.tensor.tests.test_basic
import
rand
,
safe_make_node
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests.unittest_tools
import
SkipTest
import
theano.sandbox.gpuarray
from
..type
import
(
GpuArrayType
,
get_context
,
from
..type
import
(
GpuArrayType
,
gpuarray_shared_constructor
)
gpuarray_shared_constructor
)
from
..basic_ops
import
(
from
..basic_ops
import
(
host_from_gpu
,
gpu_from_host
,
HostFromGpu
,
GpuFromHost
,
GpuReshape
,
host_from_gpu
,
HostFromGpu
,
GpuFromHost
,
GpuReshape
,
gpu_alloc
,
GpuAlloc
,
GpuAllocEmpty
,
GpuContiguous
,
GpuAlloc
,
GpuAllocEmpty
,
GpuContiguous
,
gpu_join
,
GpuJoin
,
GpuSplit
,
GpuEye
,
gpu_contiguous
)
gpu_join
,
GpuJoin
,
GpuSplit
,
GpuEye
,
gpu_contiguous
)
from
..subtensor
import
GpuSubtensor
from
..subtensor
import
GpuSubtensor
import
theano.sandbox.cuda
as
cuda_ndarray
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
test_ctx_name
try
:
from
pygpu
import
gpuarray
except
:
pass
if
theano
.
sandbox
.
gpuarray
.
pygpu
is
None
:
raise
SkipTest
(
"pygpu not installed"
)
# If you are writing a new test file, don't copy this code, but rather
from
pygpu
import
gpuarray
# import stuff from this file (like mode_with_gpu) to reuse it.
if
cuda_ndarray
.
cuda_available
and
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
if
not
cuda_ndarray
.
use
.
device_number
:
# We should not enable all the use like the flag device=gpu,
# as many tests don't work in that setup.
cuda_ndarray
.
use
(
'gpu'
,
default_to_move_computation_to_gpu
=
False
,
move_shared_float32_to_gpu
=
False
,
enable_cuda
=
False
)
theano
.
sandbox
.
gpuarray
.
init_dev
(
'cuda'
)
if
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
raise
SkipTest
(
"pygpu disabled"
)
utt
.
seed_rng
()
utt
.
seed_rng
()
rng
=
numpy
.
random
.
RandomState
(
seed
=
utt
.
fetch_seed
())
rng
=
numpy
.
random
.
RandomState
(
seed
=
utt
.
fetch_seed
())
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpuarray'
)
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpuarray'
)
def
inplace_func
(
inputs
,
outputs
,
mode
=
None
,
allow_input_downcast
=
False
,
def
inplace_func
(
inputs
,
outputs
,
mode
=
None
,
allow_input_downcast
=
False
,
on_unused_input
=
'raise'
,
name
=
None
):
on_unused_input
=
'raise'
,
name
=
None
):
...
@@ -88,7 +57,8 @@ def rand_gpuarray(*shape, **kwargs):
...
@@ -88,7 +57,8 @@ def rand_gpuarray(*shape, **kwargs):
cls
=
kwargs
.
pop
(
'cls'
,
None
)
cls
=
kwargs
.
pop
(
'cls'
,
None
)
if
len
(
kwargs
)
!=
0
:
if
len
(
kwargs
)
!=
0
:
raise
TypeError
(
'Unexpected argument
%
s'
,
list
(
kwargs
.
keys
())[
0
])
raise
TypeError
(
'Unexpected argument
%
s'
,
list
(
kwargs
.
keys
())[
0
])
return
gpuarray
.
array
(
r
,
dtype
=
dtype
,
cls
=
cls
)
return
gpuarray
.
array
(
r
,
dtype
=
dtype
,
cls
=
cls
,
context
=
get_context
(
test_ctx_name
))
def
makeTester
(
name
,
op
,
gpu_op
,
cases
,
checks
=
None
,
mode_gpu
=
mode_with_gpu
,
def
makeTester
(
name
,
op
,
gpu_op
,
cases
,
checks
=
None
,
mode_gpu
=
mode_with_gpu
,
...
@@ -114,6 +84,7 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
...
@@ -114,6 +84,7 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
def
test_all
(
self
):
def
test_all
(
self
):
if
skip
:
if
skip
:
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
skip
)
raise
SkipTest
(
skip
)
for
testname
,
inputs
in
iteritems
(
cases
):
for
testname
,
inputs
in
iteritems
(
cases
):
...
@@ -199,9 +170,9 @@ def test_transfer_cpu_gpu():
...
@@ -199,9 +170,9 @@ def test_transfer_cpu_gpu():
g
=
GpuArrayType
(
dtype
=
'float32'
,
broadcastable
=
(
False
,
False
))(
'g'
)
g
=
GpuArrayType
(
dtype
=
'float32'
,
broadcastable
=
(
False
,
False
))(
'g'
)
av
=
numpy
.
asarray
(
rng
.
rand
(
5
,
4
),
dtype
=
'float32'
)
av
=
numpy
.
asarray
(
rng
.
rand
(
5
,
4
),
dtype
=
'float32'
)
gv
=
gpuarray
.
array
(
av
)
gv
=
gpuarray
.
array
(
av
,
context
=
get_context
(
test_ctx_name
)
)
f
=
theano
.
function
([
a
],
gpu_from_host
(
a
))
f
=
theano
.
function
([
a
],
GpuFromHost
(
test_ctx_name
)
(
a
))
fv
=
f
(
av
)
fv
=
f
(
av
)
assert
GpuArrayType
.
values_eq
(
fv
,
gv
)
assert
GpuArrayType
.
values_eq
(
fv
,
gv
)
...
@@ -218,12 +189,12 @@ def test_transfer_strided():
...
@@ -218,12 +189,12 @@ def test_transfer_strided():
g
=
GpuArrayType
(
dtype
=
'float32'
,
broadcastable
=
(
False
,
False
))(
'g'
)
g
=
GpuArrayType
(
dtype
=
'float32'
,
broadcastable
=
(
False
,
False
))(
'g'
)
av
=
numpy
.
asarray
(
rng
.
rand
(
5
,
8
),
dtype
=
'float32'
)
av
=
numpy
.
asarray
(
rng
.
rand
(
5
,
8
),
dtype
=
'float32'
)
gv
=
gpuarray
.
array
(
av
)
gv
=
gpuarray
.
array
(
av
,
context
=
get_context
(
test_ctx_name
)
)
av
=
av
[:,
::
2
]
av
=
av
[:,
::
2
]
gv
=
gv
[:,
::
2
]
gv
=
gv
[:,
::
2
]
f
=
theano
.
function
([
a
],
gpu_from_host
(
a
))
f
=
theano
.
function
([
a
],
GpuFromHost
(
test_ctx_name
)
(
a
))
fv
=
f
(
av
)
fv
=
f
(
av
)
assert
GpuArrayType
.
values_eq
(
fv
,
gv
)
assert
GpuArrayType
.
values_eq
(
fv
,
gv
)
...
@@ -233,14 +204,14 @@ def test_transfer_strided():
...
@@ -233,14 +204,14 @@ def test_transfer_strided():
def
gpu_alloc_expected
(
x
,
*
shp
):
def
gpu_alloc_expected
(
x
,
*
shp
):
g
=
gpuarray
.
empty
(
shp
,
dtype
=
x
.
dtype
)
g
=
gpuarray
.
empty
(
shp
,
dtype
=
x
.
dtype
,
context
=
get_context
(
test_ctx_name
)
)
g
[:]
=
x
g
[:]
=
x
return
g
return
g
GpuAllocTester
=
makeTester
(
GpuAllocTester
=
makeTester
(
name
=
"GpuAllocTester"
,
name
=
"GpuAllocTester"
,
op
=
alloc
,
op
=
alloc
,
gpu_op
=
gpu_alloc
,
gpu_op
=
GpuAlloc
(
test_ctx_name
)
,
cases
=
dict
(
cases
=
dict
(
correct01
=
(
rand
(),
numpy
.
int32
(
7
)),
correct01
=
(
rand
(),
numpy
.
int32
(
7
)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
# just gives a DeepCopyOp with possibly wrong results on the CPU
...
@@ -260,19 +231,19 @@ class TestAlloc(test_basic.TestAlloc):
...
@@ -260,19 +231,19 @@ class TestAlloc(test_basic.TestAlloc):
dtype
=
"float32"
dtype
=
"float32"
mode
=
mode_with_gpu
mode
=
mode_with_gpu
shared
=
staticmethod
(
gpuarray_shared_constructor
)
shared
=
staticmethod
(
gpuarray_shared_constructor
)
allocs
=
[
GpuAlloc
(
),
GpuAlloc
(
),
T
.
Alloc
()]
allocs
=
[
GpuAlloc
(
test_ctx_name
),
GpuAlloc
(
test_ctx_name
),
T
.
Alloc
()]
def
test_alloc_empty
():
def
test_alloc_empty
():
for
dt
in
[
'float32'
,
'int8'
]:
for
dt
in
[
'float32'
,
'int8'
]:
f
=
theano
.
function
([],
GpuAllocEmpty
(
dt
)(
2
,
3
))
f
=
theano
.
function
([],
GpuAllocEmpty
(
dt
,
context_name
=
test_ctx_name
)(
2
,
3
))
assert
len
(
f
.
maker
.
fgraph
.
apply_nodes
)
==
1
assert
len
(
f
.
maker
.
fgraph
.
apply_nodes
)
==
1
out
=
f
()
out
=
f
()
assert
out
.
shape
==
(
2
,
3
)
assert
out
.
shape
==
(
2
,
3
)
assert
out
.
dtype
==
dt
assert
out
.
dtype
==
dt
f
=
theano
.
function
([],
[
GpuAllocEmpty
(
'uint64'
)(
3
,
2
),
f
=
theano
.
function
([],
[
GpuAllocEmpty
(
'uint64'
,
test_ctx_name
)(
3
,
2
),
GpuAllocEmpty
(
'uint64'
)(
3
,
2
)])
GpuAllocEmpty
(
'uint64'
,
test_ctx_name
)(
3
,
2
)])
out
=
f
()
out
=
f
()
assert
out
[
0
]
.
shape
==
(
3
,
2
)
assert
out
[
0
]
.
shape
==
(
3
,
2
)
assert
out
[
0
]
.
dtype
==
'uint64'
assert
out
[
0
]
.
dtype
==
'uint64'
...
@@ -284,7 +255,7 @@ def test_alloc_empty():
...
@@ -284,7 +255,7 @@ def test_alloc_empty():
def
test_shape
():
def
test_shape
():
x
=
GpuArrayType
(
dtype
=
'float32'
,
broadcastable
=
[
False
,
False
,
False
])()
x
=
GpuArrayType
(
dtype
=
'float32'
,
broadcastable
=
[
False
,
False
,
False
])()
v
=
gpuarray
.
zeros
((
3
,
4
,
5
),
dtype
=
'float32'
)
v
=
gpuarray
.
zeros
((
3
,
4
,
5
),
dtype
=
'float32'
,
context
=
get_context
(
test_ctx_name
)
)
f
=
theano
.
function
([
x
],
x
.
shape
)
f
=
theano
.
function
([
x
],
x
.
shape
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
numpy
.
all
(
f
(
v
)
==
(
3
,
4
,
5
))
assert
numpy
.
all
(
f
(
v
)
==
(
3
,
4
,
5
))
...
@@ -436,12 +407,13 @@ def test_hostfromgpu_shape_i():
...
@@ -436,12 +407,13 @@ def test_hostfromgpu_shape_i():
ca
=
theano
.
sandbox
.
gpuarray
.
type
.
GpuArrayType
(
'float32'
,
(
False
,
False
))()
ca
=
theano
.
sandbox
.
gpuarray
.
type
.
GpuArrayType
(
'float32'
,
(
False
,
False
))()
av
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
5
,
4
),
dtype
=
'float32'
)
av
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
5
,
4
),
dtype
=
'float32'
)
cv
=
gpuarray
.
asarray
(
numpy
.
random
.
rand
(
5
,
4
),
cv
=
gpuarray
.
asarray
(
numpy
.
random
.
rand
(
5
,
4
),
dtype
=
'float32'
)
dtype
=
'float32'
,
context
=
get_context
(
test_ctx_name
))
f
=
theano
.
function
([
a
],
gpu_from_host
(
a
),
mode
=
m
)
f
=
theano
.
function
([
a
],
GpuFromHost
(
test_ctx_name
)
(
a
),
mode
=
m
)
assert
gpu_from_host
in
[
x
.
op
assert
any
(
isinstance
(
x
.
op
,
GpuFromHost
)
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
for
x
in
f
.
maker
.
fgraph
.
toposort
())
f
=
theano
.
function
([
a
],
gpu_from_host
(
a
)
.
shape
,
mode
=
m
)
f
=
theano
.
function
([
a
],
GpuFromHost
(
test_ctx_name
)
(
a
)
.
shape
,
mode
=
m
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
isinstance
(
topo
[
0
]
.
op
,
T
.
opt
.
Shape_i
)
assert
isinstance
(
topo
[
0
]
.
op
,
T
.
opt
.
Shape_i
)
assert
isinstance
(
topo
[
1
]
.
op
,
T
.
opt
.
Shape_i
)
assert
isinstance
(
topo
[
1
]
.
op
,
T
.
opt
.
Shape_i
)
...
...
theano/sandbox/gpuarray/tests/test_blas.py
浏览文件 @
4814cd99
...
@@ -10,8 +10,8 @@ from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
...
@@ -10,8 +10,8 @@ from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from
theano.tensor.tests.test_blas
import
TestGer
,
BaseGemv
from
theano.tensor.tests.test_blas
import
TestGer
,
BaseGemv
from
..
import
gpuarray_shared_constructor
from
..
import
gpuarray_shared_constructor
from
.
test_basic_ops
import
(
makeTester
,
rand
,
from
.
config
import
mode_with_gpu
mode_with_gpu
)
from
.test_basic_ops
import
makeTester
,
rand
from
..blas
import
(
gpugemv_inplace
,
gpugemv_no_inplace
,
from
..blas
import
(
gpugemv_inplace
,
gpugemv_no_inplace
,
gpugemm_inplace
,
gpugemm_inplace
,
...
@@ -100,7 +100,7 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
...
@@ -100,7 +100,7 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
self
.
ops
=
[
gpuger_no_inplace
,
gpuger_inplace
]
self
.
ops
=
[
gpuger_no_inplace
,
gpuger_inplace
]
def
clone
(
self
,
op
):
def
clone
(
self
,
op
):
return
GpuGer
(
destructive
=
op
.
destructiv
e
)
return
GpuGer
(
inplace
=
op
.
inplac
e
)
GpuDot22Tester
=
makeTester
(
GpuDot22Tester
=
makeTester
(
...
...
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
浏览文件 @
4814cd99
...
@@ -14,8 +14,8 @@ from theano import tensor
...
@@ -14,8 +14,8 @@ from theano import tensor
from
theano.tests.unittest_tools
import
seed_rng
from
theano.tests.unittest_tools
import
seed_rng
# We let that import do the init of the back-end if needed.
# We let that import do the init of the back-end if needed.
from
.
test_basic_ops
import
mode_with_gpu
from
.
config
import
mode_with_gpu
,
test_ctx_name
from
..type
import
GpuArrayType
from
..type
import
GpuArrayType
,
get_context
from
..conv
import
GpuConv
from
..conv
import
GpuConv
from
theano.sandbox.gpuarray
import
dnn
from
theano.sandbox.gpuarray
import
dnn
...
@@ -28,7 +28,7 @@ try:
...
@@ -28,7 +28,7 @@ try:
except
ImportError
:
except
ImportError
:
pass
pass
gftensor4
=
GpuArrayType
(
'float32'
,
[
False
]
*
4
)
gftensor4
=
GpuArrayType
(
'float32'
,
[
False
]
*
4
,
context_name
=
test_ctx_name
)
def
py_conv_valid_numpy
(
img
,
kern
):
def
py_conv_valid_numpy
(
img
,
kern
):
...
@@ -135,8 +135,8 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
...
@@ -135,8 +135,8 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
numpy
.
prod
(
ishape
))
.
reshape
(
ishape
),
dtype
=
'float32'
)
+
1
numpy
.
prod
(
ishape
))
.
reshape
(
ishape
),
dtype
=
'float32'
)
+
1
npy_kern
=
-
(
theano
.
_asarray
(
numpy
.
arange
(
npy_kern
=
-
(
theano
.
_asarray
(
numpy
.
arange
(
numpy
.
prod
(
kshape
))
.
reshape
(
kshape
),
dtype
=
'float32'
)
+
1
)
numpy
.
prod
(
kshape
))
.
reshape
(
kshape
),
dtype
=
'float32'
)
+
1
)
img
=
pygpu
.
array
(
npy_img
)
img
=
pygpu
.
array
(
npy_img
,
context
=
get_context
(
test_ctx_name
)
)
kern
=
pygpu
.
array
(
npy_kern
)
kern
=
pygpu
.
array
(
npy_kern
,
context
=
get_context
(
test_ctx_name
)
)
# we take the stride after the transfert as we make c_contiguous
# we take the stride after the transfert as we make c_contiguous
# data on the GPU.
# data on the GPU.
...
...
theano/sandbox/gpuarray/tests/test_dnn.py
浏览文件 @
4814cd99
...
@@ -15,12 +15,12 @@ from theano.tensor.signal.downsample import MaxPoolGrad, AveragePoolGrad
...
@@ -15,12 +15,12 @@ from theano.tensor.signal.downsample import MaxPoolGrad, AveragePoolGrad
from
..
import
dnn
from
..
import
dnn
from
..basic_ops
import
GpuAllocEmpty
from
..basic_ops
import
GpuAllocEmpty
from
.
test_basic_ops
import
mode_with_gpu
,
mode_without_gpu
from
.
config
import
mode_with_gpu
,
mode_without_gpu
,
test_ctx_name
from
.
import
test_nnet
from
.
import
test_nnet
def
test_dnn_conv_desc_merge
():
def
test_dnn_conv_desc_merge
():
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
kern_shp
=
T
.
as_tensor_variable
(
kern_shp
=
T
.
as_tensor_variable
(
numpy
.
asarray
([
3
,
1
,
2
,
2
])
.
astype
(
'int64'
))
numpy
.
asarray
([
3
,
1
,
2
,
2
])
.
astype
(
'int64'
))
...
@@ -41,7 +41,7 @@ def test_dnn_conv_desc_merge():
...
@@ -41,7 +41,7 @@ def test_dnn_conv_desc_merge():
def
test_dnn_conv_merge
():
def
test_dnn_conv_merge
():
# This test that we merge correctly multiple dnn_conv.
# This test that we merge correctly multiple dnn_conv.
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
img_shp
=
[
2
,
5
,
6
,
8
]
img_shp
=
[
2
,
5
,
6
,
8
]
kern_shp
=
[
3
,
5
,
5
,
6
]
kern_shp
=
[
3
,
5
,
5
,
6
]
...
@@ -80,7 +80,7 @@ def test_dnn_conv_inplace():
...
@@ -80,7 +80,7 @@ def test_dnn_conv_inplace():
GpuAllocEmpty get merged together.
GpuAllocEmpty get merged together.
"""
"""
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
img_shp
=
[
2
,
5
,
6
,
8
]
img_shp
=
[
2
,
5
,
6
,
8
]
kern_shp
=
[
3
,
5
,
5
,
6
]
kern_shp
=
[
3
,
5
,
5
,
6
]
...
@@ -105,7 +105,7 @@ def test_dnn_conv_inplace():
...
@@ -105,7 +105,7 @@ def test_dnn_conv_inplace():
assert
len
([
n
for
n
in
topo
if
isinstance
(
n
.
op
,
GpuAllocEmpty
)])
==
2
assert
len
([
n
for
n
in
topo
if
isinstance
(
n
.
op
,
GpuAllocEmpty
)])
==
2
# Test grad w op
# Test grad w op
out
=
GpuAllocEmpty
(
kern
.
dtype
)(
*
kern
.
shape
)
out
=
GpuAllocEmpty
(
kern
.
dtype
,
test_ctx_name
)(
*
kern
.
shape
)
o1
=
dnn
.
GpuDnnConvGradW
()(
img
,
kern
,
out
,
desc1
)
o1
=
dnn
.
GpuDnnConvGradW
()(
img
,
kern
,
out
,
desc1
)
o2
=
dnn
.
GpuDnnConvGradW
()(
img
,
kern
,
out
,
desc2
)
o2
=
dnn
.
GpuDnnConvGradW
()(
img
,
kern
,
out
,
desc2
)
f
=
theano
.
function
([
img
,
kern
],
[
o1
,
o2
],
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
img
,
kern
],
[
o1
,
o2
],
mode
=
mode_with_gpu
)
...
@@ -116,7 +116,7 @@ def test_dnn_conv_inplace():
...
@@ -116,7 +116,7 @@ def test_dnn_conv_inplace():
assert
len
([
n
for
n
in
topo
if
isinstance
(
n
.
op
,
GpuAllocEmpty
)])
==
2
assert
len
([
n
for
n
in
topo
if
isinstance
(
n
.
op
,
GpuAllocEmpty
)])
==
2
# Test grad i op
# Test grad i op
out
=
GpuAllocEmpty
(
img
.
dtype
)(
*
img
.
shape
)
out
=
GpuAllocEmpty
(
img
.
dtype
,
test_ctx_name
)(
*
img
.
shape
)
o1
=
dnn
.
GpuDnnConvGradI
()(
img
,
kern
,
out
,
desc1
)
o1
=
dnn
.
GpuDnnConvGradI
()(
img
,
kern
,
out
,
desc1
)
o2
=
dnn
.
GpuDnnConvGradI
()(
img
,
kern
,
out
,
desc2
)
o2
=
dnn
.
GpuDnnConvGradI
()(
img
,
kern
,
out
,
desc2
)
f
=
theano
.
function
([
img
,
kern
],
[
o1
,
o2
],
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
img
,
kern
],
[
o1
,
o2
],
mode
=
mode_with_gpu
)
...
@@ -163,7 +163,7 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None,
...
@@ -163,7 +163,7 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None,
def
test_pooling
():
def
test_pooling
():
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
x
=
T
.
ftensor4
()
x
=
T
.
ftensor4
()
...
@@ -269,7 +269,7 @@ def test_pooling():
...
@@ -269,7 +269,7 @@ def test_pooling():
def
test_pooling_opt
():
def
test_pooling_opt
():
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
x
=
T
.
fmatrix
()
x
=
T
.
fmatrix
()
...
@@ -318,7 +318,7 @@ def test_dnn_tag():
...
@@ -318,7 +318,7 @@ def test_dnn_tag():
max_pool_2d
(
x
,
ds
=
(
2
,
2
),
ignore_border
=
True
),
max_pool_2d
(
x
,
ds
=
(
2
,
2
),
ignore_border
=
True
),
mode
=
mode_with_gpu
.
including
(
"cudnn"
))
mode
=
mode_with_gpu
.
including
(
"cudnn"
))
except
(
AssertionError
,
RuntimeError
):
except
(
AssertionError
,
RuntimeError
):
assert
not
dnn
.
dnn_available
()
assert
not
dnn
.
dnn_available
(
test_ctx_name
)
raised
=
True
raised
=
True
finally
:
finally
:
theano
.
config
.
on_opt_error
=
old
theano
.
config
.
on_opt_error
=
old
...
@@ -327,7 +327,7 @@ def test_dnn_tag():
...
@@ -327,7 +327,7 @@ def test_dnn_tag():
logging
.
getLogger
(
'theano'
)
.
addHandler
(
theano
.
logging_default_handler
)
logging
.
getLogger
(
'theano'
)
.
addHandler
(
theano
.
logging_default_handler
)
if
not
raised
:
if
not
raised
:
assert
dnn
.
dnn_available
()
assert
dnn
.
dnn_available
(
test_ctx_name
)
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnPool
)
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnPool
)
for
n
in
f
.
maker
.
fgraph
.
toposort
()])
for
n
in
f
.
maker
.
fgraph
.
toposort
()])
...
@@ -338,7 +338,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -338,7 +338,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
self
.
mode
=
mode_with_gpu
self
.
mode
=
mode_with_gpu
def
test_softmax
(
self
):
def
test_softmax
(
self
):
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
t
=
T
.
ftensor4
(
't'
)
t
=
T
.
ftensor4
(
't'
)
rand_tensor
=
numpy
.
asarray
(
rand_tensor
=
numpy
.
asarray
(
...
@@ -368,7 +368,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -368,7 +368,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
)
)
def
test_conv
(
self
):
def
test_conv
(
self
):
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
img
=
T
.
ftensor4
(
'img'
)
img
=
T
.
ftensor4
(
'img'
)
kerns
=
T
.
ftensor4
(
'kerns'
)
kerns
=
T
.
ftensor4
(
'kerns'
)
...
@@ -406,7 +406,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -406,7 +406,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
)
)
def
test_conv_gradw
(
self
):
def
test_conv_gradw
(
self
):
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
img
=
T
.
ftensor4
(
'img'
)
img
=
T
.
ftensor4
(
'img'
)
kerns
=
T
.
ftensor4
(
'kerns'
)
kerns
=
T
.
ftensor4
(
'kerns'
)
...
@@ -455,7 +455,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -455,7 +455,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
)
)
def
test_conv_gradi
(
self
):
def
test_conv_gradi
(
self
):
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
img
=
T
.
ftensor4
(
'img'
)
img
=
T
.
ftensor4
(
'img'
)
kerns
=
T
.
ftensor4
(
'kerns'
)
kerns
=
T
.
ftensor4
(
'kerns'
)
...
@@ -499,7 +499,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -499,7 +499,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
)
)
def
test_pool
(
self
):
def
test_pool
(
self
):
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
img
=
T
.
ftensor4
(
'img'
)
img
=
T
.
ftensor4
(
'img'
)
img_val
=
numpy
.
asarray
(
img_val
=
numpy
.
asarray
(
...
@@ -524,7 +524,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -524,7 +524,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
)
)
def
test_pool_grad
(
self
):
def
test_pool_grad
(
self
):
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
img
=
T
.
ftensor4
(
'img'
)
img
=
T
.
ftensor4
(
'img'
)
img_grad
=
T
.
ftensor4
(
'img_grad'
)
img_grad
=
T
.
ftensor4
(
'img_grad'
)
...
@@ -568,7 +568,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -568,7 +568,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
# this has been a problem in the past
# this has been a problem in the past
def
test_dnn_conv_border_mode
():
def
test_dnn_conv_border_mode
():
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
img
=
T
.
ftensor4
()
img
=
T
.
ftensor4
()
kern
=
T
.
ftensor4
()
kern
=
T
.
ftensor4
()
...
@@ -580,7 +580,7 @@ def test_dnn_conv_border_mode():
...
@@ -580,7 +580,7 @@ def test_dnn_conv_border_mode():
def
test_dnn_conv_alpha_output_merge
():
def
test_dnn_conv_alpha_output_merge
():
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
img
=
T
.
ftensor4
()
img
=
T
.
ftensor4
()
kern
=
T
.
ftensor4
()
kern
=
T
.
ftensor4
()
...
@@ -678,7 +678,7 @@ def test_dnn_conv_grad():
...
@@ -678,7 +678,7 @@ def test_dnn_conv_grad():
def
test_version
():
def
test_version
():
if
not
dnn
.
dnn_available
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
assert
isinstance
(
dnn
.
version
(),
int
)
assert
isinstance
(
dnn
.
version
(),
int
)
...
...
theano/sandbox/gpuarray/tests/test_elemwise.py
浏览文件 @
4814cd99
...
@@ -4,19 +4,19 @@ import theano
...
@@ -4,19 +4,19 @@ import theano
from
theano
import
scalar
,
gof
from
theano
import
scalar
,
gof
from
theano.tests.unittest_tools
import
SkipTest
,
assert_allclose
from
theano.tests.unittest_tools
import
SkipTest
,
assert_allclose
from
theano.tensor.tests.test_elemwise
import
(
test_Broadcast
,
test_DimShuffle
,
from
theano.tensor.tests
import
test_elemwise
test_CAReduce
,
T_reduce_dtype
)
from
.test_basic_ops
import
mode_with_gpu
,
rand_gpuarray
from
.config
import
mode_with_gpu
,
test_ctx_name
from
.test_basic_ops
import
rand_gpuarray
from
..elemwise
import
(
GpuElemwise
,
GpuDimShuffle
,
from
..elemwise
import
(
GpuElemwise
,
GpuDimShuffle
,
GpuCAReduceCuda
,
GpuCAReduceCPY
)
GpuCAReduceCuda
,
GpuCAReduceCPY
)
from
..type
import
GpuArrayType
from
..type
import
GpuArrayType
,
get_context
from
pygpu
import
ndgpuarray
as
gpuarray
from
pygpu
import
ndgpuarray
as
gpuarray
# This is acutally a test for GpuElemwise
# This is acutally a test for GpuElemwise
class
test_gpu_Broadcast
(
test_Broadcast
):
class
test_gpu_Broadcast
(
test_
elemwise
.
test_
Broadcast
):
op
=
GpuElemwise
op
=
GpuElemwise
type
=
GpuArrayType
type
=
GpuArrayType
cop
=
GpuElemwise
cop
=
GpuElemwise
...
@@ -25,8 +25,7 @@ class test_gpu_Broadcast(test_Broadcast):
...
@@ -25,8 +25,7 @@ class test_gpu_Broadcast(test_Broadcast):
linkers
=
[
gof
.
PerformLinker
,
gof
.
CLinker
]
linkers
=
[
gof
.
PerformLinker
,
gof
.
CLinker
]
def
setUp
(
self
):
def
setUp
(
self
):
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
if
get_context
(
test_ctx_name
)
.
kind
!=
'cuda'
:
if
not
dev
.
startswith
(
'cuda'
):
self
.
linkers
=
[
gof
.
PerformLinker
]
self
.
linkers
=
[
gof
.
PerformLinker
]
def
rand_val
(
self
,
shp
):
def
rand_val
(
self
,
shp
):
...
@@ -36,14 +35,12 @@ class test_gpu_Broadcast(test_Broadcast):
...
@@ -36,14 +35,12 @@ class test_gpu_Broadcast(test_Broadcast):
return
rand_gpuarray
(
*
shp
,
**
dict
(
cls
=
gpuarray
))
return
rand_gpuarray
(
*
shp
,
**
dict
(
cls
=
gpuarray
))
def
test_c
(
self
):
def
test_c
(
self
):
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
if
get_context
(
test_ctx_name
)
.
kind
!=
'cuda'
:
if
not
dev
.
startswith
(
'cuda'
):
raise
SkipTest
(
"Cuda specific tests"
)
raise
SkipTest
(
"Cuda specific tests"
)
super
(
test_gpu_Broadcast
,
self
)
.
test_c
()
super
(
test_gpu_Broadcast
,
self
)
.
test_c
()
def
test_c_inplace
(
self
):
def
test_c_inplace
(
self
):
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
if
get_context
(
test_ctx_name
)
.
kind
!=
'cuda'
:
if
not
dev
.
startswith
(
'cuda'
):
raise
SkipTest
(
"Cuda specific tests"
)
raise
SkipTest
(
"Cuda specific tests"
)
super
(
test_gpu_Broadcast
,
self
)
.
test_c_inplace
()
super
(
test_gpu_Broadcast
,
self
)
.
test_c_inplace
()
...
@@ -51,8 +48,7 @@ class test_gpu_Broadcast(test_Broadcast):
...
@@ -51,8 +48,7 @@ class test_gpu_Broadcast(test_Broadcast):
def
test_elemwise_pow
():
def
test_elemwise_pow
():
# Test that GpuElemwise(pow) can compile with any combination of integer
# Test that GpuElemwise(pow) can compile with any combination of integer
# or float input dtype.
# or float input dtype.
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
if
get_context
(
test_ctx_name
)
.
kind
!=
'cuda'
:
if
not
dev
.
startswith
(
'cuda'
):
raise
SkipTest
(
"Cuda specific tests"
)
raise
SkipTest
(
"Cuda specific tests"
)
dtypes
=
[
"uint8"
,
"uint16"
,
"uint32"
,
"uint64"
,
dtypes
=
[
"uint8"
,
"uint16"
,
"uint32"
,
"uint64"
,
...
@@ -77,11 +73,11 @@ def test_elemwise_pow():
...
@@ -77,11 +73,11 @@ def test_elemwise_pow():
assert_allclose
(
out
,
expected_out
)
assert_allclose
(
out
,
expected_out
)
class
test_GpuDimShuffle
(
test_DimShuffle
):
class
test_GpuDimShuffle
(
test_
elemwise
.
test_
DimShuffle
):
op
=
GpuDimShuffle
op
=
GpuDimShuffle
class
test_GpuCAReduceCPY
(
test_CAReduce
):
class
test_GpuCAReduceCPY
(
test_
elemwise
.
test_
CAReduce
):
dtypes
=
[
"float32"
]
dtypes
=
[
"float32"
]
bin_dtypes
=
[
"uint8"
,
"int8"
]
bin_dtypes
=
[
"uint8"
,
"int8"
]
op
=
GpuCAReduceCPY
op
=
GpuCAReduceCPY
...
@@ -120,7 +116,7 @@ class test_GpuCAReduceCPY(test_CAReduce):
...
@@ -120,7 +116,7 @@ class test_GpuCAReduceCPY(test_CAReduce):
def
test_infer_shape
(
self
):
def
test_infer_shape
(
self
):
for
dtype
in
self
.
dtypes
:
for
dtype
in
self
.
dtypes
:
test_CAReduce
.
test_infer_shape
(
self
,
dtype
)
super
(
test_GpuCAReduceCPY
,
self
)
.
test_infer_shape
(
dtype
)
class
test_GpuCAReduceCuda
(
test_GpuCAReduceCPY
):
class
test_GpuCAReduceCuda
(
test_GpuCAReduceCPY
):
...
@@ -133,15 +129,15 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
...
@@ -133,15 +129,15 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((
5
,
6
),
(
1
,
)),
((
5
,
6
),
(
1
,
)),
((
5
,
6
),
(
-
1
,
)),
((
5
,
6
),
(
-
1
,
)),
((
5
,
6
),
(
-
2
,
)),
((
5
,
6
),
(
-
2
,
)),
#((5, 6), ()), #reduce on no axis(copy) isn't implemented
#
((5, 6), ()), #reduce on no axis(copy) isn't implemented
#((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
#
((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
#((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
#
((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
((
5
,
0
),
None
),
((
5
,
0
),
None
),
((
5
,
0
),
(
0
,
)),
((
5
,
0
),
(
0
,
)),
((
5
,
0
),
(
1
,
)),
((
5
,
0
),
(
1
,
)),
#((5, 0), ()), reduce on no axis isn't implemented
#
((5, 0), ()), reduce on no axis isn't implemented
#((), None), reduce on no axis isn't implemented
#
((), None), reduce on no axis isn't implemented
#((), ()) reduce on no axis isn't implemented
#
((), ()) reduce on no axis isn't implemented
# Test all GPU cases implemented
# Test all GPU cases implemented
((
1
,
0
),
(
1
,)),
((
1
,
0
),
(
1
,)),
...
@@ -158,7 +154,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
...
@@ -158,7 +154,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((
0
,
0
,
0
,
0
),
[
0
,
1
,
2
,
3
]),
((
0
,
0
,
0
,
0
),
[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
20
),
[
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
0
,
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
1
,
2
,
3
]),
((
5
,
4
,
3
,
20
),
[
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
0
,
2
,
3
]),
((
5
,
4
,
3
,
2
),
[
1
,
2
,
3
]),
# test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
# test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
((
4100
,
3
),
[
0
]),
((
3
,
4101
),
[
0
]),
# 10
((
4100
,
3
),
[
0
]),
((
3
,
4101
),
[
0
]),
# 10
((
1024
,
33
),
[
0
]),
((
33
,
1024
),
[
0
]),
# 10
((
1024
,
33
),
[
0
]),
((
33
,
1024
),
[
0
]),
# 10
((
1025
,
33
),
[
0
]),
((
33
,
1025
),
[
0
]),
# 10
((
1025
,
33
),
[
0
]),
((
33
,
1025
),
[
0
]),
# 10
...
@@ -176,7 +172,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
...
@@ -176,7 +172,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((
4100
,
4
,
3
),
[
2
]),
((
5
,
4100
,
3
),
[
2
]),
((
5
,
4
,
4100
),
[
2
]),
# 001
((
4100
,
4
,
3
),
[
2
]),
((
5
,
4100
,
3
),
[
2
]),
((
5
,
4
,
4100
),
[
2
]),
# 001
((
4100
,
4
,
3
),
[
0
,
1
]),
((
5
,
4100
,
3
),
[
0
,
1
]),
((
5
,
4
,
4100
),
[
0
,
1
]),
# 110
((
4100
,
4
,
3
),
[
0
,
1
]),
((
5
,
4100
,
3
),
[
0
,
1
]),
((
5
,
4
,
4100
),
[
0
,
1
]),
# 110
((
4100
,
4
,
3
),
[
1
,
2
]),
((
5
,
4100
,
3
),
[
1
,
2
]),
((
5
,
4
,
4100
),
[
1
,
2
]),
# 011
((
4100
,
4
,
3
),
[
1
,
2
]),
((
5
,
4100
,
3
),
[
1
,
2
]),
((
5
,
4
,
4100
),
[
1
,
2
]),
# 011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
#
((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((
4100
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
4100
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
4100
),
[
0
,
1
,
2
]),
# 111
((
4100
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
4100
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
4100
),
[
0
,
1
,
2
]),
# 111
((
65
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
65
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
65
),
[
0
,
1
,
2
]),
# 111
((
65
,
4
,
3
),
[
0
,
1
,
2
]),
((
5
,
65
,
3
),
[
0
,
1
,
2
]),
((
5
,
4
,
65
),
[
0
,
1
,
2
]),
# 111
...
@@ -189,17 +185,17 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
...
@@ -189,17 +185,17 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
# test pattern implemented by reshape
# test pattern implemented by reshape
# Skip them as this test the op directly, not the optimization with reshape
# Skip them as this test the op directly, not the optimization with reshape
#
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
#
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
#
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
#
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
#
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
#
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
#
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
#
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
#
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
#
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
#
((5,4,3,10,11),[1,2]),
#
((5,4,3,10,11),[1,2]),
]
]
op
=
GpuCAReduceCuda
op
=
GpuCAReduceCuda
reds
=
[
scalar
.
add
,
scalar
.
mul
,
reds
=
[
scalar
.
add
,
scalar
.
mul
,
scalar
.
maximum
,
scalar
.
minimum
]
scalar
.
maximum
,
scalar
.
minimum
]
pre_scalar_op
=
scalar
.
sqr
pre_scalar_op
=
None
def
test_perform
(
self
):
def
test_perform
(
self
):
return
return
...
@@ -209,12 +205,11 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
...
@@ -209,12 +205,11 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
def
setUp
(
self
):
def
setUp
(
self
):
super
(
test_GpuCAReduceCuda
,
self
)
.
setUp
()
super
(
test_GpuCAReduceCuda
,
self
)
.
setUp
()
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
if
get_context
(
test_ctx_name
)
.
kind
!=
'cuda'
:
if
not
dev
.
startswith
(
'cuda'
):
raise
SkipTest
(
"Cuda specific tests"
)
raise
SkipTest
(
"Cuda specific tests"
)
class
T_gpureduce_dtype
(
T_reduce_dtype
):
class
T_gpureduce_dtype
(
test_elemwise
.
T_reduce_dtype
):
mode
=
mode_with_gpu
.
excluding
(
'local_cut_useless_reduce'
)
mode
=
mode_with_gpu
.
excluding
(
'local_cut_useless_reduce'
)
op
=
GpuCAReduceCuda
op
=
GpuCAReduceCuda
# Currently we don't support reduction on 0 axis
# Currently we don't support reduction on 0 axis
...
@@ -225,8 +220,7 @@ class T_gpureduce_dtype(T_reduce_dtype):
...
@@ -225,8 +220,7 @@ class T_gpureduce_dtype(T_reduce_dtype):
'float32'
,
'float64'
]
'float32'
,
'float64'
]
def
setUp
(
self
):
def
setUp
(
self
):
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
if
get_context
(
test_ctx_name
)
.
kind
!=
'cuda'
:
if
not
dev
.
startswith
(
'cuda'
):
raise
SkipTest
(
"Cuda specific tests"
)
raise
SkipTest
(
"Cuda specific tests"
)
...
...
theano/sandbox/gpuarray/tests/test_neighbours.py
浏览文件 @
4814cd99
from
theano.tensor.nnet.tests
import
test_neighbours
from
theano.tensor.nnet.tests
import
test_neighbours
# We let that import do the init of the back-end if needed.
from
.
test_basic_ops
import
mode_with_gpu
from
.
config
import
mode_with_gpu
from
..neighbours
import
GpuImages2Neibs
from
..neighbours
import
GpuImages2Neibs
...
...
theano/sandbox/gpuarray/tests/test_nerv.py
浏览文件 @
4814cd99
...
@@ -6,7 +6,7 @@ from theano import function
...
@@ -6,7 +6,7 @@ from theano import function
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
from
theano.tensor
import
vector
,
matrix
,
dot
from
theano.tensor
import
vector
,
matrix
,
dot
from
.
test_basic_ops
import
mode_with_gpu
from
.
config
import
mode_with_gpu
from
..nerv
import
Gemm16
,
nerv
from
..nerv
import
Gemm16
,
nerv
...
...
theano/sandbox/gpuarray/tests/test_nnet.py
浏览文件 @
4814cd99
...
@@ -7,9 +7,7 @@ import theano
...
@@ -7,9 +7,7 @@ import theano
import
theano.tensor
as
T
import
theano.tensor
as
T
import
theano.tests.unittest_tools
as
utt
import
theano.tests.unittest_tools
as
utt
# We let that import do the init of the back-end if needed.
from
.config
import
mode_with_gpu
,
mode_without_gpu
from
.test_basic_ops
import
(
mode_with_gpu
,
mode_without_gpu
)
from
..nnet
import
(
from
..nnet
import
(
GpuCrossentropySoftmaxArgmax1HotWithBias
,
GpuCrossentropySoftmaxArgmax1HotWithBias
,
GpuCrossentropySoftmax1HotWithBiasDx
,
GpuCrossentropySoftmax1HotWithBiasDx
,
...
...
theano/sandbox/gpuarray/tests/test_opt.py
浏览文件 @
4814cd99
...
@@ -4,17 +4,16 @@ import theano
...
@@ -4,17 +4,16 @@ import theano
from
theano
import
tensor
from
theano
import
tensor
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests.unittest_tools
import
SkipTest
from
theano.tensor.tests
import
test_basic
from
theano.tensor.tests
import
test_basic
import
theano.sandbox.gpuarray
import
theano.sandbox.gpuarray
from
..
import
basic_ops
from
..
import
basic_ops
from
..type
import
GpuArrayType
,
gpuarray_shared_constructor
from
..type
import
GpuArrayType
,
gpuarray_shared_constructor
,
get_context
from
..basic_ops
import
(
GpuAlloc
,
GpuReshape
,
gpu_alloc
,
from
..basic_ops
import
GpuAlloc
,
GpuReshape
,
GpuFromHost
,
host_from_gpu
gpu_from_host
,
host_from_gpu
)
from
..elemwise
import
GpuCAReduceCuda
,
GpuCAReduceCPY
,
GpuElemwise
from
..elemwise
import
GpuCAReduceCuda
,
GpuCAReduceCPY
,
GpuElemwise
from
..subtensor
import
GpuSubtensor
from
..subtensor
import
GpuSubtensor
from
.test_basic_ops
import
rand_gpuarray
,
mode_with_gpu
,
mode_without_gpu
from
.config
import
mode_with_gpu
,
test_ctx_name
def
test_local_assert
():
def
test_local_assert
():
...
@@ -97,7 +96,7 @@ def test_flatten():
...
@@ -97,7 +96,7 @@ def test_flatten():
def
test_reduce
():
def
test_reduce
():
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
kind
=
get_context
(
test_ctx_name
)
.
kind
for
method
,
param
in
[(
'sum'
,
dict
(
acc_dtype
=
'float32'
)),
for
method
,
param
in
[(
'sum'
,
dict
(
acc_dtype
=
'float32'
)),
(
'prod'
,
dict
(
acc_dtype
=
'float32'
)),
(
'prod'
,
dict
(
acc_dtype
=
'float32'
)),
...
@@ -113,7 +112,7 @@ def test_reduce():
...
@@ -113,7 +112,7 @@ def test_reduce():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
ops
=
[
type
(
node
.
op
)
for
node
in
topo
]
ops
=
[
type
(
node
.
op
)
for
node
in
topo
]
if
dev
.
startswith
(
'opencl'
)
and
method
in
[
"max"
,
"min"
]:
if
kind
==
'opencl'
and
method
in
[
"max"
,
"min"
]:
assert
not
(
GpuCAReduceCuda
in
ops
or
GpuCAReduceCPY
in
ops
)
assert
not
(
GpuCAReduceCuda
in
ops
or
GpuCAReduceCPY
in
ops
)
else
:
else
:
assert
GpuCAReduceCuda
in
ops
or
GpuCAReduceCPY
in
ops
assert
GpuCAReduceCuda
in
ops
or
GpuCAReduceCPY
in
ops
...
@@ -126,7 +125,7 @@ def test_local_gpualloc_memset_0():
...
@@ -126,7 +125,7 @@ def test_local_gpualloc_memset_0():
ones
=
numpy
.
ones
((
2
,),
dtype
=
'float32'
)
ones
=
numpy
.
ones
((
2
,),
dtype
=
'float32'
)
# Test with 0
# Test with 0
a
=
gpu_alloc
(
z
,
i
)
a
=
GpuAlloc
(
test_ctx_name
)
(
z
,
i
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
1
assert
len
(
topo
)
==
1
...
@@ -134,7 +133,7 @@ def test_local_gpualloc_memset_0():
...
@@ -134,7 +133,7 @@ def test_local_gpualloc_memset_0():
assert
(
numpy
.
asarray
(
f
(
6
))
==
0
)
.
all
()
assert
(
numpy
.
asarray
(
f
(
6
))
==
0
)
.
all
()
# Test with 1
# Test with 1
a
=
gpu_alloc
(
o
,
i
)
a
=
GpuAlloc
(
test_ctx_name
)
(
o
,
i
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
1
assert
len
(
topo
)
==
1
...
@@ -143,7 +142,7 @@ def test_local_gpualloc_memset_0():
...
@@ -143,7 +142,7 @@ def test_local_gpualloc_memset_0():
assert
(
numpy
.
asarray
(
f
(
6
))
==
1
)
.
all
()
assert
(
numpy
.
asarray
(
f
(
6
))
==
1
)
.
all
()
# Test with 1, 1
# Test with 1, 1
a
=
gpu_alloc
(
ones
,
i
)
a
=
GpuAlloc
(
test_ctx_name
)
(
ones
,
i
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
1
assert
len
(
topo
)
==
1
...
@@ -180,7 +179,7 @@ def test_print_op():
...
@@ -180,7 +179,7 @@ def test_print_op():
f
=
theano
.
function
([
b
],
theano
.
printing
.
Print
()(
b
)
*
2
,
f
=
theano
.
function
([
b
],
theano
.
printing
.
Print
()(
b
)
*
2
,
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
topo
[
0
]
.
op
==
gpu_from_host
assert
isinstance
(
topo
[
0
]
.
op
,
GpuFromHost
)
assert
isinstance
(
topo
[
1
]
.
op
,
theano
.
printing
.
Print
)
assert
isinstance
(
topo
[
1
]
.
op
,
theano
.
printing
.
Print
)
assert
isinstance
(
topo
[
2
]
.
op
,
GpuElemwise
)
assert
isinstance
(
topo
[
2
]
.
op
,
GpuElemwise
)
assert
topo
[
3
]
.
op
==
host_from_gpu
assert
topo
[
3
]
.
op
==
host_from_gpu
...
@@ -208,7 +207,7 @@ def test_pdbbreakpoint_op():
...
@@ -208,7 +207,7 @@ def test_pdbbreakpoint_op():
def
test_local_gpu_elemwise_careduce
():
def
test_local_gpu_elemwise_careduce
():
x
=
theano
.
tensor
.
matrix
()
x
=
theano
.
tensor
.
matrix
()
o
=
(
x
*
x
)
.
sum
()
o
=
(
x
*
x
)
.
sum
()
f
=
theano
.
function
([
x
],
o
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
x
],
o
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
3
assert
len
(
topo
)
==
3
...
@@ -234,7 +233,7 @@ def test_local_gpu_subtensor():
...
@@ -234,7 +233,7 @@ def test_local_gpu_subtensor():
# Test multiple use of the input
# Test multiple use of the input
# We want the subtensor to be on the GPU to prevent multiple transfer.
# We want the subtensor to be on the GPU to prevent multiple transfer.
t
=
tensor
.
fmatrix
()
t
=
tensor
.
fmatrix
()
f
=
theano
.
function
([
t
],
[
t
[
3
:
4
],
t
+
1
],
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
t
],
[
t
[
3
:
4
],
t
+
1
],
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
...
@@ -242,7 +241,7 @@ def test_local_gpu_subtensor():
...
@@ -242,7 +241,7 @@ def test_local_gpu_subtensor():
# Test multiple use of the input + input as output
# Test multiple use of the input + input as output
# We want the subtensor to be on the GPU to prevent multiple transfer.
# We want the subtensor to be on the GPU to prevent multiple transfer.
t
=
tensor
.
fmatrix
()
t
=
tensor
.
fmatrix
()
f
=
theano
.
function
([
t
],
[
t
[
3
:
4
],
t
+
1
,
t
],
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
t
],
[
t
[
3
:
4
],
t
+
1
,
t
],
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
...
@@ -250,7 +249,7 @@ def test_local_gpu_subtensor():
...
@@ -250,7 +249,7 @@ def test_local_gpu_subtensor():
# Test shared forced on CPU and we do computation on the output of
# Test shared forced on CPU and we do computation on the output of
# the subtensor.
# the subtensor.
t
=
tensor
.
_shared
(
numpy
.
zeros
(
20
,
"float32"
))
t
=
tensor
.
_shared
(
numpy
.
zeros
(
20
,
"float32"
))
f
=
theano
.
function
([],
t
[
3
:
4
]
+
1
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([],
t
[
3
:
4
]
+
1
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
not
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
...
@@ -319,7 +318,7 @@ def test_local_gpu_elemwise():
...
@@ -319,7 +318,7 @@ def test_local_gpu_elemwise():
utt
.
assert_allclose
(
out
[
1
],
a_v
*
c_v
)
utt
.
assert_allclose
(
out
[
1
],
a_v
*
c_v
)
# Test non-contiguous input
# Test non-contiguous input
c
=
cuda
.
shared_constructor
(
numpy
.
asarray
(
c_v
,
dtype
=
'float32'
))
c
=
gpuarray_
shared_constructor
(
numpy
.
asarray
(
c_v
,
dtype
=
'float32'
))
f
=
theano
.
function
([
a
,
b
],
outs_op
(
a
[::
2
],
b
[::
2
],
c
[::
2
]),
f
=
theano
.
function
([
a
,
b
],
outs_op
(
a
[::
2
],
b
[::
2
],
c
[::
2
]),
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
out
=
f
(
a_v
,
b_v
)
out
=
f
(
a_v
,
b_v
)
...
...
theano/sandbox/gpuarray/tests/test_scan.py
浏览文件 @
4814cd99
...
@@ -6,10 +6,10 @@ import theano
...
@@ -6,10 +6,10 @@ import theano
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
import
theano.sandbox.rng_mrg
import
theano.sandbox.rng_mrg
from
..basic_ops
import
gpu_from_host
,
GpuFromHost
,
HostFromGpu
from
..basic_ops
import
GpuFromHost
,
HostFromGpu
from
..elemwise
import
GpuElemwise
from
..elemwise
import
GpuElemwise
from
.
test_basic_ops
import
mode_with_gpu
from
.
config
import
mode_with_gpu
,
test_ctx_name
class
T_Scan
(
TestCase
):
class
T_Scan
(
TestCase
):
...
@@ -35,7 +35,7 @@ class T_Scan(TestCase):
...
@@ -35,7 +35,7 @@ class T_Scan(TestCase):
go_backwards
=
False
,
go_backwards
=
False
,
mode
=
mode
)
mode
=
mode
)
output
=
gpu_from_host
(
output
)
output
=
GpuFromHost
(
test_ctx_name
)
(
output
)
f2
=
theano
.
function
([
u
,
x0
,
W_in
,
W
],
f2
=
theano
.
function
([
u
,
x0
,
W_in
,
W
],
output
,
output
,
updates
=
updates
,
updates
=
updates
,
...
@@ -216,7 +216,7 @@ class T_Scan(TestCase):
...
@@ -216,7 +216,7 @@ class T_Scan(TestCase):
dtype
=
'float32'
)
dtype
=
'float32'
)
vsample
=
theano
.
shared
(
v_vsample
)
vsample
=
theano
.
shared
(
v_vsample
)
trng
=
theano
.
sandbox
.
rng_mrg
.
MRG_RandomStreams
(
trng
=
theano
.
sandbox
.
rng_mrg
.
MRG_RandomStreams
(
utt
.
fetch_seed
())
utt
.
fetch_seed
())
def
f
(
vsample_tm1
):
def
f
(
vsample_tm1
):
return
trng
.
binomial
(
vsample_tm1
.
shape
,
n
=
1
,
p
=
0.3
,
return
trng
.
binomial
(
vsample_tm1
.
shape
,
n
=
1
,
p
=
0.3
,
...
@@ -238,4 +238,4 @@ class T_Scan(TestCase):
...
@@ -238,4 +238,4 @@ class T_Scan(TestCase):
# I leave this to tested by debugmode, this test was anyway
# I leave this to tested by debugmode, this test was anyway
# more of does the graph compile kind of test
# more of does the graph compile kind of test
t_result
=
my_f
()
my_f
()
theano/sandbox/gpuarray/tests/test_subtensor.py
浏览文件 @
4814cd99
...
@@ -11,8 +11,7 @@ from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
...
@@ -11,8 +11,7 @@ from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1
)
GpuAdvancedIncSubtensor1
)
from
..type
import
gpuarray_shared_constructor
from
..type
import
gpuarray_shared_constructor
from
.test_basic_ops
import
mode_with_gpu
from
.config
import
mode_with_gpu
class
G_subtensor
(
test_subtensor
.
T_subtensor
):
class
G_subtensor
(
test_subtensor
.
T_subtensor
):
...
@@ -46,8 +45,8 @@ def test_advinc_subtensor1():
...
@@ -46,8 +45,8 @@ def test_advinc_subtensor1():
yval
[:]
=
10
yval
[:]
=
10
x
=
shared
(
xval
,
name
=
'x'
)
x
=
shared
(
xval
,
name
=
'x'
)
y
=
tensor
.
tensor
(
dtype
=
'float32'
,
y
=
tensor
.
tensor
(
dtype
=
'float32'
,
broadcastable
=
(
False
,)
*
len
(
shp
),
broadcastable
=
(
False
,)
*
len
(
shp
),
name
=
'y'
)
name
=
'y'
)
expr
=
tensor
.
advanced_inc_subtensor1
(
x
,
y
,
[
0
,
2
])
expr
=
tensor
.
advanced_inc_subtensor1
(
x
,
y
,
[
0
,
2
])
f
=
theano
.
function
([
y
],
expr
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
y
],
expr
,
mode
=
mode_with_gpu
)
assert
sum
([
isinstance
(
node
.
op
,
GpuAdvancedIncSubtensor1
)
assert
sum
([
isinstance
(
node
.
op
,
GpuAdvancedIncSubtensor1
)
...
...
theano/sandbox/gpuarray/type.py
浏览文件 @
4814cd99
...
@@ -14,14 +14,80 @@ try:
...
@@ -14,14 +14,80 @@ try:
except
ImportError
:
except
ImportError
:
pass
pass
_context_reg
=
{}
def reg_context(name, ctx):
    """
    Bind a context object to a name in the module-level registry.

    A given name can be registered only once; attempting to register
    it a second time raises an error. Only `GpuContext` instances are
    accepted as contexts.

    Parameters
    ----------
    name : hashable object
        Key to store the context under (usually a string).
    ctx : GpuContext
        Context instance to store.

    Raises
    ------
    ValueError
        If `name` is already present in the registry.
    TypeError
        If `ctx` is not a `gpuarray.GpuContext` instance.

    """
    # The duplicate-name check comes first, so a taken name is
    # reported even when `ctx` is also of the wrong type.
    name_taken = name in _context_reg
    if name_taken:
        raise ValueError("context name %s is already defined" % (name,))
    is_gpu_context = isinstance(ctx, gpuarray.GpuContext)
    if not is_gpu_context:
        raise TypeError("context is not GpuContext")
    _context_reg[name] = ctx
def get_context(name):
    """
    Retrieve the context associated with a name.

    Return the context object mapped to `name` that was previously
    registered through :func:`reg_context`. Asking for a name that
    was never registered raises an exception.

    Parameters
    ----------
    name : hashable object
        Name associated with the context we want (usually a string).

    Raises
    ------
    ValueError
        If `name` is not present in the registry.

    """
    if name in _context_reg:
        return _context_reg[name]
    raise ValueError("context name %s not defined" % (name,))
def list_contexts():
    """
    Return an iterable over the names of all registered contexts.

    The result is the key view of the module-level context registry.

    """
    registered_names = _context_reg.keys()
    return registered_names
# Private method
def _name_for_ctx(ctx):
    """
    Return the name under which a context is registered.

    The registry is scanned and the first name whose stored context
    compares equal to `ctx` is returned.

    Parameters
    ----------
    ctx : GpuContext
        Context instance to look up.

    Raises
    ------
    ValueError
        If no registered context compares equal to `ctx`.

    """
    # Iterate over (name, context) pairs. Iterating the dict itself
    # yields only the names, which breaks the two-value unpacking.
    for k, v in _context_reg.items():
        if v == ctx:
            return k
    raise ValueError('context is not registered')
# This is a private method for use by the tests only
def _unreg_context(name):
    """
    Remove the registry entry for `name`.

    Raises KeyError if `name` is not registered.

    """
    _context_reg.pop(name)
class
GpuArrayType
(
Type
):
class
GpuArrayType
(
Type
):
def
__init__
(
self
,
dtype
,
broadcastable
,
name
=
None
):
def
__init__
(
self
,
dtype
,
broadcastable
,
context_name
=
None
,
name
=
None
):
# In case this was not provided and no global value is available
# In case this was not provided and no global value is available
self
.
dtype
=
str
(
dtype
)
self
.
dtype
=
str
(
dtype
)
self
.
broadcastable
=
tuple
(
bool
(
b
)
for
b
in
broadcastable
)
self
.
broadcastable
=
tuple
(
bool
(
b
)
for
b
in
broadcastable
)
self
.
ndim
=
len
(
self
.
broadcastable
)
self
.
ndim
=
len
(
self
.
broadcastable
)
self
.
name
=
name
self
.
name
=
name
self
.
context_name
=
context_name
try
:
try
:
self
.
typecode
=
gpuarray
.
dtype_to_typecode
(
self
.
dtype
)
self
.
typecode
=
gpuarray
.
dtype_to_typecode
(
self
.
dtype
)
except
gpuarray
.
GpuArrayException
:
except
gpuarray
.
GpuArrayException
:
...
@@ -34,10 +100,16 @@ class GpuArrayType(Type):
...
@@ -34,10 +100,16 @@ class GpuArrayType(Type):
if
broadcastable
is
None
:
if
broadcastable
is
None
:
broadcastable
=
self
.
broadcastable
broadcastable
=
self
.
broadcastable
return
self
.
__class__
(
dtype
=
dtype
,
broadcastable
=
broadcastable
,
return
self
.
__class__
(
dtype
=
dtype
,
broadcastable
=
broadcastable
,
name
=
self
.
name
)
context_name
=
self
.
context_name
,
name
=
self
.
name
)
# This is a property to keep the type pickleable
@property
def
context
(
self
):
return
get_context
(
self
.
context_name
)
def
__repr__
(
self
):
def
__repr__
(
self
):
return
"GpuArrayType(
%
s,
%
s)"
%
(
self
.
dtype
,
self
.
broadcastable
)
return
"GpuArrayType<
%
s>(
%
s,
%
s)"
%
(
self
.
context_name
,
self
.
dtype
,
self
.
broadcastable
)
def
filter
(
self
,
data
,
strict
=
False
,
allow_downcast
=
None
):
def
filter
(
self
,
data
,
strict
=
False
,
allow_downcast
=
None
):
if
(
isinstance
(
data
,
gpuarray
.
GpuArray
)
and
if
(
isinstance
(
data
,
gpuarray
.
GpuArray
)
and
...
@@ -54,25 +126,28 @@ class GpuArrayType(Type):
...
@@ -54,25 +126,28 @@ class GpuArrayType(Type):
"got
%
d (dtype
%
s)."
%
"got
%
d (dtype
%
s)."
%
(
self
,
self
.
typecode
,
self
.
dtype
,
(
self
,
self
.
typecode
,
self
.
dtype
,
data
.
typecode
,
str
(
data
.
dtype
)))
data
.
typecode
,
str
(
data
.
dtype
)))
if
self
.
context
!=
data
.
context
:
raise
TypeError
(
"data context does not match type context"
)
# fallthrough to ndim check
# fallthrough to ndim check
elif
(
allow_downcast
or
elif
(
allow_downcast
or
(
allow_downcast
is
None
and
(
allow_downcast
is
None
and
type
(
data
)
==
float
and
type
(
data
)
==
float
and
self
.
dtype
==
config
.
floatX
)):
self
.
dtype
==
config
.
floatX
)):
data
=
gpuarray
.
array
(
data
,
dtype
=
self
.
typecode
,
copy
=
False
,
data
=
gpuarray
.
array
(
data
,
dtype
=
self
.
typecode
,
copy
=
False
,
ndmin
=
len
(
self
.
broadcastable
))
ndmin
=
len
(
self
.
broadcastable
),
context
=
self
.
context
)
else
:
else
:
if
not
hasattr
(
data
,
'dtype'
):
if
not
hasattr
(
data
,
'dtype'
):
# This is to convert objects that don't have a dtype
# This is to convert objects that don't have a dtype
# (like lists). We anticipate that the type below
# (like lists). We anticipate that the type below
# will match and we pass copy=False so it won't make a
# will match and we pass copy=False so it won't make a
# second object on the GPU.
# second object on the GPU.
data
=
gpuarray
.
array
(
data
,
copy
=
False
)
data
=
gpuarray
.
array
(
data
,
copy
=
False
,
context
=
self
.
context
)
up_dtype
=
scalar
.
upcast
(
self
.
dtype
,
data
.
dtype
)
up_dtype
=
scalar
.
upcast
(
self
.
dtype
,
data
.
dtype
)
if
up_dtype
==
self
.
dtype
:
if
up_dtype
==
self
.
dtype
:
data
=
gpuarray
.
array
(
data
,
dtype
=
self
.
dtype
,
data
=
gpuarray
.
array
(
data
,
dtype
=
self
.
dtype
,
copy
=
False
,
co
py
=
False
)
co
ntext
=
self
.
context
)
else
:
else
:
raise
TypeError
(
"
%
s cannot store a value of dtype
%
s "
raise
TypeError
(
"
%
s cannot store a value of dtype
%
s "
"without risking loss of precision."
%
"without risking loss of precision."
%
...
@@ -90,8 +165,10 @@ class GpuArrayType(Type):
...
@@ -90,8 +165,10 @@ class GpuArrayType(Type):
return
data
return
data
def
filter_variable
(
self
,
other
,
allow_convert
=
True
):
def
filter_variable
(
self
,
other
,
allow_convert
=
True
):
from
theano.sandbox.gpuarray
import
GpuFromHost
if
hasattr
(
other
,
'_as_GpuArrayVariable'
):
if
hasattr
(
other
,
'_as_GpuArrayVariable'
):
other
=
other
.
_as_GpuArrayVariable
()
other
=
other
.
_as_GpuArrayVariable
(
self
.
context_name
)
if
not
isinstance
(
other
,
Variable
):
if
not
isinstance
(
other
,
Variable
):
other
=
self
.
Constant
(
type
=
self
,
data
=
other
)
other
=
self
.
Constant
(
type
=
self
,
data
=
other
)
...
@@ -120,7 +197,7 @@ class GpuArrayType(Type):
...
@@ -120,7 +197,7 @@ class GpuArrayType(Type):
str
(
self
.
broadcastable
)))
str
(
self
.
broadcastable
)))
other
=
other2
other
=
other2
return
theano
.
sandbox
.
gpuarray
.
basic_ops
.
gpu_from_host
(
other
)
return
GpuFromHost
(
self
.
context_name
)
(
other
)
@staticmethod
@staticmethod
def
values_eq
(
a
,
b
):
def
values_eq
(
a
,
b
):
...
@@ -189,7 +266,8 @@ class GpuArrayType(Type):
...
@@ -189,7 +266,8 @@ class GpuArrayType(Type):
return
pygpu
.
gpuarray
.
may_share_memory
(
a
,
b
)
return
pygpu
.
gpuarray
.
may_share_memory
(
a
,
b
)
def
value_zeros
(
self
,
shape
):
def
value_zeros
(
self
,
shape
):
return
pygpu
.
gpuarray
.
zeros
(
shape
,
dtype
=
self
.
typecode
)
return
pygpu
.
gpuarray
.
zeros
(
shape
,
dtype
=
self
.
typecode
,
context
=
self
.
context
)
def
make_variable
(
self
,
name
=
None
):
def
make_variable
(
self
,
name
=
None
):
return
self
.
Variable
(
self
,
name
=
name
)
return
self
.
Variable
(
self
,
name
=
name
)
...
@@ -197,19 +275,22 @@ class GpuArrayType(Type):
...
@@ -197,19 +275,22 @@ class GpuArrayType(Type):
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
and
return
(
type
(
self
)
==
type
(
other
)
and
self
.
typecode
==
other
.
typecode
and
self
.
typecode
==
other
.
typecode
and
self
.
broadcastable
==
other
.
broadcastable
)
self
.
broadcastable
==
other
.
broadcastable
and
self
.
context_name
==
other
.
context_name
)
def
convert_variable
(
self
,
var
):
def
convert_variable
(
self
,
var
):
vt
=
var
.
type
vt
=
var
.
type
if
(
type
(
self
)
==
type
(
vt
)
and
if
(
type
(
self
)
==
type
(
vt
)
and
self
.
typecode
==
vt
.
typecode
and
self
.
typecode
==
vt
.
typecode
and
self
.
ndim
==
vt
.
ndim
and
self
.
ndim
==
vt
.
ndim
and
self
.
context_name
==
vt
.
context_name
and
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
vt
.
broadcastable
))):
vt
.
broadcastable
))):
return
theano
.
tensor
.
patternbroadcast
(
var
,
self
.
broadcastable
)
return
theano
.
tensor
.
patternbroadcast
(
var
,
self
.
broadcastable
)
def
__hash__
(
self
):
def
__hash__
(
self
):
return
(
hash
(
self
.
typecode
)
^
hash
(
self
.
broadcastable
))
return
hash
((
type
(
self
),
self
.
typecode
,
self
.
broadcastable
,
self
.
context_name
))
def
dtype_specs
(
self
):
def
dtype_specs
(
self
):
"""
"""
...
@@ -324,8 +405,12 @@ class _operators(_tensor_py_operators):
...
@@ -324,8 +405,12 @@ class _operators(_tensor_py_operators):
from
.basic_ops
import
host_from_gpu
from
.basic_ops
import
host_from_gpu
return
host_from_gpu
(
self
)
return
host_from_gpu
(
self
)
def
_as_GpuArrayVariable
(
self
):
def
_as_GpuArrayVariable
(
self
,
context_name
):
return
self
if
self
.
type
.
context_name
==
context_name
:
return
self
else
:
from
.basic_ops
import
GpuToGpu
return
GpuToGpu
(
context_name
)(
self
)
class
GpuArrayVariable
(
_operators
,
Variable
):
class
GpuArrayVariable
(
_operators
,
Variable
):
...
@@ -370,7 +455,8 @@ class GpuArraySharedVariable(_operators, SharedVariable):
...
@@ -370,7 +455,8 @@ class GpuArraySharedVariable(_operators, SharedVariable):
def
set_value
(
self
,
value
,
borrow
=
False
):
def
set_value
(
self
,
value
,
borrow
=
False
):
if
isinstance
(
value
,
pygpu
.
gpuarray
.
GpuArray
):
if
isinstance
(
value
,
pygpu
.
gpuarray
.
GpuArray
):
value
=
pygpu
.
gpuarray
.
array
(
value
,
copy
=
(
not
borrow
))
value
=
pygpu
.
gpuarray
.
array
(
value
,
copy
=
(
not
borrow
),
context
=
self
.
type
.
context
)
self
.
container
.
value
=
value
self
.
container
.
value
=
value
def
__getitem__
(
self
,
*
args
):
def
__getitem__
(
self
,
*
args
):
...
@@ -382,7 +468,8 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable
...
@@ -382,7 +468,8 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable
def
gpuarray_shared_constructor
(
value
,
name
=
None
,
strict
=
False
,
def
gpuarray_shared_constructor
(
value
,
name
=
None
,
strict
=
False
,
allow_downcast
=
None
,
borrow
=
False
,
allow_downcast
=
None
,
borrow
=
False
,
broadcastable
=
None
):
broadcastable
=
None
,
context_name
=
None
):
"""
"""
SharedVariable constructor for GpuArrayType.
SharedVariable constructor for GpuArrayType.
...
@@ -390,10 +477,20 @@ def gpuarray_shared_constructor(value, name=None, strict=False,
...
@@ -390,10 +477,20 @@ def gpuarray_shared_constructor(value, name=None, strict=False,
if
not
isinstance
(
value
,
(
numpy
.
ndarray
,
pygpu
.
gpuarray
.
GpuArray
)):
if
not
isinstance
(
value
,
(
numpy
.
ndarray
,
pygpu
.
gpuarray
.
GpuArray
)):
raise
TypeError
(
'ndarray or GpuArray required'
)
raise
TypeError
(
'ndarray or GpuArray required'
)
try
:
get_context
(
context_name
)
except
ValueError
:
# Don't make this a hard error if we attempt to make a shared
# variable while there is no default context.
if
context_name
is
None
:
raise
TypeError
(
'No default context and no context specified'
)
raise
if
broadcastable
is
None
:
if
broadcastable
is
None
:
broadcastable
=
(
False
,)
*
value
.
ndim
broadcastable
=
(
False
,)
*
value
.
ndim
type
=
GpuArrayType
(
value
.
dtype
,
broadcastable
)
type
=
GpuArrayType
(
value
.
dtype
,
broadcastable
,
context_name
=
context_name
)
deviceval
=
pygpu
.
gpuarray
.
array
(
value
,
copy
=
(
not
borrow
))
deviceval
=
pygpu
.
gpuarray
.
array
(
value
,
copy
=
(
not
borrow
),
context
=
type
.
context
)
return
GpuArraySharedVariable
(
type
=
type
,
value
=
deviceval
,
name
=
name
,
return
GpuArraySharedVariable
(
type
=
type
,
value
=
deviceval
,
name
=
name
,
strict
=
strict
)
strict
=
strict
)
...
@@ -485,3 +582,63 @@ theano.compile.register_specify_shape_c_code(
...
@@ -485,3 +582,63 @@ theano.compile.register_specify_shape_c_code(
"""
,
"""
,
version
=
1
,
version
=
1
,
c_support_code_apply
=
'#include <numpy_compat.h>'
)
c_support_code_apply
=
'#include <numpy_compat.h>'
)
class GpuContextType(Type):
    """
    Type whose values are `gpuarray.GpuContext` objects.

    Every instance of this type compares equal to every other and
    they all share one hash. In generated C code a value is held as a
    ``PyGpuContextObject *``.

    """

    def filter(self, data, strict=False, allow_downcast=None):
        # GpuContext instances pass through untouched; anything else
        # is rejected.
        if isinstance(data, gpuarray.GpuContext):
            return data
        raise TypeError('context is not a GpuContext')

    def __eq__(self, other):
        own_class = type(self)
        return own_class == type(other)

    def __hash__(self):
        own_class = type(self)
        return hash(own_class)

    @staticmethod
    def values_eq(a, b):
        return a == b

    def c_declare(self, name, sub, check_input=True):
        declaration = "PyGpuContextObject *%s;" % (name,)
        return declaration

    def c_init(self, name, sub):
        initialisation = "%s = NULL;" % (name,)
        return initialisation

    def c_extract(self, name, sub, check_input=True):
        # Optional type check on the incoming Python object.
        type_check = ""
        if check_input:
            type_check = """
if (!PyObject_TypeCheck(py_%(name)s, &PyGpuContextType)) {
PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
%(fail)s
}
""" % {'name': name, 'fail': sub['fail']}
        # Cast the Python object and take a reference to it.
        take_reference = """
%(name)s = (PyGpuContextObject *)py_%(name)s;
Py_INCREF(%(name)s);
""" % {'name': name}
        return type_check + take_reference

    def c_cleanup(self, name, sub):
        template = "Py_XDECREF(%(name)s); %(name)s = NULL;"
        return template % {'name': name}

    # c_sync is intentionally not declared to prevent normal usage

    def c_init_code(self):
        return ['import_pygpu__gpuarray();']

    def c_headers(self):
        return ['<gpuarray_api.h>']

    def c_header_dirs(self):
        include_dir = pygpu.get_include()
        return [include_dir]

    def c_code_cache_version(self):
        # The first element of pygpu's API version is part of the
        # cache key.
        api_ver = pygpu.gpuarray.api_version()
        return (0, api_ver[0])
# Variable, Constant, ... not declared
gpu_context_type
=
GpuContextType
()
theano/sandbox/rng_mrg.py
浏览文件 @
4814cd99
...
@@ -771,6 +771,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -771,6 +771,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version
# GpuArray version
_f16_ok
=
True
_f16_ok
=
True
def get_context(self, node):
    """Return the context of the type of the node's first input."""
    first_input = node.inputs[0]
    return first_input.type.context
@classmethod
@classmethod
def
new
(
cls
,
rstate
,
ndim
,
dtype
,
size
):
def
new
(
cls
,
rstate
,
ndim
,
dtype
,
size
):
v_size
=
as_tensor_variable
(
size
)
v_size
=
as_tensor_variable
(
size
)
...
...
theano/scan_module/scan_opt.py
浏览文件 @
4814cd99
...
@@ -1014,9 +1014,9 @@ class ScanInplaceOptimizer(Optimizer):
...
@@ -1014,9 +1014,9 @@ class ScanInplaceOptimizer(Optimizer):
"""
"""
def
__init__
(
self
,
type
Constructo
r
=
None
,
gpu_flag
=
False
,
gpua_flag
=
False
):
def
__init__
(
self
,
type
Infe
r
=
None
,
gpu_flag
=
False
,
gpua_flag
=
False
):
Optimizer
.
__init__
(
self
)
Optimizer
.
__init__
(
self
)
self
.
type
Constructor
=
typeConstructo
r
self
.
type
Infer
=
typeInfe
r
self
.
gpu_flag
=
gpu_flag
self
.
gpu_flag
=
gpu_flag
self
.
gpua_flag
=
gpua_flag
self
.
gpua_flag
=
gpua_flag
...
@@ -1062,10 +1062,15 @@ class ScanInplaceOptimizer(Optimizer):
...
@@ -1062,10 +1062,15 @@ class ScanInplaceOptimizer(Optimizer):
ls
[
idx
]
=
deep_copy_op
(
ls
[
idx
])
ls
[
idx
]
=
deep_copy_op
(
ls
[
idx
])
inputs
=
ls_begin
+
ls
+
ls_end
inputs
=
ls_begin
+
ls
+
ls_end
if
self
.
typeInfer
is
None
:
typeConstructor
=
None
else
:
typeConstructor
=
self
.
typeInfer
(
node
)
new_op
=
scan_op
.
Scan
(
op
.
inputs
,
new_op
=
scan_op
.
Scan
(
op
.
inputs
,
op
.
outputs
,
op
.
outputs
,
info
,
info
,
typeConstructor
=
self
.
typeConstructor
)
typeConstructor
=
typeConstructor
)
# Do not call make_node for test_value
# Do not call make_node for test_value
new_outs
=
new_op
(
*
inputs
,
**
dict
(
return_list
=
True
))
new_outs
=
new_op
(
*
inputs
,
**
dict
(
return_list
=
True
))
...
@@ -2325,7 +2330,7 @@ scan_eqopt2 = theano.gof.EquilibriumDB()
...
@@ -2325,7 +2330,7 @@ scan_eqopt2 = theano.gof.EquilibriumDB()
optdb
.
register
(
'scan_eqopt1'
,
scan_eqopt1
,
.
1
,
'fast_run'
,
'scan'
)
optdb
.
register
(
'scan_eqopt1'
,
scan_eqopt1
,
.
1
,
'fast_run'
,
'scan'
)
optdb
.
register
(
'scan_eqopt2'
,
scan_eqopt2
,
1.6
,
'fast_run'
,
'scan'
)
optdb
.
register
(
'scan_eqopt2'
,
scan_eqopt2
,
1.6
,
'fast_run'
,
'scan'
)
optdb
.
register
(
'scanOp_make_inplace'
,
optdb
.
register
(
'scanOp_make_inplace'
,
ScanInplaceOptimizer
(
type
Constructo
r
=
None
,
ScanInplaceOptimizer
(
type
Infe
r
=
None
,
gpu_flag
=
False
),
gpu_flag
=
False
),
75
,
75
,
'fast_run'
,
'fast_run'
,
...
...
theano/scan_module/tests/test_scan.py
浏览文件 @
4814cd99
...
@@ -4874,6 +4874,12 @@ class T_Scan_Gpuarray(unittest.TestCase, ScanGpuTests):
...
@@ -4874,6 +4874,12 @@ class T_Scan_Gpuarray(unittest.TestCase, ScanGpuTests):
def
__init__
(
self
,
*
args
,
**
kwargs
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
from
theano.sandbox
import
gpuarray
from
theano.sandbox
import
gpuarray
self
.
gpu_backend
=
gpuarray
self
.
gpu_backend
=
gpuarray
# This is unfortunate, but required
def
gpu_from_host
(
v
):
return
gpuarray
.
GpuFromHost
(
None
)(
v
)
self
.
gpu_backend
.
gpu_from_host
=
gpu_from_host
self
.
mode_with_gpu
=
mode_with_opt
.
including
(
'gpuarray'
,
'scan'
)
self
.
mode_with_gpu
=
mode_with_opt
.
including
(
'gpuarray'
,
'scan'
)
self
.
mode_with_gpu_nodebug
=
mode_nodebug
.
including
(
'gpuarray'
,
'scan'
)
self
.
mode_with_gpu_nodebug
=
mode_nodebug
.
including
(
'gpuarray'
,
'scan'
)
super
(
T_Scan_Gpuarray
,
self
)
.
__init__
(
*
args
,
**
kwargs
)
super
(
T_Scan_Gpuarray
,
self
)
.
__init__
(
*
args
,
**
kwargs
)
...
...
theano/tests/test_flake8.py
浏览文件 @
4814cd99
...
@@ -158,10 +158,6 @@ whitelist_flake8 = [
...
@@ -158,10 +158,6 @@ whitelist_flake8 = [
"sandbox/linalg/__init__.py"
,
"sandbox/linalg/__init__.py"
,
"sandbox/linalg/tests/test_linalg.py"
,
"sandbox/linalg/tests/test_linalg.py"
,
"sandbox/gpuarray/__init__.py"
,
"sandbox/gpuarray/__init__.py"
,
"sandbox/gpuarray/tests/test_subtensor.py"
,
"sandbox/gpuarray/tests/test_scan.py"
,
"sandbox/gpuarray/tests/test_opt.py"
,
"sandbox/gpuarray/tests/test_elemwise.py"
,
"scan_module/scan_utils.py"
,
"scan_module/scan_utils.py"
,
"scan_module/scan_views.py"
,
"scan_module/scan_views.py"
,
"scan_module/scan.py"
,
"scan_module/scan.py"
,
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论