Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
c1366d70
提交
c1366d70
authored
4月 03, 2012
作者:
lamblin
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #590 from nouiz/test_fix
Test fix
上级
a2027539
72986083
显示空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
297 行增加
和
121 行删除
+297
-121
do_nightly_build
theano/misc/do_nightly_build
+1
-1
pycuda_example.py
theano/misc/pycuda_example.py
+230
-80
test_pycuda_example.py
theano/misc/tests/test_pycuda_example.py
+62
-39
test_pycuda_theano_simple.py
theano/misc/tests/test_pycuda_theano_simple.py
+4
-1
没有找到文件。
theano/misc/do_nightly_build
浏览文件 @
c1366d70
...
@@ -59,7 +59,7 @@ echo "Number of elements in the compiledir:"
...
@@ -59,7 +59,7 @@ echo "Number of elements in the compiledir:"
ls
${
COMPILEDIR
}
|wc
-l
ls
${
COMPILEDIR
}
|wc
-l
echo
"Executing nosetests with mode=FAST_RUN"
echo
"Executing nosetests with mode=FAST_RUN"
THEANO_FLAGS
=
${
FLAGS
}
,mode
=
FAST_RUN
${
NOSETESTS
}
${
PROFILING
}
${
ARGS
}
THEANO_FLAGS
=
cmodule.warn_no_version
=
True,
${
FLAGS
}
,mode
=
FAST_RUN
${
NOSETESTS
}
${
PROFILING
}
${
ARGS
}
echo
"Number of elements in the compiledir:"
echo
"Number of elements in the compiledir:"
ls
${
COMPILEDIR
}
|wc
-l
ls
${
COMPILEDIR
}
|wc
-l
...
...
theano/misc/pycuda_example.py
浏览文件 @
c1366d70
"""
"""
This file show how we can use Pycuda compiled fct in a Theano
This file show how we can use Pycuda compiled fct in a Theano
Op. Do no use those op in production code. See the TODO.
Op. Do no use those op in production code. See the TODO.
You can use them as a guide to use your pycuda code into a Theano op.
You can use them as a guide to use your pycuda code into a Theano op.
The PycudaElemwiseSourceModuleOp is a Theano op use pycuda code generated with pycuda.compiler.SourceModule
The PycudaElemwiseSourceModuleOp is a Theano op use pycuda code
generated with pycuda.compiler.SourceModule
The PycudaElemwiseKernelOp op use pycuda code generated with pycuda.elementwise.ElementwiseKernel. It must be wrapper by TheanoElementwiseKernel.
Their is a test in test_pycuda.py.
Their is a test in test_pycuda.py.
This don't work with broadcast and non-contiguous memory as pycuda don't support that, but we make sure we don't introduce problem.
This don't work with broadcast and non-contiguous memory as pycuda
don't support that, but we make sure we don't introduce problem.
If the memory is non-contiguous, we create a new copy that is contiguous.
If the memory is non-contiguous, we create a new copy that is contiguous.
If their is broadcasted dimensions, we raise an error.
If their is broadcasted dimensions, we raise an error.
#The following is commented as it work only with old pycuda version
The PycudaElemwiseKernelOp op use pycuda code generated with
pycuda.elementwise.ElementwiseKernel. It must be wrapper by
TheanoElementwiseKernel.
"""
"""
import
numpy
import
numpy
...
@@ -19,7 +25,8 @@ import numpy
...
@@ -19,7 +25,8 @@ import numpy
import
theano
import
theano
from
theano.gof
import
Op
,
Apply
,
local_optimizer
,
EquilibriumDB
from
theano.gof
import
Op
,
Apply
,
local_optimizer
,
EquilibriumDB
from
theano.sandbox.cuda
import
GpuElemwise
,
CudaNdarrayType
,
GpuOp
from
theano.sandbox.cuda
import
GpuElemwise
,
CudaNdarrayType
,
GpuOp
from
theano.sandbox.cuda.basic_ops
import
as_cuda_ndarray_variable
,
gpu_contiguous
from
theano.sandbox.cuda.basic_ops
import
(
as_cuda_ndarray_variable
,
gpu_contiguous
)
from
theano.sandbox.cuda.opt
import
gpu_seqopt
from
theano.sandbox.cuda.opt
import
gpu_seqopt
import
pycuda_init
import
pycuda_init
...
@@ -30,30 +37,36 @@ import pycuda
...
@@ -30,30 +37,36 @@ import pycuda
from
pycuda.elementwise
import
ElementwiseKernel
from
pycuda.elementwise
import
ElementwiseKernel
from
pycuda.compiler
import
SourceModule
from
pycuda.compiler
import
SourceModule
from
pycuda.tools
import
VectorArg
from
pycuda.tools
import
VectorArg
import
pycuda.gpuarray
def
theano_parse_c_arg
(
c_arg
):
def
theano_parse_c_arg
(
c_arg
):
c_arg
=
c_arg
.
replace
(
'npy_float32'
,
'float'
)
c_arg
=
c_arg
.
replace
(
'npy_float32'
,
'float'
)
c_arg
=
c_arg
.
replace
(
'npy_float64'
,
'double'
)
c_arg
=
c_arg
.
replace
(
'npy_float64'
,
'double'
)
c_arg
=
c_arg
.
replace
(
'npy_int32'
,
'int'
)
c_arg
=
c_arg
.
replace
(
'npy_int32'
,
'int'
)
c_arg
=
c_arg
.
replace
(
'npy_int8'
,
'char'
)
c_arg
=
c_arg
.
replace
(
'npy_int8'
,
'char'
)
c_arg
=
c_arg
.
replace
(
'npy_ucs4'
,
'unsigned int'
)
c_arg
=
c_arg
.
replace
(
'npy_ucs4'
,
'unsigned int'
)
c_arg
=
c_arg
.
replace
(
'npy_uint32'
,
'unsigned int'
)
c_arg
=
c_arg
.
replace
(
'npy_uint32'
,
'unsigned int'
)
c_arg
=
c_arg
.
replace
(
'npy_uint16'
,
'unsigned short'
)
c_arg
=
c_arg
.
replace
(
'npy_uint16'
,
'unsigned short'
)
c_arg
=
c_arg
.
replace
(
'npy_uint8'
,
'unsigned char'
)
c_arg
=
c_arg
.
replace
(
'npy_uint8'
,
'unsigned char'
)
return
pycuda
.
tools
.
parse_c_arg
(
c_arg
)
return
pycuda
.
tools
.
parse_c_arg
(
c_arg
)
"""
class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
def __init__(self, arguments, operation,
def __init__(self, arguments, operation,
name="kernel", keep=False, options=[], **kwargs):
name="kernel", keep=False, options=[], **kwargs):
if isinstance(arguments, basestring):
if isinstance(arguments, basestring):
arguments
=
[
theano_parse_c_arg
(
arg
)
for
arg
in
arguments
.
split
(
","
)]
arguments = [theano_parse_c_arg(arg)
pycuda
.
elementwise
.
ElementwiseKernel
.
__init__
(
self
,
arguments
,
operation
,
name
,
keep
,
options
,
**
kwargs
)
for arg in arguments.split(",")]
pycuda.elementwise.ElementwiseKernel.__init__(self, arguments,
operation, name, keep,
options, **kwargs)
def __call__(self, *args):
def __call__(self, *args):
vectors = []
vectors = []
invocation_args = []
invocation_args = []
for
arg
,
arg_descr
in
zip
(
args
,
self
.
arguments
):
for arg, arg_descr in zip(args, self.
gen_kwargs["arguments"]
):
if isinstance(arg_descr, VectorArg):
if isinstance(arg_descr, VectorArg):
vectors.append(arg)
vectors.append(arg)
invocation_args.append(arg.gpudata)
invocation_args.append(arg.gpudata)
...
@@ -62,7 +75,7 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
...
@@ -62,7 +75,7 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
repr_vec = vectors[0]
repr_vec = vectors[0]
invocation_args.append(repr_vec.mem_size)
invocation_args.append(repr_vec.mem_size)
if
hasattr
(
repr_vec
,
"_block"
)
and
hasattr
(
repr_vec
,
"_grid"
):
if hasattr(repr_vec,
"_block") and hasattr(repr_vec,
"_grid"):
self.func.set_block_shape(*repr_vec._block)
self.func.set_block_shape(*repr_vec._block)
self.func.prepared_call(repr_vec._grid, *invocation_args)
self.func.prepared_call(repr_vec._grid, *invocation_args)
else:
else:
...
@@ -71,26 +84,120 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
...
@@ -71,26 +84,120 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
self.func.prepared_call(_grid, *invocation_args)
self.func.prepared_call(_grid, *invocation_args)
class PycudaElemwiseKernelOp(GpuOp):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern={}, name=None):
self.name = name
self.scalar_op = scalar_op
self.inplace_pattern = None
def __str__(self):
if self.name is None:
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return self.__class__.__name__ + "{
%
s}
%
s"
%
(self.scalar_op,
str(items))
else:
return self.__class__.__name__ + "{
%
s}"
%
(self.scalar_op)
else:
return self.name
def __eq__(self, other):
return (type(self) == type(other) and
self.scalar_op == other.scalar_op and
self.inplace_pattern == other.inplace_pattern)
def __hash__(self):
return (hash(type(self)) ^ hash(self.scalar_op) ^
hash(self.inplace_pattern))
def make_node(self, *inputs):
_inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs) == 2 # TODO remove
# output is broadcastable only along dimensions where all inputs are
# broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
otype = CudaNdarrayType(broadcastable=broadcastable)
assert self.nout == 1
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name",
tuple([n + "[i]"for n in in_name]),
tuple(n + "[i]"for n in out_name), {})
self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs, out_name))]),
c_code,
"pycuda_elemwise_kernel_
%
s"
%
str(self.scalar_op),
preamble=("#include<Python.h>
\n
"
"#include <numpy/arrayobject.h>"))
return out_node
def perform(self, node, inputs, out):
#TODO assert all input have the same shape
z, = out
if z[0] is None or z[0].shape != inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
i = inputs + z
self.pycuda_fct(*i)
"""
class
PycudaElemwiseSourceModuleOp
(
GpuOp
):
class
PycudaElemwiseSourceModuleOp
(
GpuOp
):
nin
=
property
(
lambda
self
:
self
.
scalar_op
.
nin
)
nin
=
property
(
lambda
self
:
self
.
scalar_op
.
nin
)
nout
=
property
(
lambda
self
:
self
.
scalar_op
.
nout
)
nout
=
property
(
lambda
self
:
self
.
scalar_op
.
nout
)
def
__init__
(
self
,
scalar_op
,
inplace_pattern
=
{},
name
=
None
):
def
__init__
(
self
,
scalar_op
,
inplace_pattern
=
{},
name
=
None
):
self
.
name
=
name
self
.
name
=
name
self
.
scalar_op
=
scalar_op
self
.
scalar_op
=
scalar_op
self
.
inplace_pattern
=
None
self
.
inplace_pattern
=
None
def
__str__
(
self
):
def
__str__
(
self
):
if
self
.
name
is
None
:
if
self
.
name
is
None
:
if
self
.
inplace_pattern
:
if
self
.
inplace_pattern
:
items
=
self
.
inplace_pattern
.
items
()
items
=
self
.
inplace_pattern
.
items
()
items
.
sort
()
items
.
sort
()
return
self
.
__class__
.
__name__
+
"{
%
s}
%
s"
%
(
self
.
scalar_op
,
str
(
items
))
return
self
.
__class__
.
__name__
+
"{
%
s}
%
s"
%
(
self
.
scalar_op
,
str
(
items
))
else
:
else
:
return
self
.
__class__
.
__name__
+
"{
%
s}"
%
(
self
.
scalar_op
)
return
self
.
__class__
.
__name__
+
"{
%
s}"
%
(
self
.
scalar_op
)
else
:
else
:
return
self
.
name
return
self
.
name
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
and
self
.
scalar_op
==
other
.
scalar_op
and
self
.
inplace_pattern
==
other
.
inplace_pattern
)
def
__hash__
(
self
):
return
(
hash
(
type
(
self
))
^
hash
(
self
.
scalar_op
)
^
hash
(
self
.
inplace_pattern
))
def
make_node
(
self
,
*
inputs
):
def
make_node
(
self
,
*
inputs
):
_inputs
=
[
gpu_contiguous
(
as_cuda_ndarray_variable
(
i
))
for
i
in
inputs
]
_inputs
=
[
gpu_contiguous
(
as_cuda_ndarray_variable
(
i
))
for
i
in
inputs
]
if
self
.
nin
>
0
and
len
(
_inputs
)
!=
self
.
nin
:
if
self
.
nin
>
0
and
len
(
_inputs
)
!=
self
.
nin
:
...
@@ -101,17 +208,23 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
...
@@ -101,17 +208,23 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
if
any
([
any
(
i
.
type
.
broadcastable
)
for
i
in
inputs
]):
if
any
([
any
(
i
.
type
.
broadcastable
)
for
i
in
inputs
]):
raise
Exception
(
"pycuda don't support broadcasted dimensions"
)
raise
Exception
(
"pycuda don't support broadcasted dimensions"
)
assert
len
(
inputs
)
==
2
#
TODO remove
assert
len
(
inputs
)
==
2
#
TODO remove
otype
=
CudaNdarrayType
(
broadcastable
=
[
False
]
*
_inputs
[
0
]
.
type
.
ndim
)
otype
=
CudaNdarrayType
(
broadcastable
=
[
False
]
*
_inputs
[
0
]
.
type
.
ndim
)
assert
self
.
nout
==
1
assert
self
.
nout
==
1
fct_name
=
"pycuda_elemwise_
%
s"
%
str
(
self
.
scalar_op
)
fct_name
=
"pycuda_elemwise_
%
s"
%
str
(
self
.
scalar_op
)
out_node
=
Apply
(
self
,
_inputs
,
[
otype
()
for
o
in
xrange
(
self
.
nout
)])
out_node
=
Apply
(
self
,
_inputs
,
[
otype
()
for
o
in
xrange
(
self
.
nout
)])
in_name
=
[
"i"
+
str
(
id
)
for
id
in
range
(
len
(
inputs
))]
in_name
=
[
"i"
+
str
(
id
)
for
id
in
range
(
len
(
inputs
))]
out_name
=
[
"o"
+
str
(
id
)
for
id
in
range
(
self
.
nout
)]
out_name
=
[
"o"
+
str
(
id
)
for
id
in
range
(
self
.
nout
)]
c_code
=
self
.
scalar_op
.
c_code
(
out_node
,
"some_name"
,
tuple
([
n
+
"[i]"
for
n
in
in_name
]),
tuple
(
n
+
"[i]"
for
n
in
out_name
),
{})
c_code
=
self
.
scalar_op
.
c_code
(
out_node
,
"some_name"
,
c_code_param
=
", "
.
join
([
var
.
type
.
dtype_specs
()[
1
]
+
" *"
+
name
for
var
,
name
in
zip
(
inputs
,
in_name
)
+
zip
(
out_node
.
outputs
,
out_name
)]
+
[
"int size"
])
tuple
([
n
+
"[i]"
for
n
in
in_name
]),
tuple
(
n
+
"[i]"
for
n
in
out_name
),
{})
c_code_param
=
", "
.
join
([
var
.
type
.
dtype_specs
()[
1
]
+
" *"
+
name
for
var
,
name
in
(
zip
(
inputs
,
in_name
)
+
zip
(
out_node
.
outputs
,
out_name
))]
+
[
"int size"
])
mod
=
SourceModule
(
"""
mod
=
SourceModule
(
"""
#include<Python.h>
#include<Python.h>
#include <numpy/arrayobject.h>
#include <numpy/arrayobject.h>
...
@@ -123,7 +236,7 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
...
@@ -123,7 +236,7 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
%
s
%
s
}
}
}
}
"""
%
(
fct_name
,
c_code_param
,
c_code
))
"""
%
(
fct_name
,
c_code_param
,
c_code
))
self
.
pycuda_fct
=
mod
.
get_function
(
fct_name
)
self
.
pycuda_fct
=
mod
.
get_function
(
fct_name
)
return
out_node
return
out_node
...
@@ -131,41 +244,46 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
...
@@ -131,41 +244,46 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
#TODO support broadcast!
#TODO support broadcast!
#TODO assert all input have the same shape
#TODO assert all input have the same shape
z
,
=
out
z
,
=
out
if
z
[
0
]
is
None
or
z
[
0
]
.
shape
!=
inputs
[
0
]
.
shape
:
if
z
[
0
]
is
None
or
z
[
0
]
.
shape
!=
inputs
[
0
]
.
shape
:
z
[
0
]
=
theano
.
sandbox
.
cuda
.
CudaNdarray
.
zeros
(
inputs
[
0
]
.
shape
)
z
[
0
]
=
theano
.
sandbox
.
cuda
.
CudaNdarray
.
zeros
(
inputs
[
0
]
.
shape
)
if
inputs
[
0
]
.
shape
!=
inputs
[
1
]
.
shape
:
if
inputs
[
0
]
.
shape
!=
inputs
[
1
]
.
shape
:
raise
TypeError
(
"PycudaElemwiseSourceModuleOp: inputs don't have the same shape!"
)
raise
TypeError
(
"PycudaElemwiseSourceModuleOp:"
" inputs don't have the same shape!"
)
if
inputs
[
0
]
.
size
>
512
:
if
inputs
[
0
]
.
size
>
512
:
grid
=
(
int
(
numpy
.
ceil
(
inputs
[
0
]
.
size
/
512.
)),
1
)
grid
=
(
int
(
numpy
.
ceil
(
inputs
[
0
]
.
size
/
512.
)),
1
)
block
=
(
512
,
1
,
1
)
block
=
(
512
,
1
,
1
)
else
:
else
:
grid
=
(
1
,
1
)
grid
=
(
1
,
1
)
block
=
(
inputs
[
0
]
.
shape
[
0
],
inputs
[
0
]
.
shape
[
1
],
1
)
block
=
(
inputs
[
0
]
.
shape
[
0
],
inputs
[
0
]
.
shape
[
1
],
1
)
self
.
pycuda_fct
(
inputs
[
0
],
inputs
[
1
],
z
[
0
],
numpy
.
intc
(
inputs
[
1
]
.
size
),
block
=
block
,
grid
=
grid
)
self
.
pycuda_fct
(
inputs
[
0
],
inputs
[
1
],
z
[
0
],
numpy
.
intc
(
inputs
[
1
]
.
size
),
block
=
block
,
grid
=
grid
)
class
PycudaElemwise
KernelOp
(
Gpu
Op
):
class
PycudaElemwise
SourceModuleMakeThunkOp
(
Op
):
nin
=
property
(
lambda
self
:
self
.
scalar_op
.
nin
)
nin
=
property
(
lambda
self
:
self
.
scalar_op
.
nin
)
nout
=
property
(
lambda
self
:
self
.
scalar_op
.
nout
)
nout
=
property
(
lambda
self
:
self
.
scalar_op
.
nout
)
def
__init__
(
self
,
scalar_op
,
inplace_pattern
=
{},
name
=
None
):
def
__init__
(
self
,
scalar_op
,
inplace_pattern
=
{},
name
=
None
):
self
.
name
=
name
self
.
name
=
name
self
.
scalar_op
=
scalar_op
self
.
scalar_op
=
scalar_op
self
.
inplace_pattern
=
None
self
.
inplace_pattern
=
None
def
__str__
(
self
):
def
__str__
(
self
):
if
self
.
name
is
None
:
if
self
.
name
is
None
:
if
self
.
inplace_pattern
:
if
self
.
inplace_pattern
:
items
=
self
.
inplace_pattern
.
items
()
items
=
self
.
inplace_pattern
.
items
()
items
.
sort
()
items
.
sort
()
return
self
.
__class__
.
__name__
+
"{
%
s}
%
s"
%
(
self
.
scalar_op
,
str
(
items
))
return
self
.
__class__
.
__name__
+
"{
%
s}
%
s"
%
(
self
.
scalar_op
,
str
(
items
))
else
:
else
:
return
self
.
__class__
.
__name__
+
"{
%
s}"
%
(
self
.
scalar_op
)
return
self
.
__class__
.
__name__
+
"{
%
s}"
%
(
self
.
scalar_op
)
else
:
else
:
return
self
.
name
return
self
.
name
def
make_node
(
self
,
*
inputs
):
def
make_node
(
self
,
*
inputs
):
assert
self
.
nout
==
1
assert
len
(
inputs
)
==
2
# TODO remove
_inputs
=
[
gpu_contiguous
(
as_cuda_ndarray_variable
(
i
))
for
i
in
inputs
]
_inputs
=
[
gpu_contiguous
(
as_cuda_ndarray_variable
(
i
))
for
i
in
inputs
]
if
self
.
nin
>
0
and
len
(
_inputs
)
!=
self
.
nin
:
if
self
.
nin
>
0
and
len
(
_inputs
)
!=
self
.
nin
:
raise
TypeError
(
'Wrong argument count'
,
(
self
.
nin
,
len
(
_inputs
)))
raise
TypeError
(
'Wrong argument count'
,
(
self
.
nin
,
len
(
_inputs
)))
...
@@ -175,57 +293,86 @@ class PycudaElemwiseKernelOp(GpuOp):
...
@@ -175,57 +293,86 @@ class PycudaElemwiseKernelOp(GpuOp):
if
any
([
any
(
i
.
type
.
broadcastable
)
for
i
in
inputs
]):
if
any
([
any
(
i
.
type
.
broadcastable
)
for
i
in
inputs
]):
raise
Exception
(
"pycuda don't support broadcasted dimensions"
)
raise
Exception
(
"pycuda don't support broadcasted dimensions"
)
assert
len
(
inputs
)
==
2
#TODO remove
# output is broadcastable only along dimensions where all inputs are broadcastable
broadcastable
=
[]
for
d
in
xrange
(
_inputs
[
0
]
.
type
.
ndim
):
bcast_d
=
True
for
i
in
_inputs
:
if
not
i
.
type
.
broadcastable
[
d
]:
bcast_d
=
False
break
broadcastable
.
append
(
bcast_d
)
assert
len
(
broadcastable
)
==
_inputs
[
0
]
.
type
.
ndim
otype
=
CudaNdarrayType
(
broadcastable
=
broadcastable
)
assert
self
.
nout
==
1
otype
=
CudaNdarrayType
(
broadcastable
=
[
False
]
*
_inputs
[
0
]
.
type
.
ndim
)
out_node
=
Apply
(
self
,
_inputs
,
[
otype
()
for
o
in
xrange
(
self
.
nout
)])
out_node
=
Apply
(
self
,
_inputs
,
[
otype
()
for
o
in
xrange
(
self
.
nout
)])
in_name
=
[
"i"
+
str
(
id
)
for
id
in
range
(
len
(
inputs
))]
out_name
=
[
"o"
+
str
(
id
)
for
id
in
range
(
self
.
nout
)]
c_code
=
self
.
scalar_op
.
c_code
(
out_node
,
"some_name"
,
tuple
([
n
+
"[i]"
for
n
in
in_name
]),
tuple
(
n
+
"[i]"
for
n
in
out_name
),
{})
self
.
pycuda_fct
=
TheanoElementwiseKernel
(
", "
.
join
([
var
.
type
.
dtype_specs
()[
1
]
+
" *"
+
name
for
var
,
name
in
zip
(
inputs
,
in_name
)
+
zip
(
out_node
.
outputs
,
out_name
)]),
c_code
,
"pycuda_elemwise_kernel_
%
s"
%
str
(
self
.
scalar_op
),
preamble
=
"""#include<Python.h>
#include <numpy/arrayobject.h>"""
)
return
out_node
return
out_node
def
perform
(
self
,
node
,
inputs
,
out
):
def
make_thunk
(
self
,
node
,
storage_map
,
_
,
_2
):
#TODO support broadcast!
#TODO assert all input have the same shape
#TODO assert all input have the same shape
z
,
=
out
fct_name
=
"pycuda_elemwise_
%
s"
%
str
(
self
.
scalar_op
)
if
z
[
0
]
is
None
or
z
[
0
]
.
shape
!=
inputs
[
0
]
.
shape
:
in_name
=
[
"i"
+
str
(
id
)
for
id
in
range
(
len
(
node
.
inputs
))]
z
[
0
]
=
theano
.
sandbox
.
cuda
.
CudaNdarray
.
zeros
(
inputs
[
0
]
.
shape
)
out_name
=
[
"o"
+
str
(
id
)
for
id
in
range
(
self
.
nout
)]
i
=
inputs
+
z
self
.
pycuda_fct
(
*
i
)
c_code
=
self
.
scalar_op
.
c_code
(
node
,
"some_name"
,
tuple
([
n
+
"[i]"
for
n
in
in_name
]),
tuple
(
n
+
"[i]"
for
n
in
out_name
),
{})
c_code_param
=
", "
.
join
([
var
.
type
.
dtype_specs
()[
1
]
+
" *"
+
name
for
var
,
name
in
zip
(
node
.
inputs
,
in_name
)
+
zip
(
node
.
outputs
,
out_name
)]
+
[
"int size"
])
mod
=
SourceModule
(
"""
#include<Python.h>
#include <numpy/arrayobject.h>
__global__ void
%
s(
%
s)
{
int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
i += threadIdx.x + threadIdx.y*blockDim.x;
if(i<size){
%
s
}
}
"""
%
(
fct_name
,
c_code_param
,
c_code
))
pycuda_fct
=
mod
.
get_function
(
fct_name
)
inputs
=
[
storage_map
[
v
]
for
v
in
node
.
inputs
]
outputs
=
[
storage_map
[
v
]
for
v
in
node
.
outputs
]
def
thunk
():
z
=
outputs
[
0
]
if
z
[
0
]
is
None
or
z
[
0
]
.
shape
!=
inputs
[
0
][
0
]
.
shape
:
z
[
0
]
=
theano
.
sandbox
.
cuda
.
CudaNdarray
.
zeros
(
inputs
[
0
][
0
]
.
shape
)
if
inputs
[
0
][
0
]
.
shape
!=
inputs
[
1
][
0
]
.
shape
:
raise
TypeError
(
"PycudaElemwiseSourceModuleMakeThunkOp:"
" inputs don't have the same shape!"
)
if
inputs
[
0
][
0
]
.
size
>
512
:
grid
=
(
int
(
numpy
.
ceil
(
inputs
[
0
][
0
]
.
size
/
512.
)),
1
)
block
=
(
512
,
1
,
1
)
else
:
grid
=
(
1
,
1
)
block
=
(
inputs
[
0
][
0
]
.
shape
[
0
],
inputs
[
0
][
0
]
.
shape
[
1
],
1
)
out
=
pycuda_fct
(
inputs
[
0
][
0
],
inputs
[
1
][
0
],
z
[
0
],
numpy
.
intc
(
inputs
[
1
][
0
]
.
size
),
block
=
block
,
grid
=
grid
)
thunk
.
inputs
=
inputs
thunk
.
outputs
=
outputs
thunk
.
lazy
=
False
return
thunk
pycuda_optimizer
=
EquilibriumDB
()
pycuda_optimizer
=
EquilibriumDB
()
gpu_seqopt
.
register
(
"pycuda_optimizer"
,
pycuda_optimizer
,
1.5
,
"fast_run"
)
gpu_seqopt
.
register
(
"pycuda_optimizer"
,
pycuda_optimizer
,
1.5
,
"fast_run"
)
@local_optimizer
([])
@local_optimizer
([])
def
local_pycuda_gpu_elemwise
(
node
):
def
local_pycuda_gpu_elemwise
(
node
):
"""
"""
GpuElemwise -> PycudaElemwiseSourceModuleOp
GpuElemwise -> PycudaElemwiseSourceModuleOp
"""
"""
if
isinstance
(
node
.
op
,
GpuElemwise
):
if
isinstance
(
node
.
op
,
GpuElemwise
):
if
not
any
([
any
(
i
.
type
.
broadcastable
)
for
i
in
node
.
inputs
])
and
all
([
i
.
ndim
<=
2
for
i
in
node
.
inputs
]):
if
(
not
any
([
any
(
i
.
type
.
broadcastable
)
for
i
in
node
.
inputs
])
and
new_op
=
PycudaElemwiseSourceModuleOp
(
node
.
op
.
scalar_op
,
node
.
op
.
inplace_pattern
)(
*
node
.
inputs
)
all
([
i
.
ndim
<=
2
for
i
in
node
.
inputs
])):
new_op
=
PycudaElemwiseSourceModuleOp
(
node
.
op
.
scalar_op
,
node
.
op
.
inplace_pattern
)(
*
node
.
inputs
)
return
[
new_op
]
return
[
new_op
]
pycuda_optimizer
.
register
(
"local_pycuda_gpu_elemwise"
,
local_pycuda_gpu_elemwise
)
pycuda_optimizer
.
register
(
"local_pycuda_gpu_elemwise"
,
local_pycuda_gpu_elemwise
)
@local_optimizer
([])
@local_optimizer
([])
def
local_pycuda_gpu_elemwise_kernel
(
node
):
def
local_pycuda_gpu_elemwise_kernel
(
node
):
...
@@ -233,8 +380,11 @@ def local_pycuda_gpu_elemwise_kernel(node):
...
@@ -233,8 +380,11 @@ def local_pycuda_gpu_elemwise_kernel(node):
GpuElemwise -> PycudaElemwiseKernelOp
GpuElemwise -> PycudaElemwiseKernelOp
"""
"""
if
isinstance
(
node
.
op
,
GpuElemwise
):
if
isinstance
(
node
.
op
,
GpuElemwise
):
if
not
any
([
any
(
i
.
type
.
broadcastable
)
for
i
in
node
.
inputs
]):
if
not
any
([
any
(
i
.
type
.
broadcastable
)
for
i
in
node
.
inputs
]):
new_op
=
PycudaElemwiseKernelOp
(
node
.
op
.
scalar_op
,
node
.
op
.
inplace_pattern
)(
*
node
.
inputs
)
new_op
=
PycudaElemwiseKernelOp
(
node
.
op
.
scalar_op
,
node
.
op
.
inplace_pattern
)(
*
node
.
inputs
)
return
[
new_op
]
return
[
new_op
]
pycuda_optimizer
.
register
(
"local_pycuda_gpu_elemwise_kernel"
,
local_pycuda_gpu_elemwise_kernel
,
1.5
)
pycuda_optimizer
.
register
(
"local_pycuda_gpu_elemwise_kernel"
,
local_pycuda_gpu_elemwise_kernel
,
1.5
)
theano/misc/tests/test_pycuda_example.py
浏览文件 @
c1366d70
...
@@ -5,7 +5,8 @@ import theano.misc.pycuda_init
...
@@ -5,7 +5,8 @@ import theano.misc.pycuda_init
if
not
theano
.
misc
.
pycuda_init
.
pycuda_available
:
if
not
theano
.
misc
.
pycuda_init
.
pycuda_available
:
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
raise
SkipTest
(
"Pycuda not installed. Skip test of theano op with pycuda code."
)
raise
SkipTest
(
"Pycuda not installed. Skip test of theano op"
" with pycuda code."
)
import
theano.sandbox.cuda
as
cuda_ndarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
==
False
:
if
cuda_ndarray
.
cuda_available
==
False
:
...
@@ -14,71 +15,93 @@ if cuda_ndarray.cuda_available == False:
...
@@ -14,71 +15,93 @@ if cuda_ndarray.cuda_available == False:
import
theano
import
theano
import
theano.tensor
as
T
import
theano.tensor
as
T
from
theano.misc.pycuda_example
import
PycudaElemwiseSourceModuleOp
,
PycudaElemwiseKernelOp
,
PycudaElemwiseSourceModuleMakeThunkOp
from
theano.misc.pycuda_example
import
(
PycudaElemwiseSourceModuleOp
,
# PycudaElemwiseKernelOp,
PycudaElemwiseSourceModuleMakeThunkOp
)
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpu'
)
else
:
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpu'
)
def
test_pycuda_elemwise_source_module
():
def
test_pycuda_elemwise_source_module
():
for
shape
in
[(
5
,
5
),
(
10
,
49
),
(
50
,
49
),(
500
,
501
),(
5000
,
50
01
)]:
for
shape
in
[(
5
,
5
),
(
10
,
49
),
(
50
,
49
),
(
500
,
5
01
)]:
for
op
in
[
theano
.
scalar
.
basic
.
mul
,
theano
.
scalar
.
basic
.
add
]:
for
op
in
[
theano
.
scalar
.
basic
.
mul
,
theano
.
scalar
.
basic
.
add
]:
x
=
T
.
fmatrix
(
'x'
)
x
=
T
.
fmatrix
(
'x'
)
y
=
T
.
fmatrix
(
'y'
)
y
=
T
.
fmatrix
(
'y'
)
elemwise_op
=
theano
.
tensor
.
Elemwise
(
op
)
elemwise_op
=
theano
.
tensor
.
Elemwise
(
op
)
pycuda_op
=
PycudaElemwiseSourceModuleOp
(
op
)
pycuda_op
=
PycudaElemwiseSourceModuleOp
(
op
)
pycuda_op_thunk
=
PycudaElemwiseSourceModuleMakeThunkOp
(
op
)
pycuda_op_thunk
=
PycudaElemwiseSourceModuleMakeThunkOp
(
op
)
f
=
theano
.
function
([
x
,
y
],
elemwise_op
(
x
,
y
),
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
x
,
y
],
elemwise_op
(
x
,
y
),
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
x
,
y
],
theano
.
sandbox
.
cuda
.
host_from_gpu
(
pycuda_op
(
x
,
y
)))
f2
=
theano
.
function
([
x
,
y
],
f3
=
theano
.
function
([
x
,
y
],
elemwise_op
(
x
,
y
),
theano
.
sandbox
.
cuda
.
host_from_gpu
(
mode
=
mode_with_gpu
.
including
(
"local_pycuda_gpu_elemwise"
))
pycuda_op
(
x
,
y
)),
f4
=
theano
.
function
([
x
,
y
],
theano
.
sandbox
.
cuda
.
host_from_gpu
(
pycuda_op_thunk
(
x
,
y
)))
mode
=
mode_with_gpu
)
mode_pycuda
=
mode_with_gpu
.
including
(
"local_pycuda_gpu_elemwise"
)
f3
=
theano
.
function
([
x
,
y
],
elemwise_op
(
x
,
y
),
mode
=
mode_pycuda
)
f4
=
theano
.
function
([
x
,
y
],
theano
.
sandbox
.
cuda
.
host_from_gpu
(
pycuda_op_thunk
(
x
,
y
)),
mode
=
mode_with_gpu
)
assert
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
GpuElemwise
)
for
node
in
f
.
maker
.
env
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
GpuElemwise
)
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseSourceModuleOp
)
for
node
in
f2
.
maker
.
env
.
toposort
()])
for
node
in
f
.
maker
.
env
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseSourceModuleOp
)
for
node
in
f3
.
maker
.
env
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseSourceModuleOp
)
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseSourceModuleMakeThunkOp
)
for
node
in
f4
.
maker
.
env
.
toposort
()])
for
node
in
f2
.
maker
.
env
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseSourceModuleOp
)
for
node
in
f3
.
maker
.
env
.
toposort
()])
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseSourceModuleMakeThunkOp
)
for
node
in
f4
.
maker
.
env
.
toposort
()])
val1
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
val1
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
val2
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
val2
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
assert
(
f
(
val1
,
val2
)
==
f2
(
val1
,
val2
))
.
all
()
assert
(
f
(
val1
,
val2
)
==
f2
(
val1
,
val2
))
.
all
()
assert
(
f
(
val1
,
val2
)
==
f3
(
val1
,
val2
))
.
all
()
assert
(
f
(
val1
,
val2
)
==
f3
(
val1
,
val2
))
.
all
()
assert
(
f
(
val1
,
val2
)
==
f4
(
val1
,
val2
))
.
all
()
assert
(
f
(
val1
,
val2
)
==
f4
(
val1
,
val2
))
.
all
()
#print f(val1,val2)
#print f(val1,val2)
#print f2(val1,val2)
#print f2(val1,val2)
"""
#commented as it work only with old pycuda version.
def test_pycuda_elemwise_kernel():
def test_pycuda_elemwise_kernel():
x
=
T
.
fmatrix
(
'x'
)
x
=
T.fmatrix('x')
y
=
T
.
fmatrix
(
'y'
)
y
=
T.fmatrix('y')
f
=
theano
.
function
([
x
,
y
],
x
+
y
,
mode
=
mode_with_gpu
)
f
= theano.function([x, y], x +
y, mode=mode_with_gpu)
print f.maker.env.toposort()
print f.maker.env.toposort()
f2
=
theano
.
function
([
x
,
y
],
x
+
y
,
mode
=
mode_with_gpu
.
including
(
"local_pycuda_gpu_elemwise_kernel"
))
mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")
f2 = theano.function([x, y], x + y, mode=mode_pycuda)
print f2.maker.env.toposort()
print f2.maker.env.toposort()
assert
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
GpuElemwise
)
for
node
in
f
.
maker
.
env
.
toposort
()])
assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseKernelOp
)
for
node
in
f2
.
maker
.
env
.
toposort
()])
for node in f.maker.env.toposort()])
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f2.maker.env.toposort()])
val1
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
5
,
5
),
dtype
=
'float32'
)
val1 = numpy.asarray(numpy.random.rand(5,
5), dtype='float32')
val2
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
5
,
5
),
dtype
=
'float32'
)
val2 = numpy.asarray(numpy.random.rand(5,
5), dtype='float32')
#val1 = numpy.ones((5,5))
#val1 = numpy.ones((5,5))
#val2 = numpy.arange(25).reshape(5,5)
#val2 = numpy.arange(25).reshape(5,5)
assert
(
f
(
val1
,
val2
)
==
f2
(
val1
,
val2
))
.
all
()
assert (f(val1, val2) == f2(val1, val2)).all()
print
f
(
val1
,
val2
)
print f(val1, val2)
print
f2
(
val1
,
val2
)
print f2(val1, val2)
x3
=
T
.
ftensor3
(
'x'
)
x3
=
T.ftensor3('x')
y3
=
T
.
ftensor3
(
'y'
)
y3
=
T.ftensor3('y')
z3
=
T
.
ftensor3
(
'y'
)
z3
=
T.ftensor3('y')
f4
=
theano
.
function
([
x3
,
y3
,
z3
],
x3
*
y3
+
z3
,
mode
=
mode_with_gpu
.
including
(
"local_pycuda_gpu_elemwise_kernel"
)
)
f4 = theano.function([x3,
y3, z3], x3 * y3 + z3, mode=mode_pycuda
)
print f4.maker.env.toposort()
print f4.maker.env.toposort()
assert
any
([
isinstance
(
node
.
op
,
PycudaElemwiseKernelOp
)
for
node
in
f4
.
maker
.
env
.
toposort
()])
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f4.maker.env.toposort()])
val1
=
numpy
.
random
.
rand
(
2
,
2
,
2
)
val1 = numpy.random.rand(2,
2,
2)
print val1
print val1
print
f4
(
val1
,
val1
,
val1
)
print f4(val1, val1, val1)
assert
numpy
.
allclose
(
f4
(
val1
,
val1
,
val1
),
val1
*
val1
+
val1
)
assert numpy.allclose(f4(val1, val1, val1), val1 * val1 + val1)
"""
theano/misc/tests/test_pycuda_theano_simple.py
浏览文件 @
c1366d70
...
@@ -78,7 +78,10 @@ __global__ void multiply_them(float *dest, float *a, float *b)
...
@@ -78,7 +78,10 @@ __global__ void multiply_them(float *dest, float *a, float *b)
def
test_pycuda_memory_to_theano
():
def
test_pycuda_memory_to_theano
():
#Test that we can use the GpuArray memory space in pycuda in a CudaNdarray
#Test that we can use the GpuArray memory space in pycuda in a CudaNdarray
y
=
pycuda
.
gpuarray
.
zeros
((
3
,
4
,
5
),
'float32'
)
y
=
pycuda
.
gpuarray
.
zeros
((
3
,
4
,
5
),
'float32'
)
print
numpy
.
asarray
(
y
)
print
sys
.
getrefcount
(
y
)
# This increase the ref count with never pycuda. Do pycuda also
# cache ndarray?
# print y.get()
print
"gpuarray ref count before creating a CudaNdarray"
,
print
"gpuarray ref count before creating a CudaNdarray"
,
print
sys
.
getrefcount
(
y
)
print
sys
.
getrefcount
(
y
)
assert
sys
.
getrefcount
(
y
)
==
2
assert
sys
.
getrefcount
(
y
)
==
2
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论