Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
06cdb1fa
提交
06cdb1fa
authored
1月 30, 2014
作者:
abergeron
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1710 from nouiz/gpu_reduce
new Gpu reduce based on the version in Theano that is much faster.
上级
262f59a2
1121fe45
显示空白字符变更
内嵌
并排
正在显示
17 个修改的文件
包含
2115 行增加
和
163 行删除
+2115
-163
ctype.txt
doc/extending/ctype.txt
+17
-0
__init__.py
theano/compile/__init__.py
+1
-0
ops.py
theano/compile/ops.py
+88
-0
blas.py
theano/sandbox/cuda/blas.py
+4
-4
conv.cu
theano/sandbox/cuda/conv.cu
+1
-1
conv_kernel.cu
theano/sandbox/cuda/conv_kernel.cu
+1
-1
basic_ops.py
theano/sandbox/gpuarray/basic_ops.py
+0
-9
elemwise.py
theano/sandbox/gpuarray/elemwise.py
+1793
-12
opt.py
theano/sandbox/gpuarray/opt.py
+52
-20
test_basic_ops.py
theano/sandbox/gpuarray/tests/test_basic_ops.py
+2
-2
test_elemwise.py
theano/sandbox/gpuarray/tests/test_elemwise.py
+88
-3
test_opt.py
theano/sandbox/gpuarray/tests/test_opt.py
+2
-2
type.py
theano/sandbox/gpuarray/type.py
+17
-0
basic.py
theano/tensor/basic.py
+2
-81
nnet.py
theano/tensor/nnet/nnet.py
+1
-1
test_elemwise.py
theano/tensor/tests/test_elemwise.py
+27
-27
type.py
theano/tensor/type.py
+19
-0
没有找到文件。
doc/extending/ctype.txt
浏览文件 @
06cdb1fa
...
@@ -553,3 +553,20 @@ respectively. See an example for the type ``CudaNdarrayType`` (GPU
...
@@ -553,3 +553,20 @@ respectively. See an example for the type ``CudaNdarrayType`` (GPU
array) in the file `theano/sandbox/cuda/type.py`. The version
array) in the file `theano/sandbox/cuda/type.py`. The version
parameter is what is returned by ViewOp.c_code_cache_version(). By
parameter is what is returned by ViewOp.c_code_cache_version(). By
default, it will recompile the c code for each process.
default, it will recompile the c code for each process.
Shape and Shape_i
=================
We have 2 generic Ops Shape and Shape_i that return the shape of any
Theano Variable that have a shape attribute and Shape_i return only of
the element of the shape.
.. code-block:: python
theano.compile.ops.register_shape_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
theano.compile.ops.register_shape_i_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
The c code work as the ViewOp. Shape_i have the additional i parameter
that you can use with %(i)s.
theano/compile/__init__.py
浏览文件 @
06cdb1fa
from
theano.compile.ops
import
(
from
theano.compile.ops
import
(
DeepCopyOp
,
deep_copy_op
,
register_deep_copy_op_c_code
,
DeepCopyOp
,
deep_copy_op
,
register_deep_copy_op_c_code
,
Shape
,
shape
,
register_shape_c_code
,
Shape_i
,
register_shape_i_c_code
,
Shape_i
,
register_shape_i_c_code
,
ViewOp
,
view_op
,
register_view_op_c_code
)
ViewOp
,
view_op
,
register_view_op_c_code
)
...
...
theano/compile/ops.py
浏览文件 @
06cdb1fa
...
@@ -181,6 +181,94 @@ class DeepCopyOp(gof.Op):
...
@@ -181,6 +181,94 @@ class DeepCopyOp(gof.Op):
deep_copy_op
=
DeepCopyOp
()
deep_copy_op
=
DeepCopyOp
()
def
register_shape_c_code
(
type
,
code
,
version
=
()):
""" Tell Shape Op how to generate C code for a Theano Type
:param typ: A Theano type. It must be the Theano class itself and not an
instance of the class.
:param code: C code that deep copies the Theano type 'typ'.
Use
%(iname)
s and
%(oname)
s for the input and output C
variable names respectively.
:param version: A number indicating the version of the code, for cache.
"""
Shape
.
c_code_and_version
[
type
]
=
(
code
,
version
)
class
Shape
(
gof
.
Op
):
"""
L{Op} to return the shape of a matrix.
@note: Non-differentiable.
"""
# Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s.
c_code_and_version
=
{}
def
__hash__
(
self
):
return
hash
(
type
(
self
))
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
def
__str__
(
self
):
return
self
.
__class__
.
__name__
def
make_node
(
self
,
x
):
# Must work for all type that have a shape attribute.
# This will fail at execution time.
if
not
isinstance
(
x
,
theano
.
Variable
):
x
=
theano
.
tensor
.
as_tensor_variable
(
x
)
return
gof
.
Apply
(
self
,
[
x
],
[
theano
.
tensor
.
lvector
()])
def
perform
(
self
,
node
,
inp
,
out_
):
x
,
=
inp
out
,
=
out_
out
[
0
]
=
theano
.
_asarray
(
x
.
shape
,
dtype
=
'int64'
)
def
infer_shape
(
self
,
node
,
in_shapes
):
return
[[
len
(
in_shapes
[
0
])]]
def
connection_pattern
(
self
,
node
):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return
[[
False
]]
def
grad
(
self
,
inp
,
grads
):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return
[
DisconnectedType
()()]
def
R_op
(
self
,
inputs
,
eval_points
):
return
[
None
]
def
c_code
(
self
,
node
,
name
,
inames
,
onames
,
sub
):
iname
,
=
inames
oname
,
=
onames
fail
=
sub
[
'fail'
]
itype
=
node
.
inputs
[
0
]
.
type
.
__class__
if
itype
in
self
.
c_code_and_version
:
code
,
version
=
self
.
c_code_and_version
[
itype
]
return
code
%
locals
()
# Else, no C code
return
super
(
Shape
,
self
)
.
c_code
(
node
,
name
,
inames
,
onames
,
sub
)
def
c_code_cache_version
(
self
):
return
(
1
,)
shape
=
Shape
()
_shape
=
shape
# was used in the past, now use shape directly.
#pprint.assign(_shape, printing.MemberPrinter('shape'))
class
Shape_i
(
gof
.
Op
):
class
Shape_i
(
gof
.
Op
):
"""
"""
L{Op} to return the shape of a matrix.
L{Op} to return the shape of a matrix.
...
...
theano/sandbox/cuda/blas.py
浏览文件 @
06cdb1fa
...
@@ -523,21 +523,21 @@ class GpuConv(GpuOp):
...
@@ -523,21 +523,21 @@ class GpuConv(GpuOp):
imshp
=
None
,
imshp
=
None
,
max_threads_dim0
=
None
):
max_threads_dim0
=
None
):
"""
"""
:param version: each version of c_code implement many kernel for the
:param version: each version of c_code implement
s
many kernel for the
convolution. By default we try to guess the best one.
convolution. By default we try to guess the best one.
You can force one version with this parameter. This
You can force one version with this parameter. This
parameter is used by the tests.
parameter is used by the tests.
:param verbose: for value of 1,2 and 3. Print more information during
:param verbose: for value of 1,2 and 3. Print more information during
the execution of the convolution. Mostly used for
the execution of the convolution. Mostly used for
optimization or debugging.
optimization or debugging.
:param kshp: The size of the kernel. If provided, can genera
:param kshp: The size of the kernel. If provided, can genera
te
faster code. If the GpuConv op is automatically
faster code. If the GpuConv op is automatically
inserted,
inserted,
we take its value automatically from the Conv op.
we take its value automatically from the Conv op.
:param imshp: The size of the image. Not used for code generation but
:param imshp: The size of the image. Not used for code generation but
allow to select an experimental new version in another
allow
s
to select an experimental new version in another
repo.
repo.
:param max_threads_dim0: The maximum number of thread for the
:param max_threads_dim0: The maximum number of thread
s
for the
block size dimensions 0 (blockDim.x) used by the
block size dimensions 0 (blockDim.x) used by the
GPU function.
GPU function.
...
...
theano/sandbox/cuda/conv.cu
浏览文件 @
06cdb1fa
// REMEMBER TO
RAI
SE c_code_cache_version when changing this file
// REMEMBER TO
INCREA
SE c_code_cache_version when changing this file
//
//
enum
{
ConvMode_FULL
,
ConvMode_VALID
};
enum
{
ConvMode_FULL
,
ConvMode_VALID
};
PyObject
*
CudaNdarray_Conv
(
CudaNdarray
*
img
,
CudaNdarray
*
kern
,
CudaNdarray
*
out
,
const
int
mode
,
const
int
subsample_rows
,
const
int
subsample_cols
,
const
int
version
,
const
int
verbose
);
PyObject
*
CudaNdarray_Conv
(
CudaNdarray
*
img
,
CudaNdarray
*
kern
,
CudaNdarray
*
out
,
const
int
mode
,
const
int
subsample_rows
,
const
int
subsample_cols
,
const
int
version
,
const
int
verbose
);
...
...
theano/sandbox/cuda/conv_kernel.cu
浏览文件 @
06cdb1fa
// REMEMBER TO
RAI
SE c_code_cache_version when changing this file
// REMEMBER TO
INCREA
SE c_code_cache_version when changing this file
//
//
//implement the valid convolution only
//implement the valid convolution only
...
...
theano/sandbox/gpuarray/basic_ops.py
浏览文件 @
06cdb1fa
...
@@ -654,15 +654,6 @@ class GpuAlloc(HideC, Alloc):
...
@@ -654,15 +654,6 @@ class GpuAlloc(HideC, Alloc):
gpu_alloc
=
GpuAlloc
()
gpu_alloc
=
GpuAlloc
()
class
GpuShape
(
HideC
,
tensor
.
Shape
):
"""
Implement Shape on the gpu.
"""
def
make_node
(
self
,
x
):
return
Apply
(
self
,
[
x
],
[
tensor
.
lvector
()])
gpu_shape
=
GpuShape
()
class
GpuReshape
(
HideC
,
tensor
.
Reshape
):
class
GpuReshape
(
HideC
,
tensor
.
Reshape
):
"""
"""
Implement Reshape on the gpu.
Implement Reshape on the gpu.
...
...
theano/sandbox/gpuarray/elemwise.py
浏览文件 @
06cdb1fa
import
copy
import
copy
from
itertools
import
izip
from
itertools
import
izip
from
StringIO
import
StringIO
import
numpy
import
numpy
from
theano
import
Op
,
Apply
,
scalar
,
config
from
theano
import
Op
,
Apply
,
scalar
,
config
from
theano.tensor.elemwise
import
Elemwise
,
DimShuffle
,
CAReduceDtype
from
theano
import
scalar
as
scal
from
theano.scalar
import
Scalar
from
theano.tensor.elemwise
import
(
Elemwise
,
DimShuffle
,
CAReduce
,
CAReduceDtype
)
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
try
:
try
:
...
@@ -165,7 +169,8 @@ class GpuElemwise(HideC, Elemwise):
...
@@ -165,7 +169,8 @@ class GpuElemwise(HideC, Elemwise):
return
ElemwiseKernel
(
None
,
inps
+
outs
,
kop
,
preamble
=
support_code
)
return
ElemwiseKernel
(
None
,
inps
+
outs
,
kop
,
preamble
=
support_code
)
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<compyte/extension.h>'
,
'<numpy_compat.h>'
]
return
[
'cuda.h'
,
'<compyte/extension.h>'
,
'<numpy_compat.h>'
,
'<compyte/ext_cuda.h>'
]
def
c_compiler
(
self
):
def
c_compiler
(
self
):
return
NVCC_compiler
return
NVCC_compiler
...
@@ -201,8 +206,7 @@ class GpuElemwise(HideC, Elemwise):
...
@@ -201,8 +206,7 @@ class GpuElemwise(HideC, Elemwise):
#define GDIM_1 gridDim.y
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
#define GDIM_2 gridDim.z
"""
"""
res
=
[
"CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
,
res
=
[
CLUDA_PREAMBLE
]
CLUDA_PREAMBLE
]
for
i
in
range
(
0
,
nd
+
1
):
for
i
in
range
(
0
,
nd
+
1
):
res
.
append
(
k
.
render_basic
(
i
,
name
=
"elem_"
+
str
(
i
))
+
';'
)
res
.
append
(
k
.
render_basic
(
i
,
name
=
"elem_"
+
str
(
i
))
+
';'
)
res
.
append
(
k
.
contig_src
+
';'
)
res
.
append
(
k
.
contig_src
+
';'
)
...
@@ -210,8 +214,7 @@ class GpuElemwise(HideC, Elemwise):
...
@@ -210,8 +214,7 @@ class GpuElemwise(HideC, Elemwise):
return
'
\n
'
.
join
(
res
)
return
'
\n
'
.
join
(
res
)
def
c_init_code
(
self
):
def
c_init_code
(
self
):
return
[
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))'
return
[
'setup_ext_cuda();'
]
'compyte_get_extension("cuda_get_ptr");'
]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
nd
=
node
.
outputs
[
0
]
.
ndim
nd
=
node
.
outputs
[
0
]
.
ndim
...
@@ -413,7 +416,7 @@ class GpuElemwise(HideC, Elemwise):
...
@@ -413,7 +416,7 @@ class GpuElemwise(HideC, Elemwise):
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
ver
=
self
.
scalar_op
.
c_code_cache_version
()
ver
=
self
.
scalar_op
.
c_code_cache_version
()
if
ver
:
if
ver
:
return
(
1
,
ver
)
return
(
2
,
ver
)
else
:
else
:
return
ver
return
ver
...
@@ -515,10 +518,1786 @@ class GpuDimShuffle(HideC, DimShuffle):
...
@@ -515,10 +518,1786 @@ class GpuDimShuffle(HideC, DimShuffle):
return
process
return
process
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
3
,)
return
(
4
,)
class
GpuCAReduce
(
GpuKernelBase
,
HideC
,
CAReduceDtype
):
class
GpuCAReduceCuda
(
HideC
,
CAReduce
):
"""GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
The dimensions along which to reduce is specified by the
`reduce_mask` that you pass to the constructor. The `reduce_mask`
is a tuple of booleans (actually integers 0 or 1) that specify for
each input dimension, whether to reduce it (1) or not (0).
For example, when scalar_op is a theano.scalar.basic.Add instance:
- reduce_mask == (1,) sums a vector to a scalar
- reduce_mask == (1,0) computes the sum of each column in a matrix
- reduce_mask == (0,1) computes the sum of each row in a matrix
- reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor.
:note: any reduce_mask of all zeros is a sort of 'copy', and may
be removed during graph optimization
This Op is a work in progress.
This op was recently upgraded from just GpuSum a general CAReduce. Not
many code cases are supported for scalar_op being anything other than
scal.Add instances yet.
Important note: if you implement new cases for this op, be sure to
benchmark them and make sure that they actually result in a speedup.
GPUs are not especially well-suited to reduction operations so it is
quite possible that the GPU might be slower for some cases.
"""
def
__init__
(
self
,
scalar_op
,
axis
=
None
,
reduce_mask
=
None
):
if
reduce_mask
is
not
None
:
reduce_mask
=
tuple
(
reduce_mask
)
self
.
reduce_mask
=
reduce_mask
# used to make sure that calls to scalar op
# have unique name arguments
self
.
_n_scalar_op_calls
=
0
if
not
hasattr
(
scalar_op
,
'identity'
):
raise
ValueError
(
"No identity on scalar op"
)
CAReduce
.
__init__
(
self
,
scalar_op
,
axis
=
axis
)
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
and
self
.
axis
==
other
.
axis
and
self
.
reduce_mask
==
other
.
reduce_mask
and
self
.
scalar_op
==
other
.
scalar_op
)
def
__hash__
(
self
):
return
(
hash
(
type
(
self
))
^
hash
(
self
.
axis
)
^
hash
(
self
.
reduce_mask
)
^
hash
(
type
(
self
.
scalar_op
)))
def
__str__
(
self
):
ax
=
''
if
self
.
axis
is
not
None
:
ax
=
'{
%
s}'
%
(
', '
.
join
(
str
(
x
)
for
x
in
self
.
axis
),)
return
"GpuCAReduceCuda{
%
s}
%
s"
%
(
str
(
self
.
scalar_op
),
ax
)
def
make_node
(
self
,
x
):
x
=
as_gpuarray_variable
(
x
)
assert
x
.
dtype
==
"float32"
ret
=
super
(
GpuCAReduceCuda
,
self
)
.
make_node
(
x
)
self
=
copy
.
copy
(
self
)
self
.
axis
=
ret
.
op
.
axis
if
self
.
reduce_mask
is
None
:
if
self
.
axis
is
None
:
reduce_mask
=
[
1
]
*
x
.
type
.
ndim
else
:
reduce_mask
=
[
0
]
*
x
.
type
.
ndim
for
a
in
self
.
axis
:
assert
reduce_mask
[
a
]
==
0
reduce_mask
[
a
]
=
1
self
.
reduce_mask
=
tuple
(
reduce_mask
)
if
(
x
.
type
.
ndim
!=
len
(
self
.
reduce_mask
)):
raise
TypeError
(
"x must have rank
%
i"
%
len
(
self
.
reduce_mask
))
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
x
.
dtype
,
ret
.
outputs
[
0
]
.
type
.
broadcastable
)()])
"""
This method must be commented, because there's no way
to communicate that it's OK to call for + but not for
max
def perform(self, node, inp, out):
x, = inp
z, = out
# reduce_max is declared but does nothing but
# raise NotImplementedError.
# We can't call it here anyway because it hasn't
# been added to the python bindings yet
z[0] = x.reduce_sum(self.reduce_mask)
"""
def
perform
(
self
,
node
,
inp
,
out
):
raise
MethodNotDefined
(
""
)
def
supports_c_code
(
self
,
inputs
):
""" Returns True if the current op and reduce pattern
has functioning C code """
# If we don't even have the right method, we certainly
# don't support the C code
# (This is the test that used to be implemented by
# local_gpu_sum)
pattern
=
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
))
if
not
hasattr
(
self
,
'c_code_reduce_
%
s'
%
pattern
):
return
False
# Now that this is a general reduction op, we might
# have a method for a pattern, but that pattern
# might not be implemented for the current scalar op.
# To detect this more complicated situation, we
# make fake arguments to c_code, try to run them,
# and see if NotImplementedError gets raised.
node
=
self
.
make_node
(
*
inputs
)
name
=
'fake_name'
inp
=
[
'fake_input_name_
%
d'
%
i
for
i
in
xrange
(
len
(
inputs
))]
out
=
[
'fake_output_name_
%
d'
%
i
for
i
in
xrange
(
len
(
node
.
outputs
))]
sub
=
{
'fail'
:
'fake failure code'
}
try
:
self
.
c_code
(
node
,
name
,
inp
,
out
,
sub
)
self
.
c_support_code_apply
(
node
,
name
)
except
NotImplementedError
:
return
False
return
True
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<compyte/extension.h>'
,
'<numpy_compat.h>'
,
'<compyte/ext_cuda.h>'
]
def
c_compiler
(
self
):
return
NVCC_compiler
def
c_init_code
(
self
):
return
[
'setup_ext_cuda();'
]
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
x
,
=
inp
z
,
=
out
nd_in
=
node
.
inputs
[
0
]
.
type
.
ndim
nd_out
=
node
.
outputs
[
0
]
.
type
.
ndim
assert
nd_in
-
nd_out
==
sum
(
self
.
reduce_mask
)
sio
=
StringIO
()
fail
=
sub
[
'fail'
]
#check input
print
>>
sio
,
"""
if (PyGpuArray_NDIM(
%(x)
s) !=
%(nd_in)
s)
{
PyErr_Format(PyExc_TypeError,
"required nd=
%(nd_in)
s, got nd=
%%
i", PyGpuArray_NDIM(
%(x)
s));
%(fail)
s;
}
"""
%
locals
()
# It might be nice to use a property of the op class to do this,
# but tensor.elemwise.CAReduce has this exact same check so I guess
# this is OK to do
if
self
.
scalar_op
in
[
scal
.
minimum
,
scal
.
maximum
]:
conds
=
[
"(PyGpuArray_DIMS(
%
s)[
%
d] == 0)"
%
(
x
,
i
)
for
i
in
xrange
(
nd_in
)
if
self
.
reduce_mask
[
i
]]
assert
len
(
conds
)
>
0
cond
=
"("
+
" || "
.
join
(
conds
)
+
")"
print
>>
sio
,
"""
if
%(cond)
s
{
PyErr_Format(PyExc_ValueError," tried to reduce a 0-length axis.");
%(fail)
s;
}
"""
%
locals
()
#
# alloc an output if we need one
#
# check the basics of out output
print
>>
sio
,
"""
if ( !
%(z)
s
|| (PyGpuArray_NDIM(
%(z)
s) !=
%(nd_out)
s)
"""
%
locals
()
#ensure that the output has the right non-reduced dimensions
j
=
0
for
i
in
xrange
(
nd_in
):
if
not
self
.
reduce_mask
[
i
]:
print
>>
sio
,
" || (PyGpuArray_DIMS(
%(z)
s)[
%(j)
s] != PyGpuArray_DIMS(
%(x)
s)[
%(i)
d]) "
%
locals
()
j
+=
1
print
>>
sio
,
"""
)
{
"""
%
locals
()
if
nd_out
>
0
:
print
>>
sio
,
"size_t new_dims[
%(nd_out)
s]; "
%
locals
()
else
:
print
>>
sio
,
"size_t *new_dims=NULL; "
j
=
0
for
i
in
xrange
(
nd_in
):
if
not
self
.
reduce_mask
[
i
]:
print
>>
sio
,
'new_dims[
%(j)
s] = PyGpuArray_DIMS(
%(x)
s)[
%(i)
s];'
%
locals
()
j
+=
1
out_typecode
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
print
>>
sio
,
"""
Py_XDECREF(
%(z)
s);
%(z)
s = pygpu_empty(
%(nd_out)
s, new_dims,
%(out_typecode)
s, GA_C_ORDER,
pygpu_default_context(),
Py_None);
if (NULL ==
%(z)
s)
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
%(fail)
s;
}
}
"""
%
locals
()
# \begin bracket the reduction in a check that there is
# actually work to do
if
getattr
(
self
.
scalar_op
,
'identity'
,
None
)
==
0
:
zero_shp
=
"cudaMemset((float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset), 0, PyGpuArray_SIZE(
%(z)
s) * sizeof(float))"
%
locals
()
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else
:
scalar_op
=
self
.
scalar_op
zero_shp
=
"""
PyErr_Format(PyExc_NotImplementedError,
"GpuCAReduceCuda not implemented when input shape is 0"
" for this scalar_op:
%(scalar_op)
s");
%(fail)
s;
"""
%
locals
()
print
>>
sio
,
"""
if (PyGpuArray_SIZE(
%(z)
s) && ! PyGpuArray_SIZE(
%(x)
s)){
%(zero_shp)
s;
}
else if (PyGpuArray_SIZE(
%(z)
s))
{
"""
%
locals
()
#
# Now perform the reduction
#
if
all
(
i
==
1
for
i
in
self
.
reduce_mask
):
#check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
#TODO: check if we are ccontiguous when we un-dimshuffle
#TODO: if only some dims are ccontiguous, call version with less dims.
print
>>
sio
,
'if(
%(x)
s->ga.flags & GA_C_CONTIGUOUS){'
%
locals
()
self
.
c_code_reduce_ccontig
(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
>>
sio
,
"}else{"
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
>>
sio
,
"}"
else
:
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
# \end bracket the reduction ...
print
>>
sio
,
"""
}
"""
%
locals
()
return
sio
.
getvalue
()
def
_makecall
(
self
,
node
,
name
,
x
,
z
,
fail
,
pattern
=
None
):
"""Return a string for making a kernel call.
The return value looks something like:
.. code-block:: c
if (verbose)
printf("running kernel_reduce_10_
%(name)
s
\\
n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_
%(name)
s<<<n_blocks, n_threads,
n_shared>>>(
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[1],
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset),
PyGpuArray_STRIDES(
%(x)
s)[0]/4,
PyGpuArray_STRIDES(
%(x)
s)[1]/4,
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset),
PyGpuArray_STRIDES(
%(z)
s)[0]/4
);
[
if config.gpuarray.sync:
code += "GpuArray_sync(&
%(zz)
s->ga);
\n
"
%
dict(zz=zz)
]
if (cudaSuccess != cudaGetLastError())
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: ... );
%(fail)
s;
}
"""
sio
=
StringIO
()
if
pattern
is
None
:
pattern
=
''
.
join
(
str
(
c
)
for
c
in
self
.
reduce_mask
)
ndim
=
len
(
self
.
reduce_mask
)
nd_out
=
ndim
-
sum
(
self
.
reduce_mask
)
shapes_format
=
"shape=(
%
s)"
%
","
.
join
([
"
%
llu"
]
*
node
.
inputs
[
0
]
.
ndim
)
shapes_data
=
","
.
join
([
"(unsigned long long) PyGpuArray_DIMS(
%
s)[
%
d]"
%
(
x
,
i
)
for
i
in
range
(
node
.
inputs
[
0
]
.
ndim
)])
print
>>
sio
,
"""
if (verbose)
printf("running kernel_reduce_
%(pattern)
s_
%(name)
s
\\
n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=
%%
d, n_threads.y=
%%
d, n_threads.z=
%%
d,"
" nb_threads=
%%
d, n_blocks.x=
%%
d, n_blocks.y=
%%
d,"
" nb_block=
%%
d, n_shared=
%%
d,
%(shapes_format)
s
\\
n",
n_threads.x,n_threads.y,n_threads.z,
n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared,
%(shapes_data)
s);
kernel_reduce_
%(pattern)
s_
%(name)
s<<<n_blocks, n_threads, n_shared>>>(
"""
%
locals
()
for
i
in
xrange
(
ndim
):
print
>>
sio
,
"""
PyGpuArray_DIMS(
%(x)
s)[
%(i)
s],
"""
%
locals
()
print
>>
sio
,
"""
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset)
"""
%
locals
()
for
i
in
xrange
(
ndim
):
print
>>
sio
,
"""
,PyGpuArray_STRIDES(
%(x)
s)[
%(i)
s]/4
"""
%
locals
()
print
>>
sio
,
"""
,(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset)
"""
%
locals
()
for
i
in
xrange
(
nd_out
):
print
>>
sio
,
"""
,PyGpuArray_STRIDES(
%(z)
s)[
%(i)
s]/4
"""
%
locals
()
sync
=
""
if
config
.
gpuarray
.
sync
:
sync
=
"""GpuArray_sync(&
%(z)
s->ga);"""
%
locals
()
print
>>
sio
,
"""
);
%(sync)
s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error:
%%
s:
%%
s."
" (grid:
%%
i x
%%
i; block:
%%
i x
%%
i x
%%
i)"
"
%(shapes_format)
s
\\
n",
"kernel_reduce_
%(pattern)
s_
%(name)
s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
%(shapes_data)
s);
%(fail)
s;
}
"""
%
locals
()
return
sio
.
getvalue
()
def
_k_decl
(
self
,
node
,
nodename
,
pattern
=
None
,
ndim
=
None
,
reduce_mask
=
None
):
"""Return a string to declare a kernel function
The result will look something like this:
.. code-block:: c
static __global__ void kernel_reduce_110_
%(nodename)
s(
const int d0,
const int d1,
const int d2,
const float *A,
const int sA0,
const int sA1,
const int sA2,
float * Z,
const int sZ0)
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
"""
if
reduce_mask
is
None
:
reduce_mask
=
self
.
reduce_mask
if
ndim
is
None
:
ndim
=
len
(
reduce_mask
)
if
pattern
is
None
:
pattern
=
''
.
join
(
str
(
i
)
for
i
in
reduce_mask
)
sio
=
StringIO
()
print
>>
sio
,
"""
static __global__ void kernel_reduce_
%(pattern)
s_
%(nodename)
s(
"""
%
locals
()
for
i
in
xrange
(
ndim
):
print
>>
sio
,
"""
const int d
%(i)
s,
"""
%
locals
()
print
>>
sio
,
"""
const float *A,
"""
%
locals
()
for
i
in
xrange
(
ndim
):
print
>>
sio
,
"""
const int sA
%(i)
s,
"""
%
locals
()
print
>>
sio
,
"""
float * Z
"""
%
locals
()
for
i
in
xrange
(
ndim
-
sum
(
reduce_mask
)):
print
>>
sio
,
"""
, const int sZ
%(i)
s
"""
%
locals
()
print
>>
sio
,
")"
return
sio
.
getvalue
()
def
_k_init
(
self
,
*
args
):
return
"""
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult = 0.0f;
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
if (warpSize != 32)
{
Z[0] = -666;
return;
}
"""
def
_assign_init
(
self
,
first_item
):
"""
This return the initial value for myresult.
If the scalar op have an identity value, return it.
Otherwise, check that the scalar op is maximum or minimum
and return first_item. It should be the first element of the reduction.
As the maximum and minimum of the same value don't change, this work.
"""
if
hasattr
(
self
.
scalar_op
,
'identity'
):
return
str
(
self
.
scalar_op
.
identity
)
else
:
assert
isinstance
(
self
.
scalar_op
,
(
scal
.
Maximum
,
scal
.
Minimum
))
return
first_item
def
_assign_reduce
(
self
,
node
,
name
,
left
,
right
,
sub
):
"""
node: the node argument to this op's c_code
name: the name argument to this op's c_code
left: a C code string identifying an lvalue
right: a C code string identifying an expression
sub: the sub argument to this op's c_code
returns C code to reduce left and right, assigning the
result to left."""
x
,
=
node
.
inputs
dtype
=
x
.
dtype
dummy_left
=
Scalar
(
dtype
=
dtype
)()
dummy_right
=
Scalar
(
dtype
=
dtype
)()
dummy_node
=
self
.
scalar_op
.
make_node
(
dummy_left
,
dummy_right
)
dummy_name
=
name
+
'_scalar_op'
+
str
(
self
.
_n_scalar_op_calls
)
self
.
_n_scalar_op_calls
+=
1
return
self
.
scalar_op
.
c_code
(
dummy_node
,
dummy_name
,
(
left
,
right
),
(
left
,),
sub
)
def
_k_reduce_buf
(
self
,
z_pos
,
node
,
name
,
sub
):
"""
WRITEME
node, name, sub: these should be passed through from the original
call to c_code
"""
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
# The code here works with all nvidia driver
# But only for powers or multiples of 2!
new_version
=
"""
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
if (threadNum >= ((threadCount >> 1) * 2))
{
int idx = threadNum - (threadCount >> 1) * 2;"""
new_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[idx]'
,
'buf[threadNum]'
,
sub
)
new_version
+=
"""
}
__syncthreads();
// Works for power of 2 only.
int nTotalThreads = threadCount; // Total number of active threads
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadNum < halfPoint)
{
// Get the shared value stored by another thread
float temp = buf[threadNum + halfPoint];
"""
new_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'temp'
,
sub
)
new_version
+=
"""
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
__syncthreads();
if (threadNum == 0)
{
%(z_pos)
s = buf[0];
}
__syncthreads();"""
new_version
=
new_version
%
locals
()
current_version
=
"""
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
//round up all the partial sums into the first `warpSize` elements
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
"""
current_version
+=
self
.
_assign_reduce
(
node
,
name
,
'myresult'
,
'buf[i]'
,
sub
)
+
"""
}
buf[threadNum] = myresult;
/*Comment this optimization as it don't work on Fermi GPU.
TODO: find why it don't work or put the GPU compute capability into the version
// no sync because only one warp is running
if(threadCount >32)
{"""
for
num
in
[
16
,
8
,
4
,
2
,
1
]:
current_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'buf[threadNum+
%
d]'
%
num
,
sub
)
current_version
+=
"""
if (threadNum == 0)
{
%(z_pos)
s = buf[0];
}
}
else */
if (threadNum < 16)
{
//reduce so that threadNum 0 has the reduction of everything
"""
for
num
in
[
16
,
8
,
4
,
2
,
1
]:
this_if
=
"if (threadNum +
%
d < threadCount) "
%
num
+
\
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'buf[threadNum+
%
d]'
%
num
,
sub
)
current_version
+=
this_if
current_version
+=
"""
if (threadNum == 0)
{
%(z_pos)
s = buf[0];
}
}
}
"""
current_version
=
current_version
%
locals
()
return
current_version
#Threads must be organized as: threadNum%nb_reduce correspond to the same sum
#nb_reduce<=warpSize
def
_k_reduce_buf_multiple
(
self
,
z_pos
,
node
,
name
,
nb_reduce
):
reduce_fct
=
self
.
_assign_reduce
(
node
,
name
,
'myresult'
,
'buf[i]'
,
{})
return
"""
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
if (threadNum <
%(nb_reduce)
s)
{
//round up all the partial sums into the first `nb_reduce` elements
for (int i = threadNum +
%(nb_reduce)
s; i < threadCount; i +=
%(nb_reduce)
s)
{
%(reduce_fct)
s;
}
%(z_pos)
s = myresult;
}
"""
%
locals
()
def
c_code_reduce_ccontig
(
self
,
sio
,
node
,
name
,
x
,
z
,
fail
):
"""
WRITEME
IG: I believe, based on how this is called in c_code, that it
is for the case where we are reducing on all axes and x is
C contiguous.
"""
if
getattr
(
self
.
scalar_op
,
'identity'
,
None
)
==
0
:
zero_shp
=
"cudaMemset((float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset), 0, PyGpuArray_SIZE(
%(z)
s) * sizeof(float))"
%
locals
()
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else
:
zero_shp
=
"""
PyErr_Format(PyExc_NotImplementedError,
"GpuCAReduceCuda not implemented when input shape is 0 for this scalar_op");
%(fail)
s;
"""
%
locals
()
sync
=
""
if
config
.
gpuarray
.
sync
:
sync
=
"""GpuArray_sync(&
%(z)
s->ga);"""
%
locals
()
print
>>
sio
,
"""
{
if(PyGpuArray_SIZE(
%(x)
s)==0){
%(zero_shp)
s;
}else{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_SIZE(
%(x)
s),
(size_t) 256));
dim3 n_blocks(1);
if (verbose) printf("running kernel_reduce_ccontig_
%(name)
s"
" n_threads.x=
%%
d, size=
%%
d, ndim=
%%
d
\\
n",
n_threads.x,PyGpuArray_SIZE(
%(x)
s),
PyGpuArray_NDIM(
%(x)
s));
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_ccontig_
%(name)
s<<<n_blocks, n_threads, n_shared>>>(
PyGpuArray_SIZE(
%(x)
s),
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset),
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset));
%(sync)
s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error:
%%
s:
%%
s."
" (grid:
%%
i x
%%
i; block:
%%
i x
%%
i x
%%
i)
\\
n",
"kernel_reduce_ccontig_
%(name)
s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)
s;
}
}
}
"""
%
locals
()
def
c_code_reduce_1
(
self
,
sio
,
node
,
name
,
x
,
z
,
fail
):
makecall
=
self
.
_makecall
(
node
,
name
,
x
,
z
,
fail
)
print
>>
sio
,
"""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
(size_t) 256));
dim3 n_blocks(1);
%(makecall)
s
}
"""
%
locals
()
def c_code_reduce_11(self, sio, node, name, x, z, fail):
    """Emit the CUDA launch code for reduce_mask (1, 1), i.e. a full
    reduction of a 2d tensor.

    Writes to `sio` a C block that sizes a single thread block: x is
    taken from dim 1 (capped at 256), then y is grown until the
    256-thread budget is exceeded and stepped back by one, then capped
    by dim 0.  A single block is launched; the kernel call itself is
    the C fragment returned by ``self._makecall``.
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[1],
                         (size_t) 256));
        while (n_threads.y * n_threads.x <= 256) ++n_threads.y;
        n_threads.y -= 1;
        if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0])
            n_threads.y = PyGpuArray_DIMS(%(x)s)[0];
        dim3 n_blocks(1);
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
    """Emit launch code for the patterns 01, 011 and 0111 (keep axis 0,
    reduce the N trailing axes).

    :param N: the number of 1s in the pattern: N=1 -> 01, N=2 -> 011,
              N=3 -> 0111.  Works for N = 1, 2, 3.

    Thread-block layout: n_threads.x comes from the innermost reduced
    dimension (capped at 256); for 3d inputs the y thread dimension,
    and for 4d inputs the y and z thread dimensions, are then grown
    while the 256-thread budget allows.  One block is launched per
    slice of axis 0, capped at 4096 blocks.

    Changes: removed the unused locals ``N_pattern``, ``param_dim`` and
    ``strides_dim`` (referenced by no template here, and the only uses
    of the Python-2-only ``xrange``), and replaced the Python-2-only
    ``print >>`` statement with ``sio.write``.  The emitted C text is
    unchanged.
    """
    assert N in [1, 2, 3]
    makecall = self._makecall(node, name, x, z, fail)
    threads_y = """
        //get as many y threads as we can fit
        while (n_threads.x * (n_threads.y+1) <= 256)
        {
            if (n_threads.y < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
                n_threads.y += 1;
            else
                break;
        }""" % locals()
    threads_z = """
        //get as many z threads as we can fit
        while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256)
        {
            if (n_threads.z < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
                n_threads.z += 1;
            else
                break;
        }""" % locals()
    # 2d input (pattern 01): only the x thread dimension is used.
    # 3d input (pattern 011): x and y, but not z.
    if len(self.reduce_mask) == 2:
        threads_y = ''
        threads_z = ''
    if len(self.reduce_mask) == 3:
        threads_z = ''
    sio.write("""
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[%(N)s],
                         (size_t) 256));
        %(threads_y)s
        %(threads_z)s
        dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0],
                               (size_t) 4096));
        %(makecall)s
    }
    """ % locals() + "\n")
def c_code_reduce_01(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 1).

    Pattern 01 is the N=1 case of the shared 01X code generator.
    """
    return self.c_code_reduce_01X(sio, node, name, x, z, fail, 1)
def c_code_reduce_011(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 1, 1).

    Pattern 011 is the N=2 case of the shared 01X code generator.
    """
    return self.c_code_reduce_01X(sio, node, name, x, z, fail, 2)
def c_code_reduce_0111(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 1, 1, 1).

    Pattern 0111 is the N=3 case of the shared 01X code generator.
    """
    return self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
def c_code_reduce_10(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 0): reduce a 2d tensor over
    axis 0.

    No dedicated 10 kernel exists: the C code calls
    ``kernel_reduce_010_<name>`` with a dummy leading dimension of
    size 1 (and stride 1), so the 2d reduction is expressed as the
    middle axis of a 3d (0,1,0) reduction.  One thread block column is
    launched per output element (n_blocks.y from dim 1, capped at
    4096), with up to 256 threads along the reduced axis.
    """
    # Optionally synchronize after the kernel when the gpuarray.sync
    # config flag is set (eases debugging of async kernel errors).
    sync = ""
    if config.gpuarray.sync:
        sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[0],
                         (size_t) 256));
        dim3 n_blocks(1,
                      std::min(PyGpuArray_DIMS(%(x)s)[1],
                               (size_t) 4096));
        if (verbose) {
          fprintf(stderr,
            "running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
            n_blocks.x,
            n_blocks.y);
        }
        assert( PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
        int n_shared = sizeof(float) * n_threads.x;
        kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                1,
                PyGpuArray_DIMS(%(x)s)[0],
                PyGpuArray_DIMS(%(x)s)[1],
                (float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
                1,
                PyGpuArray_STRIDES(%(x)s)[0]/4,
                PyGpuArray_STRIDES(%(x)s)[1]/4,
                (float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
                1,
                PyGpuArray_STRIDES(%(z)s)[0]/4
                );
        %(sync)s
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError,
                "Cuda error: %%s: %%s."
                " (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
                "kernel_reduce_010_%(name)s",
                cudaGetErrorString(sts),
                n_blocks.x,
                n_blocks.y,
                n_threads.x,
                n_threads.y,
                n_threads.z);
            %(fail)s;
        }
    }
    """ % locals()
def c_code_reduce_010(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 1, 0): reduce a 3d tensor
    over its middle axis.

    Two strategies exist in the emitted C code, but the first branch is
    hard-wired on (``if (1)``, per its own comment, until the
    alternative is less buggy): it implicitly reshapes the A,B,C input
    to A,B,D,32 and launches ``kernel_reduce_010_AD_<name>`` with one
    thread per output element (32-thread blocks, grid clipped to stay
    within 4096 blocks).  The dead second branch would pick between the
    "010_inner" kernel (when dim 2 has the smallest stride) and the
    plain 010 kernel, via ``self._makecall``.
    """
    makecall = self._makecall(node, name, x, z, fail)
    makecall_inner = self._makecall(node, name, x, z, fail,
                                    pattern="010_inner")
    pattern = ''.join(str(i) for i in self.reduce_mask)
    # Optionally synchronize after the kernel when the gpuarray.sync
    # config flag is set (eases debugging of async kernel errors).
    sync = ""
    if config.gpuarray.sync:
        sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        //int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
        //if ((n_summations >= 15 * 32) && (PyGpuArray_DIMS(%(x)s)[2]>=16))
        if (1) // if the alternative is less buggy, consider not using this branch
        {
            // If there are a lot of summations to do, then we can use simple parallelization -
            // use each thread to do one sum.
            // we might as well launch blocks of 32 threads because that's the warp size.
            // we could schedule more threads if we were maxing out the gridsize below, but
            // the gridsize is way more than the physical hardware and I think 32 threads
            // on a huge grid is enough to fully use the hardware.
            dim3 n_threads(32,1,1);
            // We kindof reshape the input implicitly to something 4D:
            //  the shape A,B,C    ->   A, B, D, E
            //  where C <= D*E < C+32
            //  where E==32
            int A = PyGpuArray_DIMS(%(x)s)[0];
            int B = PyGpuArray_DIMS(%(x)s)[1];
            int C = PyGpuArray_DIMS(%(x)s)[2];
            int D = C/32;
            if (32*D < C) D+= 1;
            assert ((C <= 32*D) && (32*D < C+32));
            // The gridsize would ideally be (A, D).  But we do the following logic to make
            // sure we don't ask for a grid that is too big.
            dim3 n_blocks(A,D);
            if (n_blocks.x > 4096) n_blocks.x = 4096;
            if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
            int n_shared = 0;
            kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
            A,B,C,D,
                    (float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
                    PyGpuArray_STRIDES(%(x)s)[0]/4,
                    PyGpuArray_STRIDES(%(x)s)[1]/4,
                    PyGpuArray_STRIDES(%(x)s)[2]/4,
                    (float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
                    PyGpuArray_STRIDES(%(z)s)[0]/4,
                    PyGpuArray_STRIDES(%(z)s)[1]/4
                    );
            %(sync)s
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
            {
                PyErr_Format(PyExc_RuntimeError,
                    "Cuda error: %%s: %%s."
                    " (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
                    "kernel_reduce_010_%(name)s",
                    cudaGetErrorString(sts),
                    n_blocks.x,
                    n_blocks.y,
                    n_threads.x,
                    n_threads.y,
                    n_threads.z);
                %(fail)s;
            }
        }
        else
        {
            int verbose = 2;
            dim3 n_threads(std::min((size_t) 32,
                                    PyGpuArray_DIMS(%(x)s)[2]));
            while(    (n_threads.x*(n_threads.y+1)<=256)
                   && (n_threads.y<PyGpuArray_DIMS(%(x)s)[1])){
                n_threads.y++;
            }
            dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0],
                          (size_t)4096));
            n_blocks.y = std::min(
                ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
                            (size_t)n_threads.x),
                (size_t)(4096 / n_blocks.x)
                );
            if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/4,
                                 PyGpuArray_STRIDES(%(x)s)[1]/4),
                        PyGpuArray_STRIDES(%(x)s)[2]/4)
               ==PyGpuArray_STRIDES(%(x)s)[2]/4
              && n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
                                         (size_t)n_threads.x)){
              if(verbose>1)
                printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n",
                       PyGpuArray_DIMS(%(x)s)[0],4096,
                       ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads.x),
                       (size_t)(4096 / n_blocks.x));
              assert(n_threads.x<=32);
              %(makecall_inner)s
            }else{
              n_threads.x = std::min(PyGpuArray_DIMS(%(x)s)[1],
                                     (size_t) 256);
              n_blocks.x = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
              n_blocks.y = std::min(
                  PyGpuArray_DIMS(%(x)s)[2],
                  (size_t)(4096 / n_blocks.x)
                  );
              %(makecall)s
            }
            %(sync)s
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
                    "kernel_reduce_%(pattern)s_%(name)s",
                    cudaGetErrorString(sts),
                    n_blocks.x,
                    n_blocks.y,
                    n_threads.x,
                    n_threads.y,
                    n_threads.z);
                %(fail)s;
            }
        }
    }
    """ % locals()
def c_code_reduce_0101(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 1, 0, 1): reduce a 4d
    tensor over axes 1 and 3.

    One block is launched per (dim 0, dim 2) output element;
    n_threads.x comes from dim 3 (capped at 256) and n_threads.y is
    grown toward dim 1 within the 256-thread budget (the loop
    overshoots by one and is stepped back).
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[3],
                         (size_t) 256));
        while (n_threads.x * n_threads.y <= 256)
        {
            if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) break;
            n_threads.y += 1;
        }
        n_threads.y -= 1;
        dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2]);
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_100(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 0, 0): reduce a 3d tensor
    over axis 0.

    Threads along x cover the reduced axis 0 (up to 256); blocks
    cover the two kept axes, growing the grid's y dimension while the
    4096-block budget allows.
    """
    makecall = self._makecall(node, name, x, z, fail)
    # use threadIdx.x for i0
    # use blockIdx.x for i1
    # use blockIdx.y for i2
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[0],
                         (size_t) 256));
        dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096));
        while (n_blocks.x * (n_blocks.y+1) <= 4096 && n_blocks.y <= PyGpuArray_DIMS(%(x)s)[2])
        {
            n_blocks.y += 1;
        }
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_110(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 1, 0): reduce a 3d tensor
    over axes 0 and 1.

    One block is launched per element of the kept axis 2; within a
    block, x threads cover dim 1 and y threads are grown toward dim 0
    within the 256-thread budget (overshoot-and-step-back loop).
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[1],
                         (size_t) 256));
        while (n_threads.x*n_threads.y <= 256)
        {
            if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0])
                break;
            n_threads.y += 1;
        }
        n_threads.y -= 1;
        dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[2]);
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_001(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 0, 1): reduce a 3d tensor
    over its last axis.

    x threads cover the reduced dim 2 (up to 256); blocks cover the
    kept axes, with grid x from dim 0 (capped at 4096) and grid y
    grown toward dim 1 within the 4096-block budget
    (overshoot-and-step-back loop).
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[2],
                         (size_t) 256));
        dim3 n_blocks(
                std::min(PyGpuArray_DIMS(%(x)s)[0],
                         (size_t) 4096));
        while (n_blocks.x * n_blocks.y <= 4096)
        {
            if (n_blocks.y > PyGpuArray_DIMS(%(x)s)[1])
                break;
            n_blocks.y += 1;
        }
        n_blocks.y -= 1;
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_111(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 1, 1): full reduction of a
    3d tensor.

    A single block is launched; x threads come from dim 2 (capped at
    256), then y (toward dim 1) and z (toward dim 0) are each grown
    within the 256-thread budget (overshoot-and-step-back loops).
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[2],
                         (size_t) 256));
        //get as many y threads as we can fit
        while (n_threads.x * n_threads.y <= 256)
        {
            if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1])
                break;
            n_threads.y += 1;
        }
        n_threads.y -= 1;
        //get as many z threads as we can fit
        while (n_threads.x * n_threads.y * n_threads.z <= 256)
        {
            if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
                break;
            n_threads.z += 1;
        }
        n_threads.z -= 1;
        dim3 n_blocks(1,1,1);
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_0011(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 0, 1, 1): reduce a 4d
    tensor over axes 2 and 3.

    Blocks cover the kept axes (grid x from dim 0, grid y grown toward
    dim 1 within the 4096-block budget); x threads come from dim 3 and
    y threads are grown toward dim 2, additionally bounded so that the
    per-block float buffer fits in shared memory (the 15*1024-200
    bound below).
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_blocks(
                std::min(PyGpuArray_DIMS(%(x)s)[0],
                         (size_t) 4096));
        while (n_blocks.x * n_blocks.y <= 4096 &&
               n_blocks.y < PyGpuArray_DIMS(%(x)s)[1])
        {
            n_blocks.y += 1;
        }
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[3],
                         (size_t) 256));
        while (n_threads.x * n_threads.y <= 256
               && n_threads.y < PyGpuArray_DIMS(%(x)s)[2]
               && n_threads.x * n_threads.y * sizeof(float) <=(15*1024-200))
        {
            n_threads.y += 1;
        }
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_1111(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 1, 1, 1): full reduction of
    a 4d tensor.

    Same single-block sizing as the 111 case (x from dim 2, then grow
    y toward dim 1 and z toward dim 0 within 256 threads), with z
    additionally clamped to 64, the hardware limit on that block
    dimension for Fermi GPUs.
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[2],
                         (size_t) 256));
        //get as many y threads as we can fit
        while (n_threads.x * n_threads.y <= 256)
        {
            if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1])
                break;
            n_threads.y += 1;
        }
        n_threads.y -= 1;
        //get as many z threads as we can fit
        while (n_threads.x * n_threads.y * n_threads.z <= 256)
        {
            if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
                break;
            n_threads.z += 1;
        }
        n_threads.z -= 1;
        //Maximum for Fermi GPU on that dimensions.
        n_threads.z = std::min(n_threads.z, (unsigned)64);
        dim3 n_blocks(1,1,1);
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_1011(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 0, 1, 1): reduce a 4d
    tensor over axes 0, 2 and 3.

    One block is launched per element of the kept axis 1; x threads
    come from dim 3 (capped at 256), y is grown within the budget and
    clamped to dim 2, z is grown, clamped to 64 (hardware block-dim
    limit) and to dim 0.
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[3],
                         (size_t) 256));
        while (n_threads.x * (n_threads.y+1) <= 256) ++n_threads.y;
        if (n_threads.y > PyGpuArray_DIMS(%(x)s)[2])
            n_threads.y = PyGpuArray_DIMS(%(x)s)[2];
        while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256) ++n_threads.z;
        if (n_threads.z > 64)
            n_threads.z = 64;
        if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
            n_threads.z = PyGpuArray_DIMS(%(x)s)[0];
        dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[1]);
        %(makecall)s
    }
    """ % locals()
def c_code_cache_version_apply(self, node):
    """Return the C-code cache version tuple for this apply node.

    Combines this Op's own code version with the cache version of the
    scalar op and of the Scalar type of every input and output, so the
    compiled module is invalidated whenever any of them changes.
    Returns an empty tuple (meaning: never cache) if any component
    reports no version.

    Fix: removed the dead local ``scalar_node`` — an ``Apply`` node was
    built from the scalar op and Scalar variables but never used (this
    also removes the shadowing of the builtin ``input`` inside its
    comprehension).
    """
    # The version corresponding to the c code in this Op.
    version = [8]
    # Now we insert versions for the ops on which we depend...
    version.extend(self.scalar_op.c_code_cache_version())
    for var in node.inputs + node.outputs:
        version.extend(Scalar(dtype=var.type.dtype).c_code_cache_version())
    if all(version):
        return tuple(version)
    else:
        return ()
def
c_support_code_apply
(
self
,
node
,
nodename
):
sio
=
StringIO
()
nd_in
=
len
(
self
.
reduce_mask
)
if
all
(
i
==
1
for
i
in
self
.
reduce_mask
):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf
=
self
.
_k_reduce_buf
(
'Z[0]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[0]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_ccontig_
%(nodename)
s(
const unsigned int d0,
const float *A,
float * Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float myresult =
%(reduce_init)
s;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)
s
}
%(reducebuf)
s
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf
=
self
.
_k_reduce_buf
(
'Z[0]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[0]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_1_
%(nodename)
s(
const unsigned int d0,
const float *A, const int sA0,
float * Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float myresult =
%(reduce_init)
s;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)
s
}
%(reducebuf)
s
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
1
):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf
=
self
.
_k_reduce_buf
(
'Z[0]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[0]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_11_
%(nodename)
s(
const int d0,
const int d1,
const float *A, const int sA0, const int sA1,
float * Z)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult =
%(reduce_init)
s;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)
s;
}
}
%(reducebuf)
s
}
"""
%
locals
()
#01, 011, 0111
if
(
0
==
self
.
reduce_mask
[
0
]
and
all
(
self
.
reduce_mask
[
1
:])
and
nd_in
in
[
2
,
3
,
4
]):
# this kernel uses one block for each row.
# threads per block for each element per row.
N_pattern
=
''
.
join
([
'1'
]
*
(
nd_in
-
1
))
# TODO: is it faster to hardcode sA3, etc. in the later code, rather
# than have the for_* variables declare them and the later code use
# their names?
if
nd_in
==
2
:
for_i1
=
"for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)"
first_i1
=
'threadIdx.x'
sA1
=
'sA1'
for_i2
=
"int i2=0, sA2=0;"
sA2
=
'0'
first_i2
=
'0'
for_i3
=
"int i3=0, sA3=0;"
sA3
=
'0'
first_i3
=
'0'
if
nd_in
==
3
:
for_i1
=
"for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)"
first_i1
=
'threadIdx.y'
sA1
=
'sA1'
for_i2
=
"for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)"
first_i2
=
'threadIdx.x'
sA2
=
'sA2'
for_i3
=
"int i3=0, sA3=0;"
first_i3
=
0
sA3
=
'0'
if
nd_in
==
4
:
for_i1
=
"for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)"
first_i1
=
'threadIdx.z'
sA1
=
'sA1'
for_i2
=
"for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
first_i2
=
'threadIdx.y'
sA2
=
'sA2'
for_i3
=
"for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
first_i3
=
'threadIdx.x'
sA3
=
'sA3'
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0]'
,
node
,
nodename
,
sub
=
{})
param_dim
=
","
.
join
([
"const int d
%
d"
%
i
for
i
in
xrange
(
nd_in
)])
param_strides
=
","
.
join
([
"const int sA
%
d"
%
i
for
i
in
xrange
(
nd_in
)])
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_init
=
self
.
_assign_init
(
"A[
%(first_i3)
s *
%(sA3)
s +
%(first_i2)
s *
%(sA2)
s +
%(first_i1)
s *
%(sA1)
s + i0 * sA0]"
%
locals
())
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]"
,
{})
print
>>
sio
,
"""
%(decl)
s{
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
myresult =
%(reduce_init)
s;
%(for_i1)
s{
%(for_i2)
s{
%(for_i3)
s{
%(reduce_fct)
s;
}
}
}
%(reducebuf)
s
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
1
,
0
)
or
self
.
reduce_mask
==
(
1
,
0
):
# this kernel uses one block for each column,
# threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column).
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0 + i2*sZ1]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_010_
%(nodename)
s(
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float myresult =
%(reduce_init)
s;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)
s;
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
1
,
0
):
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"X[a * sX0 + b * sX1 + c * sX2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"X[a * sX0 + 0 * sX1 + c * sX2]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_010_AD_
%(nodename)
s(
const int A,
const int B,
const int C,
const int D,
//const int E, // THIS is 32
const float *X, const int sX0,
const int sX1, const int sX2,
float * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
float myresult = 0.0f;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int a = blockIdx.x; a < A; a += gridDim.x)
{
for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
{
int c = i2_D * 32 + threadIdx.x;
if (c < C)
{
myresult =
%(reduce_init)
s;
for (int b = 0; b < B; ++b)
{
%(reduce_fct)
s;
}
Z[a * sZ0 + c * sZ1] = myresult;
}
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
1
,
0
):
#
# This kernel is optimized when the inner most dimensions
# have the smallest stride.
# this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column.
#thread.x = dim 2 contiguous
#thread.y = dim 1
#block.x = dim 0
#block.y = dim 1 rest
init
=
self
.
_k_init
(
node
,
nodename
)
decl
=
self
.
_k_decl
(
node
,
nodename
,
pattern
=
"010_inner"
)
reducebuf
=
self
.
_k_reduce_buf_multiple
(
'Z[i0 * sZ0 + i2*sZ1]'
,
node
,
nodename
,
'blockDim.x'
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i0 * sA0 + 0 * sA1 + i2 * sA2]"
)
print
>>
sio
,
"""
%(decl)
s
{
if(warpSize<blockDim.x){
//TODO: set error code
Z[0] = -666;
return;
}
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i2 = blockIdx.y*blockDim.x+threadIdx.x; i2 < d2; i2 += gridDim.y*blockDim.x)
{
myresult =
%(reduce_init)
s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
%(reduce_fct)
s;
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
1
,
0
):
# this kernel uses one block for each column,
# threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column).
reducebuf
=
self
.
_k_reduce_buf
(
'Z[blockIdx.x * sZ0]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[blockIdx.x * sA2]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_110_
%(nodename)
s(
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult =
%(reduce_init)
s;
if (warpSize != 32)
{
//TODO: set error code
Z[blockIdx.x * sZ0] = -666;
return;
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)
s;
}
}
%(reducebuf)
s
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
0
,
0
):
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i1 * sZ0 + i2 * sZ1]'
,
node
,
nodename
,
sub
=
{})
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i1 * sA1 + i2 * sA2]"
)
print
>>
sio
,
"""
%(decl)
s
{
%(init)
s
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
for (int i1 = blockIdx.x; i1 < d1; i1 += gridDim.x)
{
myresult =
%(reduce_init)
s;
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)
s
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
1
,
1
):
reducebuf
=
self
.
_k_reduce_buf
(
'Z[0]'
,
node
,
nodename
,
sub
=
{})
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[0]"
)
print
>>
sio
,
"""
%(decl)
s
{
%(init)
s
myresult =
%(reduce_init)
s;
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
%(reduce_fct)
s;
}
}
}
%(reducebuf)
s
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
0
,
1
):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0 + i1 * sZ1]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i0 * sA0 + i1 * sA1]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_001_
%(nodename)
s(
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float myresult =
%(reduce_init)
s;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
%(reduce_fct)
s;
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
0
,
1
,
1
):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0 + i1 * sZ1]'
,
node
,
nodename
,
sub
=
{})
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i0 * sA0 + i1 * sA1]"
)
print
>>
sio
,
"""
%(decl)
s
{
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float myresult =
%(reduce_init)
s;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
%(reduce_fct)
s;
}
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
1
,
0
,
1
):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0 + i2 * sZ1]'
,
node
,
nodename
,
sub
=
{})
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i0 * sA0 + i2 * sA2]"
)
print
>>
sio
,
"""
%(decl)
s
{
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float myresult =
%(reduce_init)
s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
%(reduce_fct)
s;
}
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
1
,
1
,
1
):
reducebuf
=
self
.
_k_reduce_buf
(
'Z[0]'
,
node
,
nodename
,
sub
=
{})
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[0]"
)
print
>>
sio
,
"""
%(decl)
s
{
%(init)
s
myresult =
%(reduce_init)
s;
for (int i0 = 0; i0 < d0; i0++)
for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)
{
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
%(reduce_fct)
s;
}
}
}
%(reducebuf)
s
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
0
,
1
,
1
):
reducebuf
=
self
.
_k_reduce_buf
(
'Z[blockIdx.x*sZ0]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[blockIdx.x * sA1]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_1011_
%(nodename)
s(
const unsigned int d0,
const unsigned int d1,
const unsigned int d2,
const unsigned int d3,
const float *A, const int sA0, const int sA1,
const int sA2, const int sA3,
float * Z, const int sZ0)
{
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult =
%(reduce_init)
s;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
%(reduce_fct)
s;
}
}
}
%(reducebuf)
s
}
"""
%
locals
()
print
>>
sio
,
"""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a
%
b) ? 1: 0);
}
"""
return
sio
.
getvalue
()
class
GpuCAReduceCPY
(
GpuKernelBase
,
HideC
,
CAReduceDtype
):
"""CAReduce that reuse the python code from compyte.
Too slow for now as it only have a python interface.
"""
def
__init__
(
self
,
scalar_op
,
axis
=
None
,
dtype
=
None
,
acc_dtype
=
None
):
def
__init__
(
self
,
scalar_op
,
axis
=
None
,
dtype
=
None
,
acc_dtype
=
None
):
if
not
hasattr
(
scalar_op
,
'identity'
):
if
not
hasattr
(
scalar_op
,
'identity'
):
raise
ValueError
(
"No identity on scalar op"
)
raise
ValueError
(
"No identity on scalar op"
)
...
@@ -550,7 +2329,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -550,7 +2329,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
# cache the kernel object
# cache the kernel object
self
.
get_kernel_cache
(
node
)
self
.
get_kernel_cache
(
node
)
return
super
(
GpuCAReduce
,
self
)
.
make_thunk
(
node
,
storage_map
,
return
super
(
GpuCAReduce
CPY
,
self
)
.
make_thunk
(
node
,
storage_map
,
compute_map
,
no_recycling
)
compute_map
,
no_recycling
)
def
get_kernel_cache
(
self
,
node
):
def
get_kernel_cache
(
self
,
node
):
...
@@ -732,7 +2511,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -732,7 +2511,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
err = GpuKernel_call(&
%(k_var)
s, 0,
%(ls)
s, gs, args);
err = GpuKernel_call(&
%(k_var)
s, 0,
%(ls)
s, gs, args);
if (err != GA_NO_ERROR) {
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
PyErr_Format(PyExc_RuntimeError,
"compyte error: GpuCAReduce:
%%
s.",
"compyte error: GpuCAReduce
CPY
:
%%
s.",
GpuKernel_error(&
%(k_var)
s, err));
GpuKernel_error(&
%(k_var)
s, err));
%(fail)
s
%(fail)
s
}
}
...
@@ -741,7 +2520,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -741,7 +2520,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
err = GpuArray_move(&
%(output)
s->ga, &tmp->ga);
err = GpuArray_move(&
%(output)
s->ga, &tmp->ga);
if (err != GA_NO_ERROR) {
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
PyErr_Format(PyExc_RuntimeError,
"compyte error: GpuCAReduce [cast]:
%%
s.",
"compyte error: GpuCAReduce
CPY
[cast]:
%%
s.",
GpuArray_error(&tmp->ga, err));
GpuArray_error(&tmp->ga, err));
%(fail)
s
%(fail)
s
}
}
...
@@ -788,3 +2567,5 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -788,3 +2567,5 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
else
:
else
:
output
[
0
]
=
pygpu
.
gpuarray
.
array
(
input
,
copy
=
True
,
output
[
0
]
=
pygpu
.
gpuarray
.
array
(
input
,
copy
=
True
,
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
# To allow reloading old pickled files
GpuCAReduce
=
GpuCAReduceCPY
theano/sandbox/gpuarray/opt.py
浏览文件 @
06cdb1fa
...
@@ -14,9 +14,7 @@ from theano.sandbox.gpuarray.type import GpuArrayType
...
@@ -14,9 +14,7 @@ from theano.sandbox.gpuarray.type import GpuArrayType
from
theano.sandbox.gpuarray.basic_ops
import
(
host_from_gpu
,
from
theano.sandbox.gpuarray.basic_ops
import
(
host_from_gpu
,
gpu_from_host
,
gpu_from_host
,
gpu_alloc
,
gpu_alloc
,
gpu_shape
,
GpuAlloc
,
GpuAlloc
,
GpuShape
,
GpuReshape
,
GpuReshape
,
GpuEye
)
GpuEye
)
from
theano.sandbox.gpuarray.blas
import
gpu_dot22
,
GpuGemv
,
GpuGemm
from
theano.sandbox.gpuarray.blas
import
gpu_dot22
,
GpuGemv
,
GpuGemm
...
@@ -24,7 +22,7 @@ from theano.sandbox.gpuarray.conv import GpuConv
...
@@ -24,7 +22,7 @@ from theano.sandbox.gpuarray.conv import GpuConv
from
theano.sandbox.gpuarray.nnet
import
(
GpuCrossentropySoftmaxArgmax1HotWithBias
,
from
theano.sandbox.gpuarray.nnet
import
(
GpuCrossentropySoftmaxArgmax1HotWithBias
,
GpuCrossentropySoftmax1HotWithBiasDx
)
GpuCrossentropySoftmax1HotWithBiasDx
)
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
_is_scalar
,
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
_is_scalar
,
GpuDimShuffle
,
GpuCAReduce
)
GpuDimShuffle
,
GpuCAReduce
Cuda
)
from
theano.sandbox.gpuarray.subtensor
import
GpuIncSubtensor
,
GpuSubtensor
from
theano.sandbox.gpuarray.subtensor
import
GpuIncSubtensor
,
GpuSubtensor
from
theano.sandbox.gpuarray.type
import
GpuArrayConstant
from
theano.sandbox.gpuarray.type
import
GpuArrayConstant
...
@@ -249,9 +247,57 @@ def local_gpua_incsubtensor(node):
...
@@ -249,9 +247,57 @@ def local_gpua_incsubtensor(node):
def
local_gpua_careduce
(
node
):
def
local_gpua_careduce
(
node
):
if
(
isinstance
(
node
.
op
.
scalar_op
,
scalar
.
basic
.
Add
)
or
if
(
isinstance
(
node
.
op
.
scalar_op
,
scalar
.
basic
.
Add
)
or
isinstance
(
node
.
op
.
scalar_op
,
scalar
.
basic
.
Mul
)):
isinstance
(
node
.
op
.
scalar_op
,
scalar
.
basic
.
Mul
)):
return
GpuCAReduce
(
node
.
op
.
scalar_op
,
axis
=
node
.
op
.
axis
,
x
,
=
node
.
inputs
dtype
=
getattr
(
node
.
op
,
'dtype'
,
None
),
greduce
=
GpuCAReduceCuda
(
node
.
op
.
scalar_op
,
axis
=
node
.
op
.
axis
)
acc_dtype
=
getattr
(
node
.
op
,
'acc_dtype'
,
None
))
if
x
.
dtype
!=
"float32"
:
return
gvar
=
greduce
(
x
)
#We need to have the make node called, otherwise the mask can
#be None
if
gvar
.
owner
.
op
.
supports_c_code
([
gpu_from_host
(
x
)]):
return
greduce
else
:
# Try to make a simpler pattern based on reshaping
# The principle is that if two adjacent dimensions have
# the same value in the reduce_mask, then we can reshape
# to make them a single dimension, do the reduction, and
# then reshape to get them back.
if
node
.
op
.
axis
is
None
:
reduce_mask
=
[
1
]
*
x
.
type
.
ndim
else
:
reduce_mask
=
[
0
]
*
x
.
type
.
ndim
for
a
in
node
.
op
.
axis
:
assert
reduce_mask
[
a
]
==
0
reduce_mask
[
a
]
=
1
shape_of
=
node
.
fgraph
.
shape_feature
.
shape_of
x_shape
=
shape_of
[
x
]
new_in_shp
=
[
x_shape
[
0
]]
new_mask
=
[
reduce_mask
[
0
]]
for
i
in
xrange
(
1
,
x
.
type
.
ndim
):
if
reduce_mask
[
i
]
==
reduce_mask
[
i
-
1
]:
new_in_shp
[
-
1
]
*=
x_shape
[
i
]
else
:
new_mask
.
append
(
reduce_mask
[
i
])
new_in_shp
.
append
(
x_shape
[
i
])
new_greduce
=
GpuCAReduceCuda
(
new_mask
,
scalar_op
)
reshaped_x
=
x
.
reshape
(
tensor
.
stack
(
*
new_in_shp
))
gpu_reshaped_x
=
gpu_from_host
(
reshaped_x
)
reshaped_gpu_inputs
=
[
gpu_reshaped_x
]
if
new_greduce
.
supports_c_code
(
reshaped_gpu_inputs
):
reduce_reshaped_x
=
host_from_gpu
(
new_greduce
(
gpu_reshaped_x
))
if
reduce_reshaped_x
.
ndim
!=
node
.
outputs
[
0
]
.
ndim
:
unreshaped_reduce
=
reduce_reshaped_x
.
reshape
(
tensor
.
stack
(
*
shape_of
[
node
.
outputs
[
0
]]))
else
:
unreshaped_reduce
=
reduce_reshaped_x
return
[
unreshaped_reduce
]
@register_opt
()
@register_opt
()
...
@@ -296,20 +342,6 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
...
@@ -296,20 +342,6 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return
GpuCrossentropySoftmax1HotWithBiasDx
()
return
GpuCrossentropySoftmax1HotWithBiasDx
()
@register_opt
()
@local_optimizer
([
tensor
.
Shape
])
def
local_gpua_shape
(
node
):
"""
Can't use op_lifter as the output is on the GPU.
"""
if
isinstance
(
node
.
op
,
tensor
.
Shape
):
x
,
=
node
.
inputs
if
x
.
owner
and
x
.
owner
.
op
==
host_from_gpu
:
gpu_x
,
=
x
.
owner
.
inputs
return
[
gpu_shape
(
gpu_x
)]
return
False
@register_opt
()
@register_opt
()
@op_lifter
([
gpu_from_host
,
ConvOp
])
@op_lifter
([
gpu_from_host
,
ConvOp
])
def
local_gpu_conv
(
node
):
def
local_gpu_conv
(
node
):
...
...
theano/sandbox/gpuarray/tests/test_basic_ops.py
浏览文件 @
06cdb1fa
...
@@ -36,7 +36,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
...
@@ -36,7 +36,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc
,
gpu_from_cuda
,
gpu_alloc
,
gpu_from_cuda
,
cuda_from_gpu
,
HostFromGpu
,
cuda_from_gpu
,
HostFromGpu
,
GpuFromHost
,
GpuReshape
,
GpuFromHost
,
GpuReshape
,
GpuEye
,
GpuShape
)
GpuEye
)
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
utt
.
seed_rng
()
utt
.
seed_rng
()
...
@@ -307,7 +307,7 @@ def test_shape():
...
@@ -307,7 +307,7 @@ def test_shape():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
numpy
.
all
(
f
(
v
)
==
(
3
,
4
,
5
))
assert
numpy
.
all
(
f
(
v
)
==
(
3
,
4
,
5
))
assert
len
(
topo
)
==
1
assert
len
(
topo
)
==
1
assert
isinstance
(
topo
[
0
]
.
op
,
Gpu
Shape
)
assert
isinstance
(
topo
[
0
]
.
op
,
T
.
Shape
)
class
G_reshape
(
T_reshape
):
class
G_reshape
(
T_reshape
):
...
...
theano/sandbox/gpuarray/tests/test_elemwise.py
浏览文件 @
06cdb1fa
...
@@ -10,7 +10,7 @@ from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
...
@@ -10,7 +10,7 @@ from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
from
theano.sandbox.gpuarray.tests.test_basic_ops
import
rand_gpuarray
from
theano.sandbox.gpuarray.tests.test_basic_ops
import
rand_gpuarray
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
GpuDimShuffle
,
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
GpuDimShuffle
,
GpuCAReduce
)
GpuCAReduce
Cuda
,
GpuCAReduceCPY
)
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
pygpu.array
import
gpuarray
from
pygpu.array
import
gpuarray
...
@@ -37,10 +37,11 @@ class test_gpu_Broadcast(test_Broadcast):
...
@@ -37,10 +37,11 @@ class test_gpu_Broadcast(test_Broadcast):
class
test_GpuDimShuffle
(
test_DimShuffle
):
class
test_GpuDimShuffle
(
test_DimShuffle
):
op
=
GpuDimShuffle
op
=
GpuDimShuffle
class
test_GpuCAReduce
(
test_CAReduce
):
class
test_GpuCAReduceCPY
(
test_CAReduce
):
dtypes
=
[
"float32"
]
dtypes
=
[
"float32"
]
bin_dtypes
=
[
"uint8"
,
"int8"
]
bin_dtypes
=
[
"uint8"
,
"int8"
]
op
=
GpuCAReduce
op
=
GpuCAReduce
CPY
reds
=
[
scalar
.
add
,
scalar
.
mul
]
reds
=
[
scalar
.
add
,
scalar
.
mul
]
def
test_perform
(
self
):
def
test_perform
(
self
):
...
@@ -64,3 +65,87 @@ class test_GpuCAReduce(test_CAReduce):
...
@@ -64,3 +65,87 @@ class test_GpuCAReduce(test_CAReduce):
for
op
in
self
.
reds
:
for
op
in
self
.
reds
:
self
.
with_linker
(
gof
.
CLinker
(),
op
,
dtype
=
dtype
,
self
.
with_linker
(
gof
.
CLinker
(),
op
,
dtype
=
dtype
,
test_nan
=
True
)
test_nan
=
True
)
def
test_infer_shape
(
self
):
for
dtype
in
self
.
dtypes
:
test_CAReduce
.
test_infer_shape
(
self
,
dtype
)
class
test_GpuCAReduceCuda
(
test_GpuCAReduceCPY
):
dtypes
=
[
"float32"
]
bin_dtypes
=
[
"uint8"
,
"int8"
]
bin_dtypes
=
[]
cases
=
[((
5
,
6
),
None
),
((
5
,
6
),
(
0
,
1
)),
((
5
,
6
),
(
0
,
)),
((
5
,
6
),
(
1
,
)),
((
5
,
6
),
(
-
1
,
)),
((
5
,
6
),
(
-
2
,
)),
#((5, 6), ()), #reduce on no axis(copy) isn't implemented
#((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
#((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
((
5
,
0
),
None
),
((
5
,
0
),
(
0
,
)),
((
5
,
0
),
(
1
,
)),
#((5, 0), ()), reduce on no axis isn't implemented
#((), None), reduce on no axis isn't implemented
#((), ()) reduce on no axis isn't implemented
#Test all GPU cases implemented
((
1
,
0
),(
1
,)),
((
0
,
1
),(
1
,)),
((
0
,
0
),(
1
,)),
((
0
,
0
,
0
),(
1
,
2
)),
((
0
,
0
,
0
,
0
),(
1
,
2
,
3
)),
((
2
,
1
),(
1
,)),
((
1
,
2
),(
1
,)),
((
100
,
3
,
1300
),[
1
]),
((
0
,),[
0
]),((
5
,),[
0
]),
((
0
,
0
),[
0
,
1
]),((
1
,
0
),[
0
,
1
]),((
5
,
4
),[
0
,
1
]),((
33
,
31
),[
0
,
1
]),((
5
,
4
),[
1
]),((
5
,
4
),[
0
]),
#need something bigger then 32 for some opt test.
((
5
,
4
,
3
),[
0
]),((
5
,
4
,
3
),[
1
]),((
5
,
4
,
3
),[
0
,
1
]),((
5
,
4
,
3
),[
2
]),((
5
,
4
,
3
),[
1
,
2
]),((
5
,
4
,
3
),[
0
,
1
,
2
]),
((
0
,
0
,
0
,
0
),[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
20
),[
2
,
3
]),
((
5
,
4
,
3
,
2
),[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
2
),[
0
,
2
,
3
]),((
5
,
4
,
3
,
2
),[
1
,
2
,
3
]),
#test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
((
4100
,
3
),[
0
]),((
3
,
4101
),[
0
]),
#10
((
1024
,
33
),[
0
]),((
33
,
1024
),[
0
]),
#10
((
1025
,
33
),[
0
]),((
33
,
1025
),[
0
]),
#10
((
4100
,
3
),[
1
]),((
3
,
4101
),[
1
]),
#01
((
1024
,
33
),[
1
]),((
33
,
1024
),[
1
]),
#01
((
1025
,
33
),[
1
]),((
33
,
1025
),[
1
]),
#01
((
4100
,
3
),[
0
,
1
]),((
3
,
4101
),[
0
,
1
]),
#11
((
1024
,
33
),[
0
,
1
]),((
33
,
1024
),[
0
,
1
]),
#01
((
1025
,
33
),[
0
,
1
]),((
33
,
1025
),[
0
,
1
]),
#01
((
4100
,
4
,
3
),[
0
]),((
5
,
4100
,
3
),[
0
]),((
5
,
4
,
4100
),[
0
]),
((
3
,
65536
,
1
),
[
0
]),
#100
((
4100
,
4
,
3
),[
1
]),((
5
,
4100
,
3
),[
1
]),((
5
,
4
,
4100
),[
1
]),
#010
((
4100
,
4
,
3
),[
2
]),((
5
,
4100
,
3
),[
2
]),((
5
,
4
,
4100
),[
2
]),
#001
((
4100
,
4
,
3
),[
0
,
1
]),((
5
,
4100
,
3
),[
0
,
1
]),((
5
,
4
,
4100
),[
0
,
1
]),
#110
((
4100
,
4
,
3
),[
1
,
2
]),((
5
,
4100
,
3
),[
1
,
2
]),((
5
,
4
,
4100
),[
1
,
2
]),
#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((
4100
,
4
,
3
),[
0
,
1
,
2
]),((
5
,
4100
,
3
),[
0
,
1
,
2
]),((
5
,
4
,
4100
),[
0
,
1
,
2
]),
#111
((
4100
,
4
,
3
,
2
),[
2
,
3
]),((
4
,
4100
,
3
,
2
),[
2
,
3
]),((
4
,
3
,
4100
,
2
),[
2
,
3
]),((
4
,
3
,
2
,
4100
),[
2
,
3
]),
#0011
((
4100
,
4
,
3
,
2
),[
1
,
3
]),((
4
,
4100
,
3
,
2
),[
1
,
3
]),((
4
,
3
,
4100
,
2
),[
1
,
3
]),((
4
,
3
,
2
,
4100
),[
1
,
3
]),
#0101
((
4100
,
4
,
3
,
2
),[
0
,
2
,
3
]),((
4
,
4100
,
3
,
2
),[
0
,
2
,
3
]),((
4
,
3
,
4100
,
2
),[
0
,
2
,
3
]),
#((4,3,2,4100),[0,2,3]),#1011
((
4100
,
4
,
3
,
2
),[
1
,
2
,
3
]),((
4
,
4100
,
3
,
2
),[
1
,
2
,
3
]),((
4
,
3
,
4100
,
2
),[
1
,
2
,
3
]),((
4
,
3
,
2
,
4100
),[
1
,
2
,
3
]),
#0111
((
4100
,
2
,
3
,
4
),[
0
,
1
,
2
,
3
]),((
2
,
4100
,
3
,
4
),[
0
,
1
,
2
,
3
]),((
2
,
3
,
4100
,
4
),[
0
,
1
,
2
,
3
]),((
2
,
3
,
4
,
4100
),[
0
,
1
,
2
,
3
]),((
128
,
1
,
3
,
3
),
[
0
,
1
,
2
,
3
]),
#1111
#test pattern implemented by reshape
# ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
# ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
# ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
# ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
# ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
# ((5,4,3,10,11),[1,2]),
]
op
=
GpuCAReduceCuda
reds
=
[
scalar
.
add
,
scalar
.
mul
]
def
test_perform
(
self
):
return
def
test_perform_nan
(
self
):
return
theano/sandbox/gpuarray/tests/test_opt.py
浏览文件 @
06cdb1fa
...
@@ -3,7 +3,7 @@ import numpy
...
@@ -3,7 +3,7 @@ import numpy
import
theano
import
theano
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
from
theano.sandbox.gpuarray.basic_ops
import
GpuAlloc
,
GpuReshape
,
gpu_alloc
from
theano.sandbox.gpuarray.basic_ops
import
GpuAlloc
,
GpuReshape
,
gpu_alloc
from
theano.sandbox.gpuarray.elemwise
import
GpuCAReduce
from
theano.sandbox.gpuarray.elemwise
import
GpuCAReduce
Cuda
import
theano.sandbox.gpuarray
import
theano.sandbox.gpuarray
from
theano.tests.unittest_tools
import
SkipTest
from
theano.tests.unittest_tools
import
SkipTest
...
@@ -69,7 +69,7 @@ def test_sum_prod():
...
@@ -69,7 +69,7 @@ def test_sum_prod():
res
=
f
(
val
)
res
=
f
(
val
)
utt
.
assert_allclose
(
res
,
val
.
sum
())
utt
.
assert_allclose
(
res
,
val
.
sum
())
assert
res
.
shape
==
()
assert
res
.
shape
==
()
assert
GpuCAReduce
in
[
type
(
node
.
op
)
assert
GpuCAReduce
Cuda
in
[
type
(
node
.
op
)
for
node
in
f
.
maker
.
fgraph
.
toposort
()]
for
node
in
f
.
maker
.
fgraph
.
toposort
()]
...
...
theano/sandbox/gpuarray/type.py
浏览文件 @
06cdb1fa
...
@@ -298,6 +298,23 @@ theano.compile.register_view_op_c_code(GpuArrayType, """
...
@@ -298,6 +298,23 @@ theano.compile.register_view_op_c_code(GpuArrayType, """
Py_XINCREF(
%(oname)
s);
Py_XINCREF(
%(oname)
s);
"""
,
version
=
(
0
,))
"""
,
version
=
(
0
,))
# Register GpuArrayType C code for Shape Op.
theano
.
compile
.
register_shape_c_code
(
GpuArrayType
,
"""
npy_intp shape[] = {
%(iname)
s->ga.nd};
if(
%(oname)
s == NULL || (PyArray_DIMS(
%(oname)
s)[0] != shape[0]))
{
Py_XDECREF(
%(oname)
s);
%(oname)
s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
}
for(int i=0;i<shape[0];i++)
{
((npy_int64*)PyArray_GETPTR1(
%(oname)
s, i))[0] =
%(iname)
s->ga.dimensions[i];
}
"""
,
version
=
1
)
theano
.
compile
.
register_shape_i_c_code
(
GpuArrayType
,
"""
theano
.
compile
.
register_shape_i_c_code
(
GpuArrayType
,
"""
if(!
%(oname)
s)
if(!
%(oname)
s)
%(oname)
s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
%(oname)
s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
...
...
theano/tensor/basic.py
浏览文件 @
06cdb1fa
...
@@ -25,6 +25,8 @@ from theano.gof.python25 import partial, any, all
...
@@ -25,6 +25,8 @@ from theano.gof.python25 import partial, any, all
from
theano.gof.utils
import
hashtype
from
theano.gof.utils
import
hashtype
from
theano
import
compile
,
printing
from
theano
import
compile
,
printing
from
theano.printing
import
pprint
,
min_informative_str
from
theano.printing
import
pprint
,
min_informative_str
from
theano.compile
import
Shape
,
shape
#For history
# We use these exceptions as well.
# We use these exceptions as well.
import
theano.scalar.sharedvar
import
theano.scalar.sharedvar
...
@@ -1125,83 +1127,6 @@ def cast(x, dtype):
...
@@ -1125,83 +1127,6 @@ def cast(x, dtype):
##########################
##########################
class
Shape
(
Op
):
"""
L{Op} to return the shape of a matrix.
@note: Non-differentiable.
"""
def
__hash__
(
self
):
return
hash
(
type
(
self
))
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
def
__str__
(
self
):
return
self
.
__class__
.
__name__
def
make_node
(
self
,
x
):
# Must work for all type that have a shape attribute.
# This will fail at execution time.
x
=
as_tensor_variable
(
x
)
# Each type variable should implement their .shape attribute
# and have the fct infer_shape() implemented in the op that convert
# the type to TensorVariable to have the optimization working
# correctly.
return
Apply
(
self
,
[
x
],
[
lvector
()])
def
perform
(
self
,
node
,
inp
,
out_
):
x
,
=
inp
out
,
=
out_
out
[
0
]
=
theano
.
_asarray
(
x
.
shape
,
dtype
=
'int64'
)
def
infer_shape
(
self
,
node
,
in_shapes
):
return
[[
len
(
in_shapes
[
0
])]]
def
connection_pattern
(
self
,
node
):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return
[[
False
]]
def
grad
(
self
,
inp
,
grads
):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return
[
DisconnectedType
()()]
def
R_op
(
self
,
inputs
,
eval_points
):
return
[
None
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
x
,
=
inp
z
,
=
out
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
TensorType
):
return
"""
npy_intp shape[] = {PyArray_NDIM(
%(x)
s)};
if(
%(z)
s == NULL || (PyArray_DIMS(
%(z)
s)[0] != shape[0]))
{
Py_XDECREF(
%(z)
s);
%(z)
s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
}
for(int i=0;i<shape[0];i++)
{
((npy_int64*)PyArray_GETPTR1(
%(z)
s, i))[0] = PyArray_DIMS(
%(x)
s)[i];
}
"""
%
locals
()
else
:
#TODO: if your type is not listed here, make a damn registry of
# shape_i ops for various types of variables.
# Do not continue this madness.
return
super
(
Shape
,
self
)
.
c_code
(
node
,
nodename
,
(
x
,),
(
out
,),
sub
)
def
c_code_cache_version
(
self
):
return
(
1
,)
@constructor
@constructor
def
old_shape
(
a
):
def
old_shape
(
a
):
"""
"""
...
@@ -1223,10 +1148,6 @@ def old_shape(a):
...
@@ -1223,10 +1148,6 @@ def old_shape(a):
# a tuple directly. This tuple is like the numpy.ndarray.shape tuple.
# a tuple directly. This tuple is like the numpy.ndarray.shape tuple.
return
va
.
type
.
shape
return
va
.
type
.
shape
shape
=
Shape
()
_shape
=
shape
# was used in the past, now use shape directly.
pprint
.
assign
(
_shape
,
printing
.
MemberPrinter
(
'shape'
))
class
SpecifyShape
(
Op
):
class
SpecifyShape
(
Op
):
"""
"""
...
...
theano/tensor/nnet/nnet.py
浏览文件 @
06cdb1fa
...
@@ -1398,7 +1398,7 @@ def _check_rows_is_arange_len_labels(rows, labels):
...
@@ -1398,7 +1398,7 @@ def _check_rows_is_arange_len_labels(rows, labels):
shape_subtensor
=
stop
.
owner
shape_subtensor
=
stop
.
owner
if
list
(
shape_subtensor
.
op
.
idx_list
)
==
[
0
]:
if
list
(
shape_subtensor
.
op
.
idx_list
)
==
[
0
]:
shape_var
,
=
shape_subtensor
.
inputs
shape_var
,
=
shape_subtensor
.
inputs
if
shape_var
.
owner
and
shape_var
.
owner
.
op
==
tensor
.
_
shape
:
if
shape_var
.
owner
and
shape_var
.
owner
.
op
==
tensor
.
shape
:
return
shape_var
.
owner
.
inputs
[
0
]
is
labels
return
shape_var
.
owner
.
inputs
[
0
]
is
labels
else
:
else
:
shape_of
=
stop
.
owner
.
fgraph
.
shape_feature
.
shape_of
shape_of
=
stop
.
owner
.
fgraph
.
shape_feature
.
shape_of
...
...
theano/tensor/tests/test_elemwise.py
浏览文件 @
06cdb1fa
...
@@ -284,10 +284,7 @@ class test_Broadcast(unittest.TestCase):
...
@@ -284,10 +284,7 @@ class test_Broadcast(unittest.TestCase):
class
test_CAReduce
(
unittest_tools
.
InferShapeTester
):
class
test_CAReduce
(
unittest_tools
.
InferShapeTester
):
op
=
CAReduce
op
=
CAReduce
cases
=
[((
5
,
6
),
None
),
def
with_linker
(
self
,
linker
,
scalar_op
=
scalar
.
add
,
dtype
=
"floatX"
,
test_nan
=
False
,
tensor_op
=
None
):
for
xsh
,
tosum
in
[((
5
,
6
),
None
),
((
5
,
6
),
(
0
,
1
)),
((
5
,
6
),
(
0
,
1
)),
((
5
,
6
),
(
0
,
)),
((
5
,
6
),
(
0
,
)),
((
5
,
6
),
(
1
,
)),
((
5
,
6
),
(
1
,
)),
...
@@ -301,7 +298,12 @@ class test_CAReduce(unittest_tools.InferShapeTester):
...
@@ -301,7 +298,12 @@ class test_CAReduce(unittest_tools.InferShapeTester):
((
5
,
0
),
(
1
,
)),
((
5
,
0
),
(
1
,
)),
((
5
,
0
),
()),
((
5
,
0
),
()),
((),
None
),
((),
None
),
((),
())]:
((),
())
]
def
with_linker
(
self
,
linker
,
scalar_op
=
scalar
.
add
,
dtype
=
"floatX"
,
test_nan
=
False
,
tensor_op
=
None
):
for
xsh
,
tosum
in
self
.
cases
:
if
dtype
==
"floatX"
:
if
dtype
==
"floatX"
:
dtype
=
theano
.
config
.
floatX
dtype
=
theano
.
config
.
floatX
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
...
@@ -400,17 +402,22 @@ class test_CAReduce(unittest_tools.InferShapeTester):
...
@@ -400,17 +402,22 @@ class test_CAReduce(unittest_tools.InferShapeTester):
if
scalar_op
in
[
scalar
.
and_
,
scalar
.
or_
]:
if
scalar_op
in
[
scalar
.
and_
,
scalar
.
or_
]:
zv
=
numpy
.
asarray
(
zv
,
dtype
=
'int8'
)
zv
=
numpy
.
asarray
(
zv
,
dtype
=
'int8'
)
if
test_nan
:
if
test_nan
:
self
.
assertTrue
(
theano
.
tensor
.
TensorType
.
values_eq
(
f
(
xv
),
try
:
zv
),
self
.
assertTrue
(
theano
.
tensor
.
TensorType
.
values_eq
(
f
(
xv
),
zv
),
(
f
(
xv
),
zv
))
(
f
(
xv
),
zv
))
except
NotImplementedError
:
# GpuCAReduce don't implement all cases when size is 0
assert
xv
.
size
==
0
else
:
else
:
try
:
f_xv
=
f
(
xv
)
f_xv
=
f
(
xv
)
self
.
assertTrue
((
f_xv
.
shape
==
zv
.
shape
),
(
f_xv
,
zv
))
self
.
assertTrue
((
f_xv
.
shape
==
zv
.
shape
),
(
f_xv
,
zv
))
self
.
assertTrue
(
numpy
.
allclose
(
f_xv
,
zv
),
(
f_xv
,
zv
))
self
.
assertTrue
(
numpy
.
allclose
(
f_xv
,
zv
),
(
f_xv
,
zv
))
except
NotImplementedError
:
# GpuCAReduce don't implement all cases when size is 0
assert
xv
.
size
==
0
#test CAReduce.infer_shape
#the Shape op don't implement c_code!
if
isinstance
(
linker
,
gof
.
PerformLinker
):
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
if
tensor_op
is
None
:
if
tensor_op
is
None
:
e
=
self
.
op
(
scalar_op
,
axis
=
tosum
)(
x
)
e
=
self
.
op
(
scalar_op
,
axis
=
tosum
)(
x
)
...
@@ -422,7 +429,11 @@ class test_CAReduce(unittest_tools.InferShapeTester):
...
@@ -422,7 +429,11 @@ class test_CAReduce(unittest_tools.InferShapeTester):
[
e
.
shape
]))
.
make_function
()
[
e
.
shape
]))
.
make_function
()
if
not
(
scalar_op
in
[
scalar
.
maximum
,
scalar
.
minimum
]
and
if
not
(
scalar_op
in
[
scalar
.
maximum
,
scalar
.
minimum
]
and
((
xsh
==
()
or
numpy
.
prod
(
xsh
)
==
0
))):
((
xsh
==
()
or
numpy
.
prod
(
xsh
)
==
0
))):
try
:
assert
all
(
f
(
xv
)
==
zv
.
shape
)
assert
all
(
f
(
xv
)
==
zv
.
shape
)
except
NotImplementedError
:
# GpuCAReduce don't implement all cases when size is 0
assert
xv
.
size
==
0
def
test_perform
(
self
):
def
test_perform
(
self
):
for
dtype
in
[
"floatX"
,
"complex64"
,
"complex128"
,
"int8"
,
"uint8"
]:
for
dtype
in
[
"floatX"
,
"complex64"
,
"complex128"
,
"int8"
,
"uint8"
]:
...
@@ -487,30 +498,19 @@ class test_CAReduce(unittest_tools.InferShapeTester):
...
@@ -487,30 +498,19 @@ class test_CAReduce(unittest_tools.InferShapeTester):
self
.
with_linker
(
gof
.
CLinker
(),
scalar
.
maximum
,
dtype
=
dtype
,
self
.
with_linker
(
gof
.
CLinker
(),
scalar
.
maximum
,
dtype
=
dtype
,
test_nan
=
True
)
test_nan
=
True
)
def
test_infer_shape
(
self
):
def
test_infer_shape
(
self
,
dtype
=
None
):
for
xsh
,
tosum
in
[((
5
,
6
),
None
),
if
dtype
is
None
:
((
5
,
6
),
(
0
,
1
)),
((
5
,
6
),
(
0
,
)),
((
5
,
6
),
(
1
,
)),
((
5
,
6
),
(
-
1
,
)),
((
5
,
6
),
(
-
2
,
)),
((
2
,
3
,
4
,
5
),
(
0
,
1
,
3
)),
((
2
,
3
,
4
,
5
),
(
-
2
,
-
3
)),
((
5
,
0
),
None
),
((
5
,
0
),
(
0
,
)),
((
5
,
0
),
(
1
,
)),
((
5
,
6
),
()),
((
5
,
0
),
()),
((),
None
),
((),
())]:
dtype
=
theano
.
config
.
floatX
dtype
=
theano
.
config
.
floatX
for
xsh
,
tosum
in
self
.
cases
:
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
if
tosum
is
None
:
if
tosum
is
None
:
tosum
=
range
(
len
(
xsh
))
tosum
=
range
(
len
(
xsh
))
xv
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
*
xsh
),
dtype
=
dtype
)
xv
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
*
xsh
),
dtype
=
dtype
)
self
.
_compile_and_check
([
x
],
self
.
_compile_and_check
([
x
],
[
self
.
op
(
scalar
.
add
,
axis
=
tosum
)(
x
)],
[
self
.
op
(
scalar
.
add
,
axis
=
tosum
)(
x
)],
[
xv
],
self
.
op
,
[
"local_cut_useless_reduce"
])
[
xv
],
self
.
op
,
[
"local_cut_useless_reduce"
],
warn
=
0
not
in
xsh
)
class
test_Prod
(
unittest
.
TestCase
):
class
test_Prod
(
unittest
.
TestCase
):
...
...
theano/tensor/type.py
浏览文件 @
06cdb1fa
...
@@ -611,6 +611,25 @@ theano.compile.register_view_op_c_code(
...
@@ -611,6 +611,25 @@ theano.compile.register_view_op_c_code(
"""
,
"""
,
version
=
1
)
version
=
1
)
# Register TensorType C code for Shape Op.
theano
.
compile
.
register_shape_c_code
(
TensorType
,
"""
npy_intp shape[] = {PyArray_NDIM(
%(iname)
s)};
if(
%(oname)
s == NULL || (PyArray_DIMS(
%(oname)
s)[0] != shape[0]))
{
Py_XDECREF(
%(oname)
s);
%(oname)
s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
}
for(int i=0;i<shape[0];i++)
{
((npy_int64*)PyArray_GETPTR1(
%(oname)
s, i))[0] = PyArray_DIMS(
%(iname)
s)[i];
}
"""
,
version
=
1
)
# Register TensorType C code for ViewOp.
# Register TensorType C code for ViewOp.
theano
.
compile
.
register_shape_i_c_code
(
theano
.
compile
.
register_shape_i_c_code
(
TensorType
,
TensorType
,
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论