Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
1f6b0c73
提交
1f6b0c73
authored
2月 20, 2012
作者:
nouiz
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #437 from lamblin/blas_double_strides
Make blas functions work with all stride patterns
上级
b96c7d5c
3316ba1e
隐藏空白字符变更
内嵌
并排
正在显示
10 个修改的文件
包含
1112 行增加
和
264 行删除
+1112
-264
blas.py
theano/sandbox/cuda/blas.py
+104
-77
cuda_ndarray.cu
theano/sandbox/cuda/cuda_ndarray.cu
+251
-105
cuda_ndarray.cuh
theano/sandbox/cuda/cuda_ndarray.cuh
+21
-12
test_blas.py
theano/sandbox/cuda/tests/test_blas.py
+86
-7
blas.py
theano/tensor/blas.py
+10
-4
blas_c.py
theano/tensor/blas_c.py
+110
-35
blas_scipy.py
theano/tensor/blas_scipy.py
+6
-1
test_blas.py
theano/tensor/tests/test_blas.py
+487
-16
test_blas_c.py
theano/tensor/tests/test_blas_c.py
+31
-6
test_blas_scipy.py
theano/tensor/tests/test_blas_scipy.py
+6
-1
没有找到文件。
theano/sandbox/cuda/blas.py
浏览文件 @
1f6b0c73
...
...
@@ -192,7 +192,7 @@ class GpuGemm(GpuOp):
return
Apply
(
self
,
[
z
,
a
,
x
,
y
,
b
],
[
z
.
type
()])
def
c_code_cache_version
(
self
):
return
(
3
,)
return
(
4
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
...
...
@@ -200,6 +200,7 @@ class GpuGemm(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
,
b
=
inputs
z_out
,
=
outputs
inplace
=
int
(
self
.
inplace
)
fail
=
sub
[
'fail'
]
sio
=
StringIO
.
StringIO
()
...
...
@@ -215,39 +216,50 @@ class GpuGemm(GpuOp):
: (REAL)(((double*)
%(b)
s->data)[0]);
#undef REAL
"""
if
self
.
inplace
:
print
>>
sio
,
"""
if (
%(inplace)
s
&& (CudaNdarray_HOST_STRIDES(
%(z_in)
s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(
%(z_in)
s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(
%(z_in)
s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(
%(z_in)
s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(
%(z_in)
s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(
%(z_in)
s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(
%(z_out)
s);
%(z_out)
s =
%(z_in)
s;
Py_INCREF(
%(z_out)
s);
"""
else
:
print
>>
sio
,
"""
if (!
%(z_out)
s
|| (
%(z_out)
s->nd != 2)
|| (CudaNdarray_HOST_DIMS(
%(z_out)
s)[0] != CudaNdarray_HOST_DIMS(
%(z_in)
s)[0])
|| (CudaNdarray_HOST_DIMS(
%(z_out)
s)[1] != CudaNdarray_HOST_DIMS(
%(z_in)
s)[1])
)
}
else if (
%(z_out)
s
&& (
%(z_out)
s->nd == 2)
&& (CudaNdarray_HOST_DIMS(
%(z_out)
s)[0]
== CudaNdarray_HOST_DIMS(
%(z_in)
s)[0])
&& (CudaNdarray_HOST_DIMS(
%(z_out)
s)[1]
== CudaNdarray_HOST_DIMS(
%(z_in)
s)[1])
&& (CudaNdarray_HOST_STRIDES(
%(z_out)
s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(
%(z_out)
s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(
%(z_out)
s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(
%(z_out)
s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(
%(z_out)
s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(
%(z_out)
s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(
%(z_out)
s,
%(z_in)
s))
{
Py_XDECREF(
%(z_out)
s);
%(z_out)
s = (CudaNdarray*)CudaNdarray_Copy(
%(z_in)
s);
if (!
%(z_out)
s)
{
%(fail)
s;
}
%(fail)
s;
}
else
}
else
{
// Copy the input, use the copy as output
Py_XDECREF(
%(z_out)
s);
%(z_out)
s = (CudaNdarray*)CudaNdarray_Copy(
%(z_in)
s);
if (!
%(z_out)
s)
{
if (CudaNdarray_CopyFromCudaNdarray(
%(z_out)
s,
%(z_in)
s))
{
%(fail)
s;
}
%(fail)
s;
}
"""
}
print
>>
sio
,
"""
if (CudaNdarray_gemm(
%(name)
s_a,
%(x)
s,
%(y)
s,
%(name)
s_b,
%(z_out)
s))
{
%(fail)
s;
...
...
@@ -294,7 +306,7 @@ class GpuGemv(GpuOp):
return
Apply
(
self
,
[
z
,
a
,
x
,
y
,
b
],
[
z
.
type
()])
def
c_code_cache_version
(
self
):
return
(
1
,)
return
(
2
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
...
...
@@ -302,44 +314,46 @@ class GpuGemv(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
,
b
=
inputs
z_out
,
=
outputs
inplace
=
int
(
self
.
inplace
)
fail
=
sub
[
'fail'
]
sio
=
StringIO
.
StringIO
()
print
>>
sio
,
"""
float
%(name)
s_alpha = ((dtype_
%(a)
s*)(
%(a)
s->data))[0];
float
%(name)
s_beta = ((dtype_
%(b)
s*)(
%(b)
s->data))[0];
"""
if
self
.
inplace
:
print
>>
sio
,
"""
if (
%(inplace)
s
&& ((CudaNdarray_HOST_STRIDES(
%(z_in)
s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(
%(z_in)
s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(
%(z_in)
s)[0] == 1))))
{
// Work inplace on the input
Py_XDECREF(
%(z_out)
s);
%(z_out)
s =
%(z_in)
s;
Py_INCREF(
%(z_out)
s);
"""
else
:
print
>>
sio
,
"""
if (!
%(z_out)
s
|| (
%(z_out)
s->nd != 1)
|| (CudaNdarray_HOST_DIMS(
%(z_out)
s)[0] != CudaNdarray_HOST_DIMS(
%(z_in)
s)[0])
)
}
else if (
%(z_out)
s
&& ((CudaNdarray_HOST_STRIDES(
%(z_out)
s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(
%(z_out)
s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(
%(z_out)
s)[0] == 1))))
{
// Work on the output
if (CudaNdarray_CopyFromCudaNdarray(
%(z_out)
s,
%(z_in)
s))
{
Py_XDECREF(
%(z_out)
s);
%(z_out)
s = (CudaNdarray*)CudaNdarray_Copy(
%(z_in)
s);
if (!
%(z_out)
s)
{
%(fail)
s;
}
%(fail)
s;
}
else
}
else
{
// Copy
Py_XDECREF(
%(z_out)
s);
%(z_out)
s = (CudaNdarray*)CudaNdarray_Copy(
%(z_in)
s);
if (!
%(z_out)
s)
{
if (CudaNdarray_CopyFromCudaNdarray(
%(z_out)
s,
%(z_in)
s))
{
%(fail)
s;
}
%(fail)
s;
}
"""
}
print
>>
sio
,
"""
if (CudaNdarray_sgemv(
%(name)
s_alpha,
%(x)
s,
%(y)
s,
%(name)
s_beta,
%(z_out)
s))
{
%(fail)
s;
...
...
@@ -385,7 +399,7 @@ class GpuGer(GpuOp):
return
Apply
(
self
,
[
z
,
a
,
x
,
y
],
[
z
.
type
()])
def
c_code_cache_version
(
self
):
return
(
1
,)
return
(
2
,)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
#z_out = alpha * dot(x,y) + beta * z_in
...
...
@@ -393,44 +407,57 @@ class GpuGer(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in
,
a
,
x
,
y
=
inputs
z_out
,
=
outputs
inplace
=
int
(
self
.
inplace
)
fail
=
sub
[
'fail'
]
sio
=
StringIO
.
StringIO
()
print
>>
sio
,
"""
float
%(name)
s_alpha = ((dtype_
%(a)
s*)(
%(a)
s->data))[0];
"""
if
self
.
inplace
:
print
>>
sio
,
"""
if (
%(inplace)
s
&& (CudaNdarray_HOST_STRIDES(
%(z_in)
s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(
%(z_in)
s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(
%(z_in)
s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(
%(z_in)
s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(
%(z_in)
s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(
%(z_in)
s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(
%(z_out)
s);
%(z_out)
s =
%(z_in)
s;
Py_INCREF(
%(z_out)
s);
"""
else
:
print
>>
sio
,
"""
if (!
%(z_out)
s
|| (
%(z_out)
s->nd != 2)
|| (CudaNdarray_HOST_DIMS(
%(z_out)
s)[0] != CudaNdarray_HOST_DIMS(
%(z_in)
s)[0])
|| (CudaNdarray_HOST_DIMS(
%(z_out)
s)[1] != CudaNdarray_HOST_DIMS(
%(z_in)
s)[1])
)
}
else if (
%(z_out)
s
&& (
%(z_out)
s->nd == 2)
&& (CudaNdarray_HOST_DIMS(
%(z_out)
s)[0]
== CudaNdarray_HOST_DIMS(
%(z_in)
s)[0])
&& (CudaNdarray_HOST_DIMS(
%(z_out)
s)[1]
== CudaNdarray_HOST_DIMS(
%(z_in)
s)[1])
&& (CudaNdarray_HOST_STRIDES(
%(z_out)
s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(
%(z_out)
s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(
%(z_out)
s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(
%(z_out)
s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(
%(z_out)
s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(
%(z_out)
s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(
%(z_out)
s,
%(z_in)
s))
{
Py_XDECREF(
%(z_out)
s);
%(z_out)
s = (CudaNdarray*)CudaNdarray_Copy(
%(z_in)
s);
if (!
%(z_out)
s)
{
%(fail)
s;
}
%(fail)
s;
}
else
}
else
{
// Copy the input, use the copy as output
Py_XDECREF(
%(z_out)
s);
%(z_out)
s = (CudaNdarray*)CudaNdarray_Copy(
%(z_in)
s);
if (!
%(z_out)
s)
{
if (CudaNdarray_CopyFromCudaNdarray(
%(z_out)
s,
%(z_in)
s))
{
%(fail)
s;
}
%(fail)
s;
}
"""
}
print
>>
sio
,
"""
if (CudaNdarray_sger(
%(name)
s_alpha,
%(x)
s,
%(y)
s,
%(z_out)
s))
{
%(fail)
s;
...
...
theano/sandbox/cuda/cuda_ndarray.cu
浏览文件 @
1f6b0c73
...
...
@@ -453,7 +453,7 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape)
PyObject
*
CudaNdarray_Copy
(
CudaNdarray
*
self
)
PyObject
*
CudaNdarray_Copy
(
const
CudaNdarray
*
self
)
{
PyObject
*
rval
=
CudaNdarray_New
();
if
((
!
rval
)
||
(
-
1
==
self
->
nd
))
...
...
@@ -1095,7 +1095,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
k3
<<<
n_blocks
,
n_threads
>>>
(
1
,
//d0
1
,
//d1
1
,
//d2
1
,
//d2
CudaNdarray_DEV_DATA
(
self
),
1
,
//strides
1
,
...
...
@@ -2777,7 +2777,7 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo
}
//copy from other into self
int
CudaNdarray_CopyFromCudaNdarray
(
CudaNdarray
*
self
,
CudaNdarray
*
other
,
bool
unbroadcast
)
int
CudaNdarray_CopyFromCudaNdarray
(
CudaNdarray
*
self
,
const
CudaNdarray
*
other
,
bool
unbroadcast
)
{
int
verbose
=
0
;
if
(
verbose
>
1
)
fprintf
(
stderr
,
"CudaNdarray_CopyFromCudaNdarray
\n
"
);
...
...
@@ -2856,7 +2856,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
// call worker routine
unsigned
int
n_blocks
=
std
::
min
(
size
,
(
unsigned
int
)
NUM_VECTOR_OP_BLOCKS
);
unsigned
int
threads_per_block
=
std
::
min
(
ceil_intdiv
(
size
,
n_blocks
),
(
unsigned
int
)
NUM_VECTOR_OP_THREADS_PER_BLOCK
);
CudaNdarray
*
cuda_dims
=
other
;
const
CudaNdarray
*
cuda_dims
=
other
;
if
(
unbroadcast
)
cuda_dims
=
self
;
//copy from other into self
...
...
@@ -2882,9 +2882,21 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
int
CudaNdarray_gemm
(
float
alpha
,
const
CudaNdarray
*
A
,
const
CudaNdarray
*
B
,
float
beta
,
CudaNdarray
*
C
)
{
if
(
A
->
nd
!=
2
)
{
PyErr_SetString
(
PyExc_ValueError
,
"non-matrix arg to gemm"
);
return
-
1
;
}
if
(
B
->
nd
!=
2
)
{
PyErr_SetString
(
PyExc_ValueError
,
"non-matrix arg to gemm"
);
return
-
1
;
}
if
(
C
->
nd
!=
2
)
{
PyErr_SetString
(
PyExc_ValueError
,
"non-matrix arg to gemm"
);
return
-
1
;
}
if
(
A
->
nd
!=
2
)
{
PyErr_SetString
(
PyExc_ValueError
,
"non-matrix arg A to gemm"
);
return
-
1
;
}
if
(
B
->
nd
!=
2
)
{
PyErr_SetString
(
PyExc_ValueError
,
"non-matrix arg B to gemm"
);
return
-
1
;
}
if
(
C
->
nd
!=
2
)
{
PyErr_SetString
(
PyExc_ValueError
,
"non-matrix arg C to gemm"
);
return
-
1
;
}
// We must allow dimensions to be zeros.
if
((
CudaNdarray_HOST_DIMS
(
A
)[
1
]
!=
CudaNdarray_HOST_DIMS
(
B
)[
0
])
...
...
@@ -2901,14 +2913,72 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return
-
1
;
}
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
// TODO: make a copy instead of returning in error
if
(((
CudaNdarray_HOST_DIMS
(
A
)[
0
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
!=
1
))
&&
((
CudaNdarray_HOST_DIMS
(
A
)[
1
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
!=
1
)))
{
PyErr_SetString
(
PyExc_NotImplementedError
,
"non-unit stride in gemm arg"
);
return
-
1
;
}
if
(((
CudaNdarray_HOST_DIMS
(
B
)[
0
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
B
)[
0
]
!=
1
))
&&
((
CudaNdarray_HOST_DIMS
(
B
)[
1
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
B
)[
1
]
!=
1
)))
{
PyErr_SetString
(
PyExc_NotImplementedError
,
"non-unit stride in gemm arg"
);
return
-
1
;
}
if
(((
CudaNdarray_HOST_DIMS
(
C
)[
0
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
C
)[
0
]
!=
1
))
&&
((
CudaNdarray_HOST_DIMS
(
C
)[
1
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
C
)[
1
]
!=
1
)))
{
PyErr_SetString
(
PyExc_NotImplementedError
,
"non-unit stride in gemm arg"
);
return
-
1
;
}
// If matrix A or B has non-unit size and non-unit stride in both
// dimensions, we can make a copy.
if
(((
CudaNdarray_HOST_DIMS
(
A
)[
0
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
!=
1
)
&&
(
CudaNdarray_HOST_DIMS
(
A
)[
1
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
!=
1
))
||
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
<
0
))
{
const
CudaNdarray
*
A_new
=
(
CudaNdarray
*
)
CudaNdarray_Copy
(
A
);
if
(
!
A_new
)
return
-
1
;
A
=
A_new
;
}
else
{
// In the case above, we will need to decref A_new at the end.
// To make things simpler, we incref A here, so we can always
// decref A.
Py_INCREF
(
A
);
}
if
(((
CudaNdarray_HOST_DIMS
(
B
)[
0
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
B
)[
0
]
!=
1
)
&&
(
CudaNdarray_HOST_DIMS
(
B
)[
1
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
B
)[
1
]
!=
1
))
||
(
CudaNdarray_HOST_STRIDES
(
B
)[
0
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
B
)[
1
]
<
0
))
{
const
CudaNdarray
*
B_new
=
(
CudaNdarray
*
)
CudaNdarray_Copy
(
B
);
if
(
!
B_new
)
{
Py_XDECREF
(
A
);
return
-
1
;
}
B
=
B_new
;
}
else
{
// In the case above, we will need to decref B_new at the end.
// To make things simpler, we incref B here, so we can always
// decref B.
Py_INCREF
(
B
);
}
// If matrix C has non-unit size and non-unit stride in both
// dimensions, or negative strides, we can't operate. We cannot copy
// C either, because the calling code will expect the result to be
// in the original C container.
if
(((
CudaNdarray_HOST_DIMS
(
C
)[
0
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
C
)[
0
]
!=
1
)
&&
(
CudaNdarray_HOST_DIMS
(
C
)[
1
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
C
)[
1
]
!=
1
))
||
(
CudaNdarray_HOST_STRIDES
(
C
)[
0
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
C
)[
1
]
<
0
))
{
PyErr_Format
(
PyExc_AssertionError
,
"non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)"
,
CudaNdarray_HOST_STRIDES
(
C
)[
0
],
CudaNdarray_HOST_STRIDES
(
C
)[
1
],
CudaNdarray_HOST_DIMS
(
C
)[
0
],
CudaNdarray_HOST_DIMS
(
C
)[
1
]);
Py_XDECREF
(
A
);
Py_XDECREF
(
B
);
return
-
1
;
}
// the unit integer is divided logically into three fields of 4 bits
// the lowermost 4 bits encode the stride pattern of the output
...
...
@@ -2920,47 +2990,31 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
// a stride of 0 implies a dimension of 1 - so we can actually define
// a stride of 0 as a 'unit' stride because gemm will never use it.
// If a dimension is 0, its stride will not be used either, so we can
// consider it a 'unit' stride too.
int
unit
=
0
;
if
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
==
1
||
CudaNdarray_HOST_
STRIDES
(
A
)[
1
]
==
0
)
{
if
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
==
1
||
CudaNdarray_HOST_
DIMS
(
A
)[
1
]
<=
1
)
{
unit
|=
(
0x0
<<
8
);
}
else
if
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
==
1
||
CudaNdarray_HOST_
STRIDES
(
A
)[
0
]
==
0
)
{
}
else
if
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
==
1
||
CudaNdarray_HOST_
DIMS
(
A
)[
0
]
<=
1
)
{
unit
|=
(
0x1
<<
8
);
}
else
{
unit
|=
(
0x2
<<
8
);
}
if
(
CudaNdarray_HOST_STRIDES
(
B
)[
1
]
==
1
||
CudaNdarray_HOST_
STRIDES
(
B
)[
1
]
==
0
)
{
if
(
CudaNdarray_HOST_STRIDES
(
B
)[
1
]
==
1
||
CudaNdarray_HOST_
DIMS
(
B
)[
1
]
<=
1
)
{
unit
|=
(
0x0
<<
4
);
}
else
if
(
CudaNdarray_HOST_STRIDES
(
B
)[
0
]
==
1
||
CudaNdarray_HOST_
STRIDES
(
B
)[
0
]
==
0
)
{
}
else
if
(
CudaNdarray_HOST_STRIDES
(
B
)[
0
]
==
1
||
CudaNdarray_HOST_
DIMS
(
B
)[
0
]
<=
1
)
{
unit
|=
(
0x1
<<
4
);
}
else
{
unit
|=
(
0x2
<<
4
);
}
if
(
CudaNdarray_HOST_STRIDES
(
C
)[
1
]
==
1
||
CudaNdarray_HOST_
STRIDES
(
C
)[
1
]
==
0
)
{
if
(
CudaNdarray_HOST_STRIDES
(
C
)[
1
]
==
1
||
CudaNdarray_HOST_
DIMS
(
C
)[
1
]
<=
1
)
{
unit
|=
(
0x0
<<
0
);
}
else
if
(
CudaNdarray_HOST_STRIDES
(
C
)[
0
]
==
1
||
CudaNdarray_HOST_
STRIDES
(
C
)[
0
]
==
0
)
{
}
else
if
(
CudaNdarray_HOST_STRIDES
(
C
)[
0
]
==
1
||
CudaNdarray_HOST_
DIMS
(
C
)[
0
]
<=
1
)
{
unit
|=
(
0x1
<<
0
);
}
else
{
unit
|=
(
0x2
<<
0
);
}
// I don't know if cudablas handles negative strides
if
(
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
B
)[
0
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
B
)[
1
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
C
)[
0
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
C
)[
1
]
<
0
))
{
PyErr_Format
(
PyExc_ValueError
,
"illegal strides in args to gemm (%i,%i)x(%i,%i)->(%i,%i)"
,
CudaNdarray_HOST_STRIDES
(
A
)[
0
],
CudaNdarray_HOST_STRIDES
(
A
)[
1
],
CudaNdarray_HOST_STRIDES
(
B
)[
0
],
CudaNdarray_HOST_STRIDES
(
B
)[
1
],
CudaNdarray_HOST_STRIDES
(
C
)[
0
],
CudaNdarray_HOST_STRIDES
(
C
)[
1
]);
return
-
1
;
}
/* create appropriate strides for malformed matrices that are row or column
* vectors
*/
...
...
@@ -2977,8 +3031,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
char
N
=
'N'
;
char
T
=
'T'
;
//std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
//TODO: recognize the negative stride and make a copy of the offending argument,
//rather than aborting
// There should be no negative stride at that point
#define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
if (sx == 0){sx = 1;}\
if (sy == 0){sy = 1;}\
...
...
@@ -2986,7 +3039,9 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
if ((sx > 0) && (sy > 0) && (sz > 0)) { \
cublasSgemm(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \
} else { \
PyErr_SetString(PyExc_NotImplementedError, "negative stride to sGemm");\
PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
Py_XDECREF(A);\
Py_XDECREF(B);\
return -1; \
}
...
...
@@ -3000,14 +3055,19 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
case
0x101
:
CHK_STRIDE_SGEMM
(
N
,
T
,
CudaNdarray_HOST_DIMS
(
C
)[
0
],
CudaNdarray_HOST_DIMS
(
C
)[
1
],
CudaNdarray_HOST_DIMS
(
A
)[
1
],
alpha
,
a
,
sa_1
,
b
,
sb_0
,
beta
,
c
,
sc_1
);
break
;
case
0x011
:
CHK_STRIDE_SGEMM
(
T
,
N
,
CudaNdarray_HOST_DIMS
(
C
)[
0
],
CudaNdarray_HOST_DIMS
(
C
)[
1
],
CudaNdarray_HOST_DIMS
(
A
)[
1
],
alpha
,
a
,
sa_0
,
b
,
sb_1
,
beta
,
c
,
sc_1
);
break
;
case
0x111
:
CHK_STRIDE_SGEMM
(
N
,
N
,
CudaNdarray_HOST_DIMS
(
C
)[
0
],
CudaNdarray_HOST_DIMS
(
C
)[
1
],
CudaNdarray_HOST_DIMS
(
A
)[
1
],
alpha
,
a
,
sa_1
,
b
,
sb_1
,
beta
,
c
,
sc_1
);
break
;
default:
PyErr_Format
(
PyExc_ValueError
,
"some matrix has no unit stride (unit=%
i
)"
,
unit
);
default:
PyErr_Format
(
PyExc_ValueError
,
"some matrix has no unit stride (unit=%
x
)"
,
unit
);
return
-
1
;
};
CNDA_THREAD_SYNC
;
cudaError_t
err
=
cudaGetLastError
();
Py_XDECREF
(
A
);
Py_XDECREF
(
B
);
cublasStatus
err
=
cublasGetError
();
if
(
CUBLAS_STATUS_SUCCESS
!=
err
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"cublassGemm failed (%s)"
,
cudaGetErrorString
(
err
));
PyErr_Format
(
PyExc_RuntimeError
,
"cublasSgemm failed (%i)"
,
err
);
return
-
1
;
}
return
0
;
...
...
@@ -3037,22 +3097,53 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return
-
1
;
}
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
// TODO: make a copy instead of returning in error
if
(((
CudaNdarray_HOST_DIMS
(
A
)[
0
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
!=
1
))
&&
((
CudaNdarray_HOST_DIMS
(
A
)[
1
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
!=
1
)))
{
PyErr_SetString
(
PyExc_NotImplementedError
,
"non-unit stride in gemv arg"
);
return
-
1
;
}
// If matrix A has non-unit size and non-unit stride in both
// dimensions, or negative strides, we cannot operate, but we can
// make a copy.
if
(((
CudaNdarray_HOST_DIMS
(
A
)[
0
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
!=
1
)
&&
(
CudaNdarray_HOST_DIMS
(
A
)[
1
]
>
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
!=
1
))
||
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
<
0
))
{
const
CudaNdarray
*
A_new
=
(
CudaNdarray
*
)
CudaNdarray_Copy
(
A
);
if
(
!
A_new
)
return
-
1
;
A
=
A_new
;
}
else
{
// Incref A, so we can decref it at the end in all cases
Py_INCREF
(
A
);
}
// If vector B as a negative stride, we also have to make a copy.
if
(
CudaNdarray_HOST_STRIDES
(
B
)[
0
]
<
0
)
{
const
CudaNdarray
*
B_new
=
(
CudaNdarray
*
)
CudaNdarray_Copy
(
B
);
if
(
!
B_new
)
{
Py_XDECREF
(
A
);
return
-
1
;
}
B
=
B_new
;
}
else
{
// Incref B, so we can decref it at the end in all cases
Py_INCREF
(
B
);
}
//
I don't know if cudablas handles negative strides
//
cudablas does not handle negative strides as expected
if
(
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
B
)[
0
]
<
0
)
||
(
CudaNdarray_HOST_STRIDES
(
C
)[
0
]
<
0
))
||
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
<
0
))
{
PyErr_Format
(
PyExc_ValueError
,
"illegal strides in args to gemv (%i,%i)
x(%i)->(%i)
"
,
PyErr_Format
(
PyExc_ValueError
,
"illegal strides in args to gemv (%i,%i)"
,
CudaNdarray_HOST_STRIDES
(
A
)[
0
],
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
B
)[
0
],
CudaNdarray_HOST_STRIDES
(
C
)[
0
]
);
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
);
Py_XDECREF
(
A
);
Py_XDECREF
(
B
);
return
-
1
;
}
...
...
@@ -3064,42 +3155,71 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
int
sb_0
=
(
CudaNdarray_HOST_DIMS
(
B
)[
0
]
>
1
)
?
CudaNdarray_HOST_STRIDES
(
B
)[
0
]
:
1
;
int
sc_0
=
(
CudaNdarray_HOST_DIMS
(
C
)[
0
]
>
1
)
?
CudaNdarray_HOST_STRIDES
(
C
)[
0
]
:
1
;
if
(
sa_0
==
1
)
{
cublasSgemv
(
'N'
,
CudaNdarray_HOST_DIMS
(
A
)[
0
],
CudaNdarray_HOST_DIMS
(
A
)[
1
],
alpha
,
CudaNdarray_DEV_DATA
(
A
),
sa_1
,
CudaNdarray_DEV_DATA
(
B
),
sb_0
,
beta
,
CudaNdarray_DEV_DATA
(
C
),
sc_0
);
}
else
if
(
sa_1
==
1
)
{
cublasSgemv
(
'T'
,
CudaNdarray_HOST_DIMS
(
A
)[
1
],
CudaNdarray_HOST_DIMS
(
A
)[
0
],
alpha
,
CudaNdarray_DEV_DATA
(
A
),
sa_0
,
CudaNdarray_DEV_DATA
(
B
),
sb_0
,
beta
,
CudaNdarray_DEV_DATA
(
C
),
sc_0
);
}
else
{
PyErr_SetString
(
PyExc_NotImplementedError
,
"too many strides strides in sgemv"
);
return
-
1
;
if
(
sa_0
==
0
)
sa_0
=
1
;
if
(
sa_1
==
0
)
sa_1
=
1
;
if
(
CudaNdarray_SIZE
(
C
))
{
if
((
CudaNdarray_HOST_DIMS
(
A
)[
0
]
<=
1
)
||
((
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
==
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
>
0
)))
{
cublasSgemv
(
'N'
,
CudaNdarray_HOST_DIMS
(
A
)[
0
],
CudaNdarray_HOST_DIMS
(
A
)[
1
],
alpha
,
CudaNdarray_DEV_DATA
(
A
),
sa_1
,
CudaNdarray_DEV_DATA
(
B
),
sb_0
,
beta
,
CudaNdarray_DEV_DATA
(
C
),
sc_0
);
}
else
if
((
CudaNdarray_HOST_DIMS
(
A
)[
1
]
<=
1
)
||
((
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
==
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
>
0
)))
{
cublasSgemv
(
'T'
,
CudaNdarray_HOST_DIMS
(
A
)[
1
],
CudaNdarray_HOST_DIMS
(
A
)[
0
],
alpha
,
CudaNdarray_DEV_DATA
(
A
),
sa_0
,
CudaNdarray_DEV_DATA
(
B
),
sb_0
,
beta
,
CudaNdarray_DEV_DATA
(
C
),
sc_0
);
}
else
{
PyErr_Format
(
PyExc_AssertionError
,
"Unexpected stride pattern in gemv: (%i, %i) x %i -> %i.
\n
"
"Shapes are: (%i, %i) x %i -> %i
\n
"
,
CudaNdarray_HOST_STRIDES
(
A
)[
0
],
CudaNdarray_HOST_STRIDES
(
A
)[
1
],
CudaNdarray_HOST_STRIDES
(
B
)[
0
],
CudaNdarray_HOST_STRIDES
(
C
)[
0
],
CudaNdarray_HOST_DIMS
(
A
)[
0
],
CudaNdarray_HOST_DIMS
(
A
)[
1
],
CudaNdarray_HOST_DIMS
(
B
)[
0
],
CudaNdarray_HOST_DIMS
(
C
)[
0
]);
Py_XDECREF
(
A
);
Py_XDECREF
(
B
);
return
-
1
;
}
}
CNDA_THREAD_SYNC
;
cudaError_t
err
=
cudaGetLastError
();
Py_XDECREF
(
A
);
Py_XDECREF
(
B
);
cublasStatus
err
=
cublasGetError
();
if
(
CUBLAS_STATUS_SUCCESS
!=
err
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"cublassGemv failed (%s)"
,
cudaGetErrorString
(
err
));
PyErr_Format
(
PyExc_RuntimeError
,
"cublasSgemv failed (%i)"
,
err
);
return
-
1
;
}
return
0
;
}
int
CudaNdarray_sger
(
float
alpha
,
CudaNdarray
*
x
,
CudaNdarray
*
y
,
CudaNdarray
*
A
)
{
int
CudaNdarray_sger
(
float
alpha
,
const
CudaNdarray
*
x
,
const
CudaNdarray
*
y
,
CudaNdarray
*
A
)
{
if
(
x
->
nd
!=
1
)
{
PyErr_SetString
(
PyExc_ValueError
,
"non-vector arg x to sger"
);
return
-
1
;
}
if
(
y
->
nd
!=
1
)
{
PyErr_SetString
(
PyExc_ValueError
,
"non-vector arg y to sger"
);
return
-
1
;
}
if
(
A
->
nd
!=
2
)
{
PyErr_SetString
(
PyExc_ValueError
,
"non-matrix arg A to sger"
);
return
-
1
;
}
...
...
@@ -3115,14 +3235,8 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
return
-
1
;
}
// Maybe this could work, but be safe for now
if
(
!
CudaNdarray_is_c_contiguous
(
A
))
{
PyErr_SetString
(
PyExc_NotImplementedError
,
"non-c continugous A in sger"
);
return
-
1
;
}
// Since Sger expects A in col-major, we invert x and y to fake this.
int
x_strides
=
CudaNdarray_HOST_STRIDES
(
x
)[
0
];
CudaNdarray
*
x_
=
x
;
const
CudaNdarray
*
x_
=
x
;
if
(
x_strides
==
0
){
if
(
CudaNdarray_HOST_DIMS
(
x
)[
0
]
!=
1
){
PyErr_Format
(
PyExc_RuntimeError
,
...
...
@@ -3131,14 +3245,14 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
" that have more then 1 elements!"
);
return
-
1
;
}
x_strides
=
4
;
x_strides
=
1
;
}
else
if
(
x_strides
<
0
){
x_
=
(
CudaNdarray
*
)
CudaNdarray_Copy
(
x
);
x_strides
=
CudaNdarray_HOST_STRIDES
(
x_
)[
0
];
}
int
y_strides
=
CudaNdarray_HOST_STRIDES
(
y
)[
0
];
CudaNdarray
*
y_
=
y
;
const
CudaNdarray
*
y_
=
y
;
if
(
y_strides
==
0
){
if
(
CudaNdarray_HOST_DIMS
(
y
)[
0
]
!=
1
){
PyErr_Format
(
PyExc_RuntimeError
,
...
...
@@ -3147,17 +3261,46 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
" that have more then 1 elements!"
);
return
-
1
;
}
y_strides
=
4
;
y_strides
=
1
;
}
else
if
(
y_strides
<
0
){
y_
=
(
CudaNdarray
*
)
CudaNdarray_Copy
(
y
);
y_strides
=
CudaNdarray_HOST_STRIDES
(
y_
)[
0
];
}
// Create appropriate strides if A is a row or column vector
int
sa_0
=
(
CudaNdarray_HOST_DIMS
(
A
)[
0
]
>
1
)
?
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
:
CudaNdarray_HOST_DIMS
(
A
)[
1
];
int
sa_1
=
(
CudaNdarray_HOST_DIMS
(
A
)[
1
]
>
1
)
?
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
:
CudaNdarray_HOST_DIMS
(
A
)[
0
];
if
(
CudaNdarray_SIZE
(
A
)){
cublasSger
(
CudaNdarray_HOST_DIMS
(
y
)[
0
],
CudaNdarray_HOST_DIMS
(
x
)[
0
],
alpha
,
CudaNdarray_DEV_DATA
(
y_
),
y_strides
,
CudaNdarray_DEV_DATA
(
x_
),
x_strides
,
CudaNdarray_DEV_DATA
(
A
),
CudaNdarray_HOST_DIMS
(
A
)[
1
]);
// If A is in col-major
if
((
CudaNdarray_HOST_DIMS
(
A
)[
0
]
<=
1
)
||
((
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
==
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
>
0
)))
{
cublasSger
(
CudaNdarray_HOST_DIMS
(
x
)[
0
],
CudaNdarray_HOST_DIMS
(
y
)[
0
],
alpha
,
CudaNdarray_DEV_DATA
(
x_
),
x_strides
,
CudaNdarray_DEV_DATA
(
y_
),
y_strides
,
CudaNdarray_DEV_DATA
(
A
),
sa_1
);
}
// Since Sger expects A in col-major, we invert x and y to fake this.
else
if
((
CudaNdarray_HOST_DIMS
(
A
)[
1
]
<=
1
)
||
((
CudaNdarray_HOST_STRIDES
(
A
)[
1
]
==
1
)
&&
(
CudaNdarray_HOST_STRIDES
(
A
)[
0
]
>
0
)))
{
cublasSger
(
CudaNdarray_HOST_DIMS
(
y
)[
0
],
CudaNdarray_HOST_DIMS
(
x
)[
0
],
alpha
,
CudaNdarray_DEV_DATA
(
y_
),
y_strides
,
CudaNdarray_DEV_DATA
(
x_
),
x_strides
,
CudaNdarray_DEV_DATA
(
A
),
sa_0
);
}
// A has to be either c- or f-contiguous, with no negative strides
else
{
PyErr_SetString
(
PyExc_NotImplementedError
,
"non-contiguous A, or negative strides, in sger"
);
return
-
1
;
}
}
CNDA_THREAD_SYNC
;
if
(
x_
!=
x
)
...
...
@@ -3165,12 +3308,15 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
if
(
y_
!=
y
)
Py_DECREF
(
y_
);
cu
daError_t
err
=
cudaGetLas
tError
();
cu
blasStatus
err
=
cublasGe
tError
();
if
(
CUBLAS_STATUS_SUCCESS
!=
err
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"cublasSger failed (%s)"
,
cudaGetErrorString
(
err
));
PyErr_Format
(
PyExc_RuntimeError
,
"cublasSger failed (%i)"
,
err
);
return
-
1
;
}
return
0
;
}
...
...
@@ -3816,7 +3962,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
int
cnda_copy_structure_to_device
(
CudaNdarray
*
self
)
cnda_copy_structure_to_device
(
const
CudaNdarray
*
self
)
{
cublasSetVector
(
cnda_structure_size
(
self
->
nd
),
sizeof
(
int
),
self
->
host_structure
,
1
,
self
->
dev_structure
,
1
);
CNDA_THREAD_SYNC
;
...
...
@@ -3830,7 +3976,7 @@ cnda_copy_structure_to_device(CudaNdarray * self)
}
const
int
*
CudaNdarray_DEV_DIMS
(
CudaNdarray
*
self
)
CudaNdarray_DEV_DIMS
(
const
CudaNdarray
*
self
)
{
if
(
!
self
->
dev_structure_fresh
)
{
...
...
@@ -3840,7 +3986,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self)
return
self
->
dev_structure
;
}
const
int
*
CudaNdarray_DEV_STRIDES
(
CudaNdarray
*
self
)
CudaNdarray_DEV_STRIDES
(
const
CudaNdarray
*
self
)
{
if
(
!
self
->
dev_structure_fresh
)
{
...
...
@@ -3850,7 +3996,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self)
return
self
->
dev_structure
+
self
->
nd
;
}
const
int
*
CudaNdarray_DEV_LOG2DIMS
(
CudaNdarray
*
self
)
CudaNdarray_DEV_LOG2DIMS
(
const
CudaNdarray
*
self
)
{
if
(
!
self
->
dev_structure_fresh
)
{
...
...
@@ -3991,4 +4137,4 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:
encoding=utf-8:
textwidth=79 :
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
theano/sandbox/cuda/cuda_ndarray.cuh
浏览文件 @
1f6b0c73
...
...
@@ -81,7 +81,7 @@ struct CudaNdarray
//device pointers (allocated by cudaMalloc)
int
dev_structure_fresh
;
mutable
int
dev_structure_fresh
;
//dev_structure should be accessed via macros, otherwise may not be synchronized
int
*
dev_structure
;
//dim0, dim1, ..., stride0, stride1, ...
real
*
devdata
;
//pointer to data element [0,..,0].
...
...
@@ -154,11 +154,11 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
*
* This means: recalculate the log2dims and transfer structure to the card
*/
DllExport
int
cnda_copy_structure_to_device
(
CudaNdarray
*
self
);
DllExport
int
cnda_copy_structure_to_device
(
const
CudaNdarray
*
self
);
DllExport
const
int
*
CudaNdarray_DEV_DIMS
(
CudaNdarray
*
self
);
DllExport
const
int
*
CudaNdarray_DEV_STRIDES
(
CudaNdarray
*
self
);
DllExport
const
int
*
CudaNdarray_DEV_LOG2DIMS
(
CudaNdarray
*
self
);
DllExport
const
int
*
CudaNdarray_DEV_DIMS
(
const
CudaNdarray
*
self
);
DllExport
const
int
*
CudaNdarray_DEV_STRIDES
(
const
CudaNdarray
*
self
);
DllExport
const
int
*
CudaNdarray_DEV_LOG2DIMS
(
const
CudaNdarray
*
self
);
DllExport
float
*
CudaNdarray_DEV_DATA
(
const
CudaNdarray
*
self
);
/**
...
...
@@ -229,13 +229,22 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
return
-
1
;
}
assert
(
size
>
0
);
if
(
size
<
0
)
{
PyErr_Format
(
PyExc_AssertionError
,
"size (%i) < 0"
,
size
);
return
-
1
;
}
self
->
devdata
=
(
float
*
)
device_malloc
(
size
*
sizeof
(
real
));
if
(
!
self
->
devdata
)
if
(
size
&&
!
self
->
devdata
)
{
CudaNdarray_set_nd
(
self
,
-
1
);
CudaNdarray_set_nd
(
self
,
-
1
);
self
->
data_allocated
=
0
;
self
->
devdata
=
0
;
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate memory on device"
);
return
-
1
;
}
if
(
0
)
...
...
@@ -283,7 +292,7 @@ DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
/**
* Return an independent copy of self
*/
DllExport
PyObject
*
CudaNdarray_Copy
(
CudaNdarray
*
self
);
DllExport
PyObject
*
CudaNdarray_Copy
(
const
CudaNdarray
*
self
);
/**
* Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
...
...
@@ -302,7 +311,7 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
*
* self is reallocated to have the correct dimensions if necessary.
*/
DllExport
int
CudaNdarray_CopyFromCudaNdarray
(
CudaNdarray
*
self
,
CudaNdarray
*
other
,
bool
unbroadcast
=
false
);
DllExport
int
CudaNdarray_CopyFromCudaNdarray
(
CudaNdarray
*
self
,
const
CudaNdarray
*
other
,
bool
unbroadcast
=
false
);
/**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
...
...
@@ -321,7 +330,7 @@ DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport
int
CudaNdarray_gemm
(
float
alpha
,
const
CudaNdarray
*
A
,
const
CudaNdarray
*
B
,
float
beta
,
CudaNdarray
*
C
);
DllExport
int
CudaNdarray_sgemv
(
float
alpha
,
const
CudaNdarray
*
A
,
const
CudaNdarray
*
B
,
float
beta
,
CudaNdarray
*
C
);
DllExport
int
CudaNdarray_sger
(
float
alpha
,
CudaNdarray
*
x
,
CudaNdarray
*
y
,
CudaNdarray
*
A
);
DllExport
int
CudaNdarray_sger
(
float
alpha
,
const
CudaNdarray
*
x
,
const
CudaNdarray
*
y
,
CudaNdarray
*
A
);
DllExport
int
CudaNdarray_reduce_sum
(
CudaNdarray
*
self
,
CudaNdarray
*
A
);
DllExport
int
CudaNdarray_reduce_prod
(
CudaNdarray
*
self
,
CudaNdarray
*
A
);
...
...
@@ -343,4 +352,4 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:
encoding=utf-8:
textwidth=79 :
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
theano/sandbox/cuda/tests/test_blas.py
浏览文件 @
1f6b0c73
import
itertools
from
unittest
import
TestCase
from
theano.compile.pfunc
import
pfunc
...
...
@@ -17,7 +18,7 @@ import theano.sandbox.cuda as tcn
from
theano.tensor.signal.downsample
import
DownsampleFactorMax
,
DownsampleFactorMaxGrad
import
theano.compile.mode
from
theano.tensor.tests.test_blas
import
BaseGemv
,
TestGer
from
theano.tensor.tests.test_blas
import
BaseGemv
,
Test
BlasStrides
,
Test
Ger
from
theano.sandbox.cuda.blas
import
gpu_gemv_no_inplace
,
gpu_gemv_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_inplace
,
gpu_ger_no_inplace
...
...
@@ -32,20 +33,31 @@ else:
def
my_rand
(
*
shape
):
return
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
def
transpose
(
cuda_mat
):
# The easiest way to transpose a cuda matrix for now
return
tcn
.
dimshuffle
(
cuda_mat
,
[
1
,
0
])
def
test_dot22
():
def
cmp
(
a_shp
,
b_shp
):
a
=
tcn
.
shared_constructor
(
my_rand
(
*
a_shp
),
'a'
)
a0
=
my_rand
(
*
a_shp
)
a
=
tcn
.
shared_constructor
(
a0
,
'a'
)
b
=
tensor
.
fmatrix
()
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
tensor
.
dot
(
a
,
b
))],
mode
=
mode_with_gpu
)
a0
=
a
.
get_value
()
*
1.0
bval
=
my_rand
(
*
b_shp
)
f
(
bval
)
assert
numpy
.
allclose
(
numpy
.
dot
(
a0
,
bval
),
a
.
get_value
())
# Try with a matrix equal to a0, but with strides in both dims
a
.
set_value
(
a0
)
a
.
set_value
(
a
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
f
(
bval
)
cmp
((
3
,
4
),(
4
,
5
))
cmp
((
0
,
4
),(
4
,
5
))
cmp
((
3
,
4
),(
4
,
0
))
...
...
@@ -90,7 +102,8 @@ def test_dot22scalar():
def
test_gemm
():
def
cmp
(
a_shp
,
b_shp
):
a
=
tcn
.
shared_constructor
(
my_rand
(
*
a_shp
),
'a'
)
a0
=
my_rand
(
*
a_shp
)
a
=
tcn
.
shared_constructor
(
a0
,
'a'
)
b
=
tensor
.
fmatrix
(
'b'
)
c
=
tensor
.
fmatrix
(
'c'
)
...
...
@@ -98,12 +111,19 @@ def test_gemm():
f
=
pfunc
([
b
,
c
],
[],
updates
=
[(
a
,
tensor
.
dot
(
a
,
b
)
+
tensor
.
exp
(
c
))],
mode
=
mode_with_gpu
)
assert
any
([
node
.
op
==
tcn
.
blas
.
gpu_gemm_inplace
for
node
in
f
.
maker
.
env
.
toposort
()])
a0
=
a
.
get_value
()
*
1.0
bval
=
my_rand
(
*
b_shp
)
cval
=
my_rand
(
a_shp
[
0
],
b_shp
[
1
])
f
(
bval
,
cval
)
assert
numpy
.
allclose
(
numpy
.
dot
(
a0
,
bval
)
+
numpy
.
exp
(
cval
),
a
.
get_value
())
# Try with a matrix equal to a0, but with strides in both dims
a
.
set_value
(
a0
)
a
.
set_value
(
a
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
f
(
bval
,
cval
)
cmp
((
3
,
4
),(
4
,
5
))
cmp
((
0
,
4
),(
4
,
5
))
cmp
((
3
,
4
),(
4
,
0
))
...
...
@@ -114,7 +134,8 @@ def test_gemm():
def
test_gemm_no_inplace
():
def
cmp
(
a_shp
,
b_shp
):
a
=
tcn
.
shared_constructor
(
my_rand
(
*
a_shp
),
'a'
)
a0
=
my_rand
(
*
a_shp
)
a
=
tcn
.
shared_constructor
(
a0
,
'a'
)
cval
=
my_rand
(
a_shp
[
0
],
b_shp
[
1
])
c
=
tcn
.
shared_constructor
(
cval
.
copy
(),
'c'
)
...
...
@@ -123,7 +144,6 @@ def test_gemm_no_inplace():
f
=
pfunc
([
b
,
b2
],
[
tensor
.
dot
(
a
,
b2
)
+
c
],
updates
=
[(
a
,
tensor
.
dot
(
a
,
b
)
+
c
)],
mode
=
mode_with_gpu
)
a0
=
a
.
get_value
()
*
1.0
assert
any
([
node
.
op
==
tcn
.
blas
.
gpu_gemm_no_inplace
for
node
in
f
.
maker
.
env
.
toposort
()])
bval
=
my_rand
(
*
b_shp
)
bval2
=
my_rand
(
*
b_shp
)
...
...
@@ -132,6 +152,13 @@ def test_gemm_no_inplace():
assert
numpy
.
allclose
(
numpy
.
dot
(
a0
,
bval
)
+
cval
,
a
.
get_value
())
assert
numpy
.
allclose
(
numpy
.
dot
(
a0
,
bval2
)
+
cval
,
rval
)
# Try with a matrix equal to a0, but with strides in both dims
a
.
set_value
(
a0
)
a
.
set_value
(
a
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
f
(
bval
,
bval2
)
cmp
((
3
,
4
),(
4
,
5
))
cmp
((
0
,
4
),(
4
,
5
))
cmp
((
3
,
4
),(
4
,
0
))
...
...
@@ -139,6 +166,13 @@ def test_gemm_no_inplace():
cmp
((
0
,
4
),(
4
,
0
))
cmp
((
0
,
0
),(
0
,
0
))
class
TestBlasStridesGpu
(
TestBlasStrides
):
dtype
=
'float32'
shared
=
staticmethod
(
tcn
.
shared_constructor
)
mode
=
mode_with_gpu
def
test_outer
():
x
=
tcn
.
shared_constructor
(
my_rand
(
8
,),
'x'
)
y
=
tcn
.
shared_constructor
(
my_rand
(
6
,),
'y'
)
...
...
@@ -260,6 +294,23 @@ class TestGpuGemv(TestCase, BaseGemv,
gemv
=
gpu_gemv_inplace
gemv_inplace
=
gpu_gemv_inplace
class
TestGpuGemvNoTransfer
(
TestCase
,
BaseGemv
,
unittest_tools
.
TestOptimizationMixin
):
mode
=
mode_with_gpu
dtype
=
'float32'
# Mimic shared constructors registry
@staticmethod
def
shared
(
val
):
try
:
return
tcn
.
shared_constructor
(
val
)
except
TypeError
:
return
theano
.
shared
(
val
)
# In this test, inputs are not always transfered to GPU
gemv
=
gpu_gemv_no_inplace
gemv_inplace
=
gpu_gemv_inplace
class
TestVectorMatrixDot
(
TestCase
):
### Tolerance factor used in this tests
...
...
@@ -286,6 +337,14 @@ class TestVectorMatrixDot(TestCase):
assert
sum
([
node
.
op
is
gpu_gemv_inplace
for
node
in
gpu_f2
.
maker
.
env
.
toposort
()
])
==
1
# Check double-strided m
m
.
set_value
(
m
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
assert
numpy
.
allclose
(
no_gpu_f
(),
gpu_f
(),
atol
=
self
.
atol
)
assert
numpy
.
allclose
(
no_gpu_f
(),
gpu_f2
(),
atol
=
self
.
atol
)
def
test_dot_mv
(
self
):
''' Test matrix dot vector '''
v
=
theano
.
shared
(
numpy
.
array
(
numpy
.
random
.
rand
(
2
),
dtype
=
'float32'
))
...
...
@@ -365,6 +424,26 @@ class TestGpuGer(TestGer):
self
.
ger
=
gpu_ger_inplace
self
.
gemm
=
tcn
.
blas
.
gpu_gemm_inplace
class
TestGpuGerNoTransfer
(
TestGer
):
@staticmethod
def
shared
(
val
):
try
:
return
tcn
.
shared_constructor
(
val
)
except
TypeError
:
return
theano
.
shared
(
val
)
def
setUp
(
self
):
self
.
mode
=
mode_with_gpu
dtype
=
self
.
dtype
=
'float32'
# optimization isn't dtype-dependent
self
.
A
=
tensor
.
tensor
(
dtype
=
dtype
,
broadcastable
=
(
False
,
False
))
self
.
a
=
tensor
.
tensor
(
dtype
=
dtype
,
broadcastable
=
())
self
.
x
=
tensor
.
tensor
(
dtype
=
dtype
,
broadcastable
=
(
False
,))
self
.
y
=
tensor
.
tensor
(
dtype
=
dtype
,
broadcastable
=
(
False
,))
# data on the gpu make the op always inplace
self
.
ger
=
gpu_ger_inplace
self
.
ger_destructive
=
gpu_ger_inplace
self
.
gemm
=
tcn
.
blas
.
gpu_gemm_inplace
class
TestGpuGer_OpContract
(
TestCase
,
unittest_tools
.
T_OpContractMixin
):
def
setUp
(
self
):
...
...
theano/tensor/blas.py
浏览文件 @
1f6b0c73
...
...
@@ -496,7 +496,9 @@ class GemmRelated(Op):
if ((Sx[0] < 1) || (Sx[1] < 1) || (Sx[0] MOD type_size) || (Sx[1] MOD type_size)
|| ((Sx[0] != type_size) && (Sx[1] != type_size)))
{
PyArrayObject * _x_copy = PyArray_GETCONTIGUOUS(
%(_x)
s);
PyArrayObject * _x_copy = (PyArrayObject *) PyArray_Copy(
%(_x)
s);
if (!_x_copy)
%(fail)
s
Py_XDECREF(
%(_x)
s);
%(_x)
s = _x_copy;
Sx =
%(_x)
s->strides;
...
...
@@ -505,7 +507,9 @@ class GemmRelated(Op):
if ((Sy[0] < 1) || (Sy[1] < 1) || (Sy[0] MOD type_size) || (Sy[1] MOD type_size)
|| ((Sy[0] != type_size) && (Sy[1] != type_size)))
{
PyArrayObject * _y_copy = PyArray_GETCONTIGUOUS(
%(_y)
s);
PyArrayObject * _y_copy = (PyArrayObject *) PyArray_Copy(
%(_y)
s);
if (!_y_copy)
%(fail)
s
Py_XDECREF(
%(_y)
s);
%(_y)
s = _y_copy;
Sy =
%(_y)
s->strides;
...
...
@@ -514,7 +518,9 @@ class GemmRelated(Op):
if ((Sz[0] < 1) || (Sz[1] < 1) || (Sz[0] MOD type_size) || (Sz[1] MOD type_size)
|| ((Sz[0] != type_size) && (Sz[1] != type_size)))
{
PyArrayObject * _z_copy = PyArray_GETCONTIGUOUS(
%(_zout)
s);
PyArrayObject * _z_copy = (PyArrayObject *) PyArray_Copy(
%(_zout)
s);
if (!_z_copy)
%(fail)
s
Py_XDECREF(
%(_zout)
s);
%(_zout)
s = _z_copy;
Sz =
%(_zout)
s->strides;
...
...
@@ -649,7 +655,7 @@ class GemmRelated(Op):
self
.
end_switch_typenum
),
''
)
def
build_gemm_version
(
self
):
return
(
1
0
,)
return
(
1
2
,)
class
Gemm
(
GemmRelated
):
"""In-place version of matrix-matrix multiplication (with accumulation):
...
...
theano/tensor/blas_c.py
浏览文件 @
1f6b0c73
...
...
@@ -58,9 +58,10 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
// copy A if !self.destructive or A is fully strided
if (!
%(destructive)
s
|| (
%(A)
s->strides[0] < 0)
|| (
%(A)
s->strides[1] < 0)
|| ((
%(A)
s->strides[0] != elemsize)
&&
(
%(A)
s->strides[1] != elemsize)))
&& (
%(A)
s->strides[1] != elemsize)))
{
npy_intp dims[2];
dims[0] =
%(A)
s->dimensions[0];
...
...
@@ -68,7 +69,11 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
if ((NULL ==
%(Z)
s)
|| (
%(Z)
s->dimensions[0] !=
%(A)
s->dimensions[0])
|| (
%(Z)
s->dimensions[1] !=
%(A)
s->dimensions[1]))
|| (
%(Z)
s->dimensions[1] !=
%(A)
s->dimensions[1])
|| (
%(Z)
s->strides[0] < 0)
|| (
%(Z)
s->strides[1] < 0)
|| ((
%(Z)
s->strides[0] != elemsize)
&& (
%(Z)
s->strides[1] != elemsize)))
{
if (
%(Z)
s) Py_XDECREF(
%(Z)
s);
%(Z)
s = (PyArrayObject*)PyArray_SimpleNew(2, dims, PyArray_TYPE(
%(A)
s));
...
...
@@ -77,7 +82,11 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
%(fail)
s
}
}
assert (
%(Z)
s !=
%(A)
s);
if (
%(Z)
s ==
%(A)
s)
{
PyErr_SetString(PyExc_AssertionError, "
%(Z)
s !=
%(A)
s");
%(fail)
s
}
if (
%(Z)
s->descr->type_num == PyArray_FLOAT)
{
float * zoutdata = (float*)
%(Z)
s->data;
...
...
@@ -142,8 +151,16 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
int Sz0 = (Nz0 > 1) ? (
%(Z)
s->strides[0] / elemsize) : (Nz1 + 1);
int Sz1 = (Nz1 > 1) ? (
%(Z)
s->strides[1] / elemsize) : (Nz0 + 1);
if (1)
{
dtype_
%(x)
s* x_data = (dtype_
%(x)
s*)
%(x)
s->data;
dtype_
%(y)
s* y_data = (dtype_
%(y)
s*)
%(y)
s->data;
// gemv expects pointers to the beginning of memory arrays,
// but numpy provides provides a pointer to the first element,
// so when the stride is negative, we need to get the last one.
if (Sx < 0)
x_data += (Nz0 - 1) * Sx;
if (Sy < 0)
y_data += (Nz1 - 1) * Sy;
if (
%(Z)
s->strides[0] == elemsize)
{
if (
%(Z)
s->descr->type_num == PyArray_FLOAT)
...
...
@@ -151,19 +168,22 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
//fprintf(stderr, "A
\\
n");
float alpha = ((dtype_
%(a)
s*)
%(a)
s->data)[0];
sger_(&Nz0, &Nz1, &alpha,
(float*)
(
%(x)
s->data)
, &Sx,
(float*)
(
%(y)
s->data)
, &Sy,
(float*)
x_data
, &Sx,
(float*)
y_data
, &Sy,
(float*)(
%(Z)
s->data), &Sz1);
}
else if (
%(Z)
s->descr->type_num == PyArray_DOUBLE)
{
double alpha = ((dtype_
%(a)
s*)
%(a)
s->data)[0];
dger_(&Nz0, &Nz1, &alpha,
(double*)
(
%(x)
s->data)
, &Sx,
(double*)
(
%(y)
s->data)
, &Sy,
(double*)
x_data
, &Sx,
(double*)
y_data
, &Sy,
(double*)(
%(Z)
s->data), &Sz1);
}
else { assert(0); }
else {
PyErr_SetString(PyExc_NotImplementedError, "not float nor double");
%(fail)
s
}
}
else if (
%(Z)
s->strides[1] == elemsize)
{
...
...
@@ -174,21 +194,30 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
//fprintf(stderr, "alpha=
%%
f
\\
n", alpha);
//fprintf(stderr, "sx sy
%%
i
%%
i
\\
n", Sx, Sy);
sger_(&Nz1, &Nz0, &alpha,
(float*)
(
%(y)
s->data)
, &Sy,
(float*)
(
%(x)
s->data)
, &Sx,
(float*)
y_data
, &Sy,
(float*)
x_data
, &Sx,
(float*)(
%(Z)
s->data), &Sz0);
}
else if (
%(Z)
s->descr->type_num == PyArray_DOUBLE)
{
double alpha = ((dtype_
%(a)
s*)
%(a)
s->data)[0];
dger_(&Nz1, &Nz0, &alpha,
(double*)
(
%(y)
s->data)
, &Sy,
(double*)
(
%(x)
s->data)
, &Sx,
(double*)
y_data
, &Sy,
(double*)
x_data
, &Sx,
(double*)(
%(Z)
s->data), &Sz0);
}
else { assert(0); }
else
{
PyErr_SetString(PyExc_NotImplementedError, "not float nor double");
%(fail)
s
}
}
else { assert(0); }
else
{
PyErr_SetString(PyExc_AssertionError,
"A is a double-strided matrix, and should have been copied "
"into a memory-contiguous one.");
%(fail)
s
}
}
...
...
@@ -204,7 +233,7 @@ class CGer(BaseBLAS, Ger):
return
code
def
c_code_cache_version
(
self
):
return
(
4
,)
return
(
8
,)
@local_optimizer
([
ger
,
ger_destructive
])
...
...
@@ -282,7 +311,11 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
%(fail)
s
}
}
assert (
%(zz)
s !=
%(aa)
s);
if (
%(zz)
s ==
%(aa)
s)
{
PyErr_SetString(PyExc_AssertionError, "
%(zz)
s !=
%(aa)
s");
%(fail)
s
}
if (dbeta != 0)
{
if (
%(zz)
s->descr->type_num == PyArray_FLOAT)
...
...
@@ -330,13 +363,52 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
char NOTRANS = 'N';
int Nx0 =
%(xx)
s->dimensions[0];
int Nx1 =
%(xx)
s->dimensions[1];
int Sx0 =
%(xx)
s->strides[0] / elemsize;
int Sx1 =
%(xx)
s->strides[1] / elemsize;
/* This formula is needed in the case where xx is actually a row or
* column matrix, because BLAS sometimes insists that the strides:
* - are not smaller than the number of elements in the array
* - are not 0.
*/
int Sx0 = (Nx0 > 1) ? (
%(xx)
s->strides[0] / elemsize) : (Nx1 + 1);
int Sx1 = (Nx1 > 1) ? (
%(xx)
s->strides[1] / elemsize) : (Nx0 + 1);
int Sz =
%(zz)
s->strides[0] / elemsize;
int Sy =
%(yy)
s->strides[0] / elemsize;
dtype_
%(yy)
s* yy_data = (dtype_
%(yy)
s*)
%(yy)
s->data;
dtype_
%(zz)
s* zz_data = (dtype_
%(zz)
s*)
%(zz)
s->data;
// gemv expects pointers to the beginning of memory arrays,
// but numpy provides provides a pointer to the first element,
// so when the stride is negative, we need to get the last one.
if (Sy < 0)
yy_data += (Nx1 - 1) * Sy;
if (Sz < 0)
zz_data += (Nx0 - 1) * Sz;
if (Nx0 * Nx1)
{
// If xx is neither C- nor F-contiguous, we make a copy.
// TODO:
// - if one stride is equal to "- elemsize", we can still call
// gemv on reversed matrix and vectors
// - if the copy is too long, maybe call vector/vector dot on
// each row instead
if ((
%(xx)
s->strides[0] < 0)
|| (
%(xx)
s->strides[1] < 0)
|| ((
%(xx)
s->strides[0] != elemsize)
&& (
%(xx)
s->strides[1] != elemsize)))
{
npy_intp dims[2];
dims[0] = Nx0;
dims[1] = Nx1;
PyArrayObject * xx_copy = (PyArrayObject *) PyArray_Copy(
%(xx)
s);
if (!xx_copy)
%(fail)
s
Py_XDECREF(
%(xx)
s);
%(xx)
s = xx_copy;
Sx0 = (Nx0 > 1) ? (
%(xx)
s->strides[0] / elemsize) : (Nx1 + 1);
Sx1 = (Nx1 > 1) ? (
%(xx)
s->strides[1] / elemsize) : (Nx0 + 1);
}
if (
%(xx)
s->strides[0] == elemsize)
{
if (
%(xx)
s->descr->type_num == PyArray_FLOAT)
...
...
@@ -346,9 +418,9 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
sgemv_(&NOTRANS, &Nx0, &Nx1,
&alpha,
(float*)(
%(xx)
s->data), &Sx1,
(float*)
(
%(yy)
s->data)
, &Sy,
(float*)
yy_data
, &Sy,
&fbeta,
(float*)
(
%(zz)
s->data)
, &Sz);
(float*)
zz_data
, &Sz);
}
else if (
%(xx)
s->descr->type_num == PyArray_DOUBLE)
{
...
...
@@ -356,13 +428,14 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
dgemv_(&NOTRANS, &Nx0, &Nx1,
&alpha,
(double*)(
%(xx)
s->data), &Sx1,
(double*)
(
%(yy)
s->data)
, &Sy,
(double*)
yy_data
, &Sy,
&dbeta,
(double*)
(
%(zz)
s->data)
, &Sz);
(double*)
zz_data
, &Sz);
}
else
{
assert(0);
PyErr_SetString(PyExc_AssertionError, "neither float nor double dtype");
%(fail)
s
}
}
else if (
%(xx)
s->strides[1] == elemsize)
...
...
@@ -376,9 +449,9 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
sgemv_(&TRANS, &Nx1, &Nx0,
&alpha,
(float*)(
%(xx)
s->data), &Sx0,
(float*)
(
%(yy)
s->data)
, &Sy,
(float*)
yy_data
, &Sy,
&fbeta,
(float*)
(
%(zz)
s->data)
, &Sz);
(float*)
zz_data
, &Sz);
}
else if (
%(xx)
s->descr->type_num == PyArray_DOUBLE)
{
...
...
@@ -386,20 +459,22 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
dgemv_(&TRANS, &Nx1, &Nx0,
&alpha,
(double*)(
%(xx)
s->data), &Sx0,
(double*)
(
%(yy)
s->data)
, &Sy,
(double*)
yy_data
, &Sy,
&dbeta,
(double*)
(
%(zz)
s->data)
, &Sz);
(double*)
zz_data
, &Sz);
}
else
{
assert(0);
PyErr_SetString(PyExc_AssertionError, "neither float nor double dtype");
%(fail)
s
}
}
else
{
// if xx is strided in both directions, then just do the gemv with a
// pair of for loops.
assert (0);
PyErr_SetString(PyExc_AssertionError,
"xx is a double-strided matrix, and should have been copied "
"into a memory-contiguous one.");
%(fail)
s
}
}
else if (dbeta != 1.0)
...
...
@@ -429,7 +504,7 @@ class CGemv(BaseBLAS, Gemv):
return
code
def
c_code_cache_version
(
self
):
return
(
2
,)
return
(
9
,)
@local_optimizer
([
gemv_inplace
,
gemv_no_inplace
])
...
...
theano/tensor/blas_scipy.py
浏览文件 @
1f6b0c73
...
...
@@ -44,7 +44,12 @@ class ScipyGer(Ger):
# N.B. some versions of scipy (e.g. mine) don't actually work
# in-place on a, even when I tell it to.
A
=
cA
[
0
]
if
A
.
flags
[
'C_CONTIGUOUS'
]:
if
A
.
size
==
0
:
# We don't have to do anything, A is empty.
# We need this special case because Numpy considers it
# C-contiguous, wich is confusing.
pass
elif
A
.
flags
[
'C_CONTIGUOUS'
]:
A
=
local_ger
(
calpha
[
0
],
cy
[
0
],
cx
[
0
],
a
=
A
.
T
,
overwrite_a
=
int
(
self
.
destructive
))
.
T
else
:
...
...
theano/tensor/tests/test_blas.py
浏览文件 @
1f6b0c73
#from nose.plugins.skip import SkipTest
#import traceback
import
sys
import
itertools
,
sys
import
theano.tensor
as
T
from
theano
import
tensor
#from theano.gof import Env
from
theano.printing
import
pp
...
...
@@ -878,6 +879,11 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
# Assert they produce the same output
assert
numpy
.
allclose
(
f
(),
numpy
.
dot
(
v
.
get_value
(),
m
.
get_value
()))
# Assert it works when m has no contiguous dimension
m
.
set_value
(
m
.
get_value
(
borrow
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
assert
numpy
.
allclose
(
f
(),
numpy
.
dot
(
v
.
get_value
(),
m
.
get_value
()))
def
test_dot_mv
(
self
):
...
...
@@ -894,6 +900,11 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
# Assert they produce the same output
assert
numpy
.
allclose
(
f
(),
numpy
.
dot
(
m
.
get_value
(),
v
.
get_value
()))
# Assert it works when m has no contiguous dimension
m
.
set_value
(
m
.
get_value
(
borrow
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
assert
numpy
.
allclose
(
f
(),
numpy
.
dot
(
m
.
get_value
(),
v
.
get_value
()))
@staticmethod
def
t_gemv1
(
m_shp
):
...
...
@@ -915,19 +926,30 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
assert
topo
[
0
]
.
op
.
inplace
==
False
#test the inplace version
f
=
theano
.
function
([],
[],
updates
=
{
v2
:
v2
+
theano
.
dot
(
m
,
v1
)}
g
=
theano
.
function
([],
[],
updates
=
{
v2
:
v2
+
theano
.
dot
(
m
,
v1
)}
,
mode
=
mode_blas_opt
)
# Assert they produce the same output
f
()
g
()
assert
numpy
.
allclose
(
v2
.
get_value
(),
numpy
.
dot
(
m
.
get_value
(),
v1
.
get_value
())
+
v2_orig
)
topo
=
f
.
maker
.
env
.
toposort
()
topo
=
g
.
maker
.
env
.
toposort
()
assert
len
(
topo
)
==
1
assert
isinstance
(
topo
[
0
]
.
op
,
Gemv
)
if
config
.
mode
!=
'FAST_COMPILE'
:
assert
topo
[
0
]
.
op
.
inplace
==
True
# Do the same tests with a matrix with strides in both dimensions
m
.
set_value
(
m
.
get_value
(
borrow
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
v2
.
set_value
(
v2_orig
)
assert
numpy
.
allclose
(
f
(),
numpy
.
dot
(
m
.
get_value
(),
v1
.
get_value
())
+
v2_orig
)
g
()
assert
numpy
.
allclose
(
v2
.
get_value
(),
numpy
.
dot
(
m
.
get_value
(),
v1
.
get_value
())
+
v2_orig
)
def
test_gemv1
(
self
):
self
.
t_gemv1
((
3
,
2
))
self
.
t_gemv1
((
0
,
2
))
...
...
@@ -952,18 +974,29 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
assert
topo
[
-
1
]
.
op
.
inplace
==
False
#test the inplace version
f
=
theano
.
function
([],
[],
updates
=
{
v2
:
v2
+
theano
.
dot
(
v1
,
m
)}
g
=
theano
.
function
([],
[],
updates
=
{
v2
:
v2
+
theano
.
dot
(
v1
,
m
)}
,
mode
=
mode_blas_opt
)
# Assert they produce the same output
f
()
g
()
assert
numpy
.
allclose
(
v2
.
get_value
(),
numpy
.
dot
(
v1
.
get_value
(),
m
.
get_value
())
+
v2_orig
)
topo
=
f
.
maker
.
env
.
toposort
()
topo
=
g
.
maker
.
env
.
toposort
()
assert
sum
(
isinstance
(
node
.
op
,
Gemv
)
for
node
in
topo
)
==
1
if
config
.
mode
!=
'FAST_COMPILE'
:
assert
topo
[
-
1
]
.
op
.
inplace
==
True
# Do the same tests with a matrix with strides in both dimensions
m
.
set_value
(
m
.
get_value
(
borrow
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
v2
.
set_value
(
v2_orig
)
assert
numpy
.
allclose
(
f
(),
numpy
.
dot
(
v1
.
get_value
(),
m
.
get_value
())
+
v2
.
get_value
())
g
()
assert
numpy
.
allclose
(
v2
.
get_value
(),
numpy
.
dot
(
v1
.
get_value
(),
m
.
get_value
())
+
v2_orig
)
def
test_gemv_dimensions
(
self
):
A
=
T
.
matrix
(
'A'
)
x
,
y
=
T
.
vectors
(
'x'
,
'y'
)
...
...
@@ -984,6 +1017,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
ones_6
=
numpy
.
ones
(
6
,
dtype
=
config
.
floatX
)
f
(
A_val
,
ones_3
,
ones_5
)
f
(
A_val
[::
-
1
,
::
-
1
],
ones_3
,
ones_5
)
self
.
assertRaises
(
ValueError
,
f
,
A_val
,
ones_4
,
ones_5
)
self
.
assertRaises
(
ValueError
,
f
,
A_val
,
ones_3
,
ones_6
)
self
.
assertRaises
(
ValueError
,
f
,
A_val
,
ones_4
,
ones_6
)
...
...
@@ -1017,6 +1051,7 @@ def matrixmultiply(a, b):
class
BaseGemv
(
object
):
mode
=
mode_blas_opt
# can be overridden with self.mode
shared
=
staticmethod
(
theano
.
shared
)
def
get_data
(
self
,
x_stride
=
1
,
y_stride
=
1
):
rng
=
numpy
.
random
.
RandomState
(
unittest_tools
.
fetch_seed
())
...
...
@@ -1031,7 +1066,7 @@ class BaseGemv(object):
return
alpha
,
beta
,
a
,
x
,
y
def
test_simple
(
self
):
alpha
,
beta
,
a
,
x
,
y
=
[
shared
(
value
)
for
value
in
self
.
get_data
()
]
alpha
,
beta
,
a
,
x
,
y
=
[
s
elf
.
s
hared
(
value
)
for
value
in
self
.
get_data
()
]
desired_oy
=
alpha
.
get_value
()
*
matrixmultiply
(
a
.
get_value
(),
x
.
get_value
())
+
beta
.
get_value
()
*
y
.
get_value
()
oy
=
alpha
*
T
.
dot
(
a
,
x
)
+
beta
*
y
...
...
@@ -1049,8 +1084,8 @@ class BaseGemv(object):
vs
=
self
.
get_data
()
alpha_v
,
beta_v
,
a_v
,
x_v
,
y_v
=
vs
a
=
shared
(
a_v
)
x
=
shared
(
x_v
)
a
=
s
elf
.
s
hared
(
a_v
)
x
=
s
elf
.
s
hared
(
x_v
)
desired_oy
=
matrixmultiply
(
a_v
,
x_v
)
...
...
@@ -1067,7 +1102,7 @@ class BaseGemv(object):
def
test_simple_transpose
(
self
):
vs
=
self
.
get_data
()
alpha_v
,
beta_v
,
a_v
,
x_v
,
y_v
=
vs
alpha
,
beta
,
a
,
x
,
y
=
[
shared
(
v
)
for
v
in
vs
]
alpha
,
beta
,
a
,
x
,
y
=
[
s
elf
.
s
hared
(
v
)
for
v
in
vs
]
desired_oy
=
alpha_v
*
matrixmultiply
(
transpose
(
a_v
),
x_v
)
+
beta_v
*
y_v
...
...
@@ -1083,7 +1118,7 @@ class BaseGemv(object):
def
test_x_stride
(
self
):
vs
=
self
.
get_data
(
x_stride
=
2
)
alpha_v
,
beta_v
,
a_v
,
x_v
,
y_v
=
vs
alpha
,
beta
,
a
,
x
,
y
=
[
shared
(
v
)
for
v
in
vs
]
alpha
,
beta
,
a
,
x
,
y
=
[
s
elf
.
s
hared
(
v
)
for
v
in
vs
]
desired_oy
=
alpha_v
*
matrixmultiply
(
a_v
,
x_v
[::
2
])
+
beta_v
*
y_v
...
...
@@ -1099,7 +1134,7 @@ class BaseGemv(object):
def
test_x_stride_transpose
(
self
):
vs
=
self
.
get_data
(
x_stride
=
2
)
alpha_v
,
beta_v
,
a_v
,
x_v
,
y_v
=
vs
alpha
,
beta
,
a
,
x
,
y
=
[
shared
(
v
)
for
v
in
vs
]
alpha
,
beta
,
a
,
x
,
y
=
[
s
elf
.
s
hared
(
v
)
for
v
in
vs
]
desired_oy
=
alpha_v
*
matrixmultiply
(
transpose
(
a_v
),
x_v
[::
2
])
+
beta_v
*
y_v
...
...
@@ -1115,7 +1150,7 @@ class BaseGemv(object):
def
test_y_stride
(
self
):
vs
=
self
.
get_data
(
y_stride
=
2
)
alpha_v
,
beta_v
,
a_v
,
x_v
,
y_v
=
vs
alpha
,
beta
,
a
,
x
,
y
=
[
shared
(
v
)
for
v
in
vs
]
alpha
,
beta
,
a
,
x
,
y
=
[
s
elf
.
s
hared
(
v
)
for
v
in
vs
]
desired_oy
=
alpha_v
*
matrixmultiply
(
a_v
,
x_v
)
+
beta_v
*
y_v
[::
2
]
...
...
@@ -1131,7 +1166,7 @@ class BaseGemv(object):
def
test_y_stride_transpose
(
self
):
vs
=
self
.
get_data
(
y_stride
=
2
)
alpha_v
,
beta_v
,
a_v
,
x_v
,
y_v
=
vs
alpha
,
beta
,
a
,
x
,
y
=
[
shared
(
v
)
for
v
in
vs
]
alpha
,
beta
,
a
,
x
,
y
=
[
s
elf
.
s
hared
(
v
)
for
v
in
vs
]
desired_oy
=
alpha_v
*
matrixmultiply
(
transpose
(
a_v
),
x_v
)
+
beta_v
*
y_v
[::
2
]
...
...
@@ -1144,6 +1179,46 @@ class BaseGemv(object):
oy_v
=
oy_func
()
assert_array_almost_equal
(
desired_oy
,
oy_v
)
def
test_a_strides
(
self
):
vs
=
self
.
get_data
()
alpha_v
,
beta_v
,
a_v
,
x_v
,
y_v
=
vs
alpha
,
beta
,
a
,
x
,
y
=
[
self
.
shared
(
v
)
for
v
in
vs
]
a_v
=
a_v
[::
-
1
,
::
-
1
]
a
.
set_value
(
a
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
desired_oy
=
alpha_v
*
matrixmultiply
(
a_v
,
x_v
)
+
beta_v
*
y_v
oy
=
alpha
*
T
.
dot
(
a
,
x
)
+
beta
*
y
oy_func
=
theano
.
function
([],
oy
,
mode
=
self
.
mode
)
self
.
assertFunctionContains1
(
oy_func
,
self
.
gemv
)
oy_v
=
oy_func
()
assert_array_almost_equal
(
desired_oy
,
oy_v
)
def
test_a_strides_transpose
(
self
):
vs
=
self
.
get_data
()
alpha_v
,
beta_v
,
a_v
,
x_v
,
y_v
=
vs
alpha
,
beta
,
a
,
x
,
y
=
[
self
.
shared
(
v
)
for
v
in
vs
]
a_v
=
a_v
[::
-
1
,
::
-
1
]
a
.
set_value
(
a
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
desired_oy
=
alpha_v
*
matrixmultiply
(
transpose
(
a_v
),
x_v
)
+
beta_v
*
y_v
oy
=
alpha
*
T
.
dot
(
a
.
T
,
x
)
+
beta
*
y
oy_func
=
theano
.
function
([],
oy
,
mode
=
self
.
mode
)
self
.
assertFunctionContains1
(
oy_func
,
self
.
gemv
)
oy_v
=
oy_func
()
assert_array_almost_equal
(
desired_oy
,
oy_v
)
def
test_upcasting_scalar_nogemv
(
self
):
# Test that the optimization does not crash when the scale has
# an incorrect dtype, and forces upcasting of the result
...
...
@@ -1281,6 +1356,7 @@ class TestGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
class
TestGer
(
TestCase
,
unittest_tools
.
TestOptimizationMixin
):
shared
=
staticmethod
(
theano
.
shared
)
def
setUp
(
self
):
self
.
mode
=
theano
.
compile
.
get_default_mode
()
.
including
(
'fast_run'
)
...
...
@@ -1332,6 +1408,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f
(
numpy
.
random
.
rand
(
5
,
4
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
5
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
4
)
.
astype
(
self
.
dtype
))
f
(
numpy
.
random
.
rand
(
5
,
4
)
.
astype
(
self
.
dtype
)[::
-
1
,
::
-
1
],
numpy
.
random
.
rand
(
5
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
4
)
.
astype
(
self
.
dtype
))
def
test_A_plus_scaled_outer
(
self
):
f
=
self
.
function
([
self
.
A
,
self
.
x
,
self
.
y
],
...
...
@@ -1340,6 +1419,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f
(
numpy
.
random
.
rand
(
5
,
4
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
5
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
4
)
.
astype
(
self
.
dtype
))
f
(
numpy
.
random
.
rand
(
5
,
4
)
.
astype
(
self
.
dtype
)[::
-
1
,
::
-
1
],
numpy
.
random
.
rand
(
5
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
4
)
.
astype
(
self
.
dtype
))
def
test_scaled_A_plus_scaled_outer
(
self
):
f
=
self
.
function
([
self
.
A
,
self
.
x
,
self
.
y
],
...
...
@@ -1352,6 +1434,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f
(
numpy
.
random
.
rand
(
5
,
4
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
5
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
4
)
.
astype
(
self
.
dtype
))
f
(
numpy
.
random
.
rand
(
5
,
4
)
.
astype
(
self
.
dtype
)[::
-
1
,
::
-
1
],
numpy
.
random
.
rand
(
5
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
4
)
.
astype
(
self
.
dtype
))
def
given_dtype
(
self
,
dtype
,
M
,
N
):
""" test corner case shape and dtype"""
...
...
@@ -1362,6 +1447,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f
(
numpy
.
random
.
rand
(
M
,
N
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
M
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
N
)
.
astype
(
self
.
dtype
))
f
(
numpy
.
random
.
rand
(
M
,
N
)
.
astype
(
self
.
dtype
)[::
-
1
,
::
-
1
],
numpy
.
random
.
rand
(
M
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
N
)
.
astype
(
self
.
dtype
))
def
test_f32_0_0
(
self
):
return
self
.
given_dtype
(
'float32'
,
0
,
0
)
...
...
@@ -1394,10 +1482,393 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
return
self
.
given_dtype
(
'complex128'
,
1
,
9
)
def
test_inplace
(
self
):
A
=
theano
.
shared
(
numpy
.
random
.
rand
(
4
,
5
)
.
astype
(
self
.
dtype
))
A
=
self
.
shared
(
numpy
.
random
.
rand
(
4
,
5
)
.
astype
(
self
.
dtype
))
f
=
self
.
function
([
self
.
x
,
self
.
y
],
[],
updates
=
{
A
:
A
+
T
.
constant
(
0.1
,
dtype
=
self
.
dtype
)
*
T
.
outer
(
self
.
x
,
self
.
y
)})
self
.
assertFunctionContains
(
f
,
self
.
ger_destructive
)
f
(
numpy
.
random
.
rand
(
4
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
5
)
.
astype
(
self
.
dtype
))
A
.
set_value
(
A
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
f
(
numpy
.
random
.
rand
(
4
)
.
astype
(
self
.
dtype
),
numpy
.
random
.
rand
(
5
)
.
astype
(
self
.
dtype
))
class
TestBlasStrides
(
TestCase
):
dtype
=
'float64'
shared
=
staticmethod
(
tensor
.
_shared
)
mode
=
theano
.
compile
.
get_default_mode
()
mode
=
mode
.
including
(
'fast_run'
)
.
excluding
(
'gpu'
,
'c_blas'
,
'scipy_blas'
)
rng
=
numpy
.
random
.
RandomState
(
seed
=
unittest_tools
.
fetch_seed
())
def
rand
(
self
,
*
shape
):
return
theano
.
_asarray
(
self
.
rng
.
rand
(
*
shape
),
dtype
=
self
.
dtype
)
def
cmp_dot22
(
self
,
b_shp
,
c_shp
):
av
=
numpy
.
zeros
((
0
,
0
),
dtype
=
self
.
dtype
)
bv
=
self
.
rand
(
*
b_shp
)
cv
=
self
.
rand
(
*
c_shp
)
a
=
self
.
shared
(
av
,
'a'
)
b
=
self
.
shared
(
bv
,
'b'
)
c
=
self
.
shared
(
cv
,
'c'
)
b_t
=
self
.
shared
(
bv
.
T
,
'b.T'
)
c_t
=
self
.
shared
(
cv
.
T
,
'c.T'
)
b_dev
=
b
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
c_dev
=
c
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
bt_dev
=
b_t
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
ct_dev
=
c_t
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
f_nn
=
theano
.
function
([],
[],
updates
=
{
a
:
tensor
.
dot
(
b
,
c
)},
mode
=
self
.
mode
)
print
'class name:'
,
self
.
__class__
.
__name__
theano
.
printing
.
debugprint
(
f_nn
)
f_nt
=
theano
.
function
([],
[],
updates
=
{
a
:
tensor
.
dot
(
b
,
c_t
.
T
)},
mode
=
self
.
mode
)
f_tn
=
theano
.
function
([],
[],
updates
=
{
a
:
tensor
.
dot
(
b_t
.
T
,
c
)},
mode
=
self
.
mode
)
f_tt
=
theano
.
function
([],
[],
updates
=
{
a
:
tensor
.
dot
(
b_t
.
T
,
c_t
.
T
)},
mode
=
self
.
mode
)
# Try with all stride patterns, and all transposed pattern
for
step_signs
in
itertools
.
product
((
-
1
,
1
),
repeat
=
4
):
for
step
in
(
1
,
2
):
b_step1
,
b_step2
,
c_step1
,
c_step2
=
(
s
*
step
for
s
in
step_signs
)
b
.
set_value
(
b_dev
.
copy
()[::
b_step1
,
::
b_step2
],
borrow
=
True
)
c
.
set_value
(
c_dev
.
copy
()[::
c_step1
,
::
c_step2
],
borrow
=
True
)
b_t
.
set_value
(
bt_dev
.
copy
()[::
b_step2
,
::
b_step1
],
borrow
=
True
)
c_t
.
set_value
(
ct_dev
.
copy
()[::
c_step2
,
::
c_step1
],
borrow
=
True
)
# Numpy result
a_n
=
numpy
.
dot
(
bv
[::
b_step1
,
::
b_step2
],
cv
[::
c_step1
,
::
c_step2
])
f_nn
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
f_nt
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
f_tn
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
f_tt
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
def
test_dot22
(
self
):
self
.
cmp_dot22
((
3
,
4
),
(
4
,
5
))
self
.
cmp_dot22
((
1
,
4
),
(
4
,
5
))
self
.
cmp_dot22
((
3
,
4
),
(
4
,
1
))
self
.
cmp_dot22
((
3
,
1
),
(
1
,
1
))
self
.
cmp_dot22
((
1
,
4
),
(
4
,
1
))
self
.
cmp_dot22
((
3
,
1
),
(
1
,
5
))
self
.
cmp_dot22
((
0
,
4
),
(
4
,
5
))
self
.
cmp_dot22
((
0
,
4
),
(
4
,
1
))
self
.
cmp_dot22
((
0
,
1
),
(
1
,
5
))
self
.
cmp_dot22
((
3
,
4
),
(
4
,
0
))
self
.
cmp_dot22
((
3
,
0
),
(
0
,
5
))
self
.
cmp_dot22
((
0
,
4
),
(
4
,
0
))
self
.
cmp_dot22
((
0
,
0
),
(
0
,
0
))
def
cmp_dot22scalar
(
self
,
b_shp
,
c_shp
):
av
=
numpy
.
zeros
((
0
,
0
),
dtype
=
self
.
dtype
)
bv
=
self
.
rand
(
*
b_shp
)
cv
=
self
.
rand
(
*
c_shp
)
l
=
numpy
.
float32
(
0.2
)
a
=
self
.
shared
(
av
,
'a'
)
b
=
self
.
shared
(
bv
,
'b'
)
c
=
self
.
shared
(
cv
,
'c'
)
b_t
=
self
.
shared
(
bv
.
T
,
'b.T'
)
c_t
=
self
.
shared
(
cv
.
T
,
'c.T'
)
b_dev
=
b
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
c_dev
=
c
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
bt_dev
=
b_t
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
ct_dev
=
c_t
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
f_nn
=
theano
.
function
([],
[],
updates
=
{
a
:
l
*
tensor
.
dot
(
b
,
c
)},
mode
=
self
.
mode
)
f_nt
=
theano
.
function
([],
[],
updates
=
{
a
:
l
*
tensor
.
dot
(
b
,
c_t
.
T
)},
mode
=
self
.
mode
)
f_tn
=
theano
.
function
([],
[],
updates
=
{
a
:
l
*
tensor
.
dot
(
b_t
.
T
,
c
)},
mode
=
self
.
mode
)
f_tt
=
theano
.
function
([],
[],
updates
=
{
a
:
l
*
tensor
.
dot
(
b_t
.
T
,
c_t
.
T
)},
mode
=
self
.
mode
)
# Try with all stride patterns, and all transposed pattern
for
step_signs
in
itertools
.
product
((
-
1
,
1
),
repeat
=
4
):
for
step
in
(
1
,
2
):
b_step1
,
b_step2
,
c_step1
,
c_step2
=
(
s
*
step
for
s
in
step_signs
)
b
.
set_value
(
b_dev
.
copy
()[::
b_step1
,
::
b_step2
],
borrow
=
True
)
c
.
set_value
(
c_dev
.
copy
()[::
c_step1
,
::
c_step2
],
borrow
=
True
)
b_t
.
set_value
(
bt_dev
.
copy
()[::
b_step2
,
::
b_step1
],
borrow
=
True
)
c_t
.
set_value
(
ct_dev
.
copy
()[::
c_step2
,
::
c_step1
],
borrow
=
True
)
# Numpy result
a_n
=
l
*
numpy
.
dot
(
bv
[::
b_step1
,
::
b_step2
],
cv
[::
c_step1
,
::
c_step2
])
f_nn
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
f_nt
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
f_tn
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
f_tt
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
def
test_dot22scalar
(
self
):
self
.
cmp_dot22scalar
((
3
,
4
),
(
4
,
5
))
self
.
cmp_dot22scalar
((
1
,
4
),
(
4
,
5
))
self
.
cmp_dot22scalar
((
3
,
4
),
(
4
,
1
))
self
.
cmp_dot22scalar
((
3
,
1
),
(
1
,
1
))
self
.
cmp_dot22scalar
((
1
,
4
),
(
4
,
1
))
self
.
cmp_dot22scalar
((
3
,
1
),
(
1
,
5
))
self
.
cmp_dot22scalar
((
0
,
4
),
(
4
,
5
))
self
.
cmp_dot22scalar
((
0
,
4
),
(
4
,
1
))
self
.
cmp_dot22scalar
((
0
,
1
),
(
1
,
5
))
self
.
cmp_dot22scalar
((
3
,
4
),
(
4
,
0
))
self
.
cmp_dot22scalar
((
3
,
0
),
(
0
,
5
))
self
.
cmp_dot22scalar
((
0
,
4
),
(
4
,
0
))
self
.
cmp_dot22scalar
((
0
,
0
),
(
0
,
0
))
def
cmp_gemm
(
self
,
a_shp
,
b_shp
,
c_shp
):
av
=
self
.
rand
(
*
a_shp
)
bv
=
self
.
rand
(
*
b_shp
)
cv
=
self
.
rand
(
*
c_shp
)
l
=
numpy
.
float32
(
0.2
)
a
=
self
.
shared
(
av
,
'a'
)
b
=
self
.
shared
(
bv
,
'b'
)
c
=
self
.
shared
(
cv
,
'c'
)
a_t
=
self
.
shared
(
av
.
T
,
'a.T'
)
b_t
=
self
.
shared
(
bv
.
T
,
'b.T'
)
c_t
=
self
.
shared
(
cv
.
T
,
'c.T'
)
a_dev
=
a
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
b_dev
=
b
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
c_dev
=
c
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
bt_dev
=
b_t
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
ct_dev
=
c_t
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
f_nnn
=
theano
.
function
([],
[],
updates
=
{
a
:
(
l
*
a
+
tensor
.
dot
(
b
,
c
))},
mode
=
self
.
mode
)
f_nnt
=
theano
.
function
([],
[],
updates
=
{
a
:
(
l
*
a
+
tensor
.
dot
(
b
,
c_t
.
T
))},
mode
=
self
.
mode
)
f_ntn
=
theano
.
function
([],
[],
updates
=
{
a
:
(
l
*
a
+
tensor
.
dot
(
b_t
.
T
,
c
))},
mode
=
self
.
mode
)
f_ntt
=
theano
.
function
([],
[],
updates
=
{
a
:
(
l
*
a
+
tensor
.
dot
(
b_t
.
T
,
c_t
.
T
))},
mode
=
self
.
mode
)
f_tnn
=
theano
.
function
([],
[],
updates
=
{
a_t
:
(
l
*
a_t
+
tensor
.
dot
(
b
,
c
)
.
T
)},
mode
=
self
.
mode
)
f_tnt
=
theano
.
function
([],
[],
updates
=
{
a_t
:
(
l
*
a_t
+
tensor
.
dot
(
b
,
c_t
.
T
)
.
T
)},
mode
=
self
.
mode
)
f_ttn
=
theano
.
function
([],
[],
updates
=
{
a_t
:
(
l
*
a_t
+
tensor
.
dot
(
b_t
.
T
,
c
)
.
T
)},
mode
=
self
.
mode
)
f_ttt
=
theano
.
function
([],
[],
updates
=
{
a_t
:
(
l
*
a_t
+
tensor
.
dot
(
b_t
.
T
,
c_t
.
T
)
.
T
)},
mode
=
self
.
mode
)
# Try with all stride patterns, and all transposed pattern
for
step_signs
in
itertools
.
product
((
-
1
,
1
),
repeat
=
6
):
for
step
in
(
1
,
2
):
a_step1
,
a_step2
,
b_step1
,
b_step2
,
c_step1
,
c_step2
=
\
(
s
*
step
for
s
in
step_signs
)
b
.
set_value
(
b_dev
.
copy
()[::
b_step1
,
::
b_step2
],
borrow
=
True
)
c
.
set_value
(
c_dev
.
copy
()[::
c_step1
,
::
c_step2
],
borrow
=
True
)
b_t
.
set_value
(
bt_dev
.
copy
()[::
b_step2
,
::
b_step1
],
borrow
=
True
)
c_t
.
set_value
(
ct_dev
.
copy
()[::
c_step2
,
::
c_step1
],
borrow
=
True
)
# Numpy results
a_n
=
(
l
*
av
[::
a_step1
,
::
a_step2
]
+
numpy
.
dot
(
bv
[::
b_step1
,
::
b_step2
],
cv
[::
c_step1
,
::
c_step2
]))
at_n
=
(
l
*
av
[::
a_step1
,
::
a_step2
]
.
T
+
numpy
.
dot
(
bv
[::
b_step1
,
::
b_step2
],
cv
[::
c_step1
,
::
c_step2
])
.
T
)
# a's value is updated, so we need to reinitialize it each time
a
.
set_value
(
a_dev
.
copy
()[::
a_step1
,
::
a_step2
],
borrow
=
True
)
f_nnn
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
a
.
set_value
(
a_dev
.
copy
()[::
a_step1
,
::
a_step2
],
borrow
=
True
)
f_nnt
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
a
.
set_value
(
a_dev
.
copy
()[::
a_step1
,
::
a_step2
],
borrow
=
True
)
f_ntn
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
a
.
set_value
(
a_dev
.
copy
()[::
a_step1
,
::
a_step2
],
borrow
=
True
)
f_ntt
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
)
a_t
.
set_value
(
transpose
(
a_dev
.
copy
())[::
a_step2
,
::
a_step1
],
borrow
=
True
)
f_tnn
()
assert
numpy
.
allclose
(
a_t
.
get_value
(),
at_n
)
a_t
.
set_value
(
transpose
(
a_dev
.
copy
())[::
a_step2
,
::
a_step1
],
borrow
=
True
)
f_tnt
()
assert
numpy
.
allclose
(
a_t
.
get_value
(),
at_n
)
a_t
.
set_value
(
transpose
(
a_dev
.
copy
())[::
a_step2
,
::
a_step1
],
borrow
=
True
)
f_ttn
()
assert
numpy
.
allclose
(
a_t
.
get_value
(),
at_n
)
a_t
.
set_value
(
transpose
(
a_dev
.
copy
())[::
a_step2
,
::
a_step1
],
borrow
=
True
)
f_ttt
()
assert
numpy
.
allclose
(
a_t
.
get_value
(),
at_n
)
def
test_gemm
(
self
):
self
.
cmp_gemm
((
3
,
5
),
(
3
,
4
),
(
4
,
5
))
self
.
cmp_gemm
((
1
,
5
),
(
1
,
4
),
(
4
,
5
))
self
.
cmp_gemm
((
3
,
1
),
(
3
,
4
),
(
4
,
1
))
self
.
cmp_gemm
((
3
,
1
),
(
3
,
1
),
(
1
,
1
))
self
.
cmp_gemm
((
1
,
1
),
(
1
,
4
),
(
4
,
1
))
self
.
cmp_gemm
((
3
,
5
),
(
3
,
1
),
(
1
,
5
))
self
.
cmp_gemm
((
0
,
5
),
(
0
,
4
),
(
4
,
5
))
self
.
cmp_gemm
((
0
,
1
),
(
0
,
4
),
(
4
,
1
))
self
.
cmp_gemm
((
0
,
5
),
(
0
,
1
),
(
1
,
5
))
self
.
cmp_gemm
((
3
,
0
),
(
3
,
4
),
(
4
,
0
))
self
.
cmp_gemm
((
3
,
5
),
(
3
,
0
),
(
0
,
5
))
self
.
cmp_gemm
((
0
,
0
),
(
0
,
4
),
(
4
,
0
))
self
.
cmp_gemm
((
0
,
0
),
(
0
,
0
),
(
0
,
0
))
def
cmp_gemv
(
self
,
a_shp
,
b_shp
,
c_shp
):
av
=
self
.
rand
(
a_shp
)
bv
=
self
.
rand
(
*
b_shp
)
cv
=
self
.
rand
(
c_shp
)
l
=
numpy
.
float32
(
0.2
)
a
=
self
.
shared
(
av
,
'a'
)
b
=
self
.
shared
(
bv
,
'b'
)
c
=
self
.
shared
(
cv
,
'c'
)
b_t
=
self
.
shared
(
bv
.
T
,
'b.T'
)
a_dev
=
a
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
b_dev
=
b
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
c_dev
=
c
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
f_n
=
theano
.
function
([],
[],
updates
=
{
a
:
(
a
+
l
*
tensor
.
dot
(
b
,
c
))},
mode
=
self
.
mode
)
f_t
=
theano
.
function
([],
[],
updates
=
{
a
:
(
a
+
l
*
tensor
.
dot
(
b_t
.
T
,
c
))},
mode
=
self
.
mode
)
# Try with all stride patterns, and all transposed pattern
for
step_signs
in
itertools
.
product
((
1
,
-
1
),
repeat
=
4
):
for
step
in
(
1
,
2
):
a_step
,
b_step1
,
b_step2
,
c_step
=
(
s
*
step
for
s
in
step_signs
)
a
.
set_value
(
a_dev
.
copy
()[::
a_step
],
borrow
=
True
)
b
.
set_value
(
b_dev
.
copy
()[::
b_step1
,
::
b_step2
],
borrow
=
True
)
b_t
.
set_value
(
transpose
(
b_dev
.
copy
())[::
b_step2
,
::
b_step1
],
borrow
=
True
)
c
.
set_value
(
c_dev
.
copy
()[::
c_step
],
borrow
=
True
)
a_n
=
(
av
[::
a_step
]
+
l
*
numpy
.
dot
(
bv
[::
b_step1
,
::
b_step2
],
cv
[::
c_step
]))
f_n
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
),
(
a
.
get_value
(),
a_n
)
a
.
set_value
(
a_dev
.
copy
()[::
a_step
],
borrow
=
True
)
f_t
()
assert
numpy
.
allclose
(
a
.
get_value
(),
a_n
),
(
a
.
get_value
(),
a_n
)
def
test_gemv
(
self
):
self
.
cmp_gemv
(
3
,
(
3
,
5
),
5
)
self
.
cmp_gemv
(
1
,
(
1
,
5
),
5
)
self
.
cmp_gemv
(
3
,
(
3
,
1
),
1
)
self
.
cmp_gemv
(
0
,
(
0
,
5
),
5
)
self
.
cmp_gemv
(
3
,
(
3
,
0
),
0
)
self
.
cmp_gemv
(
0
,
(
0
,
1
),
1
)
self
.
cmp_gemv
(
1
,
(
1
,
0
),
0
)
self
.
cmp_gemv
(
0
,
(
0
,
0
),
0
)
def
cmp_ger
(
self
,
a_shp
,
b_shp
,
c_shp
):
av
=
self
.
rand
(
*
a_shp
)
bv
=
self
.
rand
(
b_shp
)
cv
=
self
.
rand
(
c_shp
)
l
=
numpy
.
float32
(
0.2
)
a
=
self
.
shared
(
av
,
'a'
)
b
=
self
.
shared
(
bv
,
'b'
)
c
=
self
.
shared
(
cv
,
'c'
)
a_t
=
self
.
shared
(
av
.
T
,
'a.T'
)
a_dev
=
a
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
b_dev
=
b
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
c_dev
=
c
.
get_value
(
borrow
=
False
,
return_internal_type
=
True
)
f_n
=
theano
.
function
([],
[],
updates
=
{
a
:
(
a
+
l
*
tensor
.
outer
(
b
,
c
))},
mode
=
self
.
mode
)
f_t
=
theano
.
function
([],
[],
updates
=
{
a_t
:
(
a_t
+
l
*
tensor
.
outer
(
b
,
c
)
.
T
)},
mode
=
self
.
mode
)
# Try with all stride patterns, and all transposed patterns
for
step_signs
in
itertools
.
product
((
1
,
-
1
),
repeat
=
4
):
for
step
in
(
1
,
2
):
a_step1
,
a_step2
,
b_step
,
c_step
=
(
s
*
step
for
s
in
step_signs
)
a
.
set_value
(
a_dev
.
copy
()[::
a_step1
,
::
a_step2
],
borrow
=
True
)
a_t
.
set_value
(
transpose
(
a_dev
.
copy
())[::
a_step1
,
::
a_step2
],
borrow
=
True
)
b
.
set_value
(
b_dev
.
copy
()[::
b_step
],
borrow
=
True
)
c
.
set_value
(
c_dev
.
copy
()[::
c_step
],
borrow
=
True
)
f_n
()
n_n
=
(
av
[::
a_step1
,
::
a_step2
]
+
l
*
numpy
.
outer
(
bv
[::
b_step
],
cv
[::
c_step
]))
assert
numpy
.
allclose
(
a
.
get_value
(),
n_n
),
(
a
.
get_value
(),
n_n
)
f_t
()
n_t
=
(
av
.
T
[::
a_step1
,
::
a_step2
]
+
l
*
numpy
.
outer
(
bv
[::
b_step
],
cv
[::
c_step
])
.
T
)
assert
numpy
.
allclose
(
a_t
.
get_value
(),
n_t
),
\
(
a_t
.
get_value
(),
n_t
)
def
test_ger_strides
(
self
):
self
.
cmp_ger
((
3
,
5
),
3
,
5
)
self
.
cmp_ger
((
1
,
5
),
1
,
5
)
self
.
cmp_ger
((
3
,
1
),
3
,
1
)
self
.
cmp_ger
((
0
,
5
),
0
,
5
)
self
.
cmp_ger
((
3
,
0
),
3
,
0
)
self
.
cmp_ger
((
0
,
1
),
0
,
1
)
self
.
cmp_ger
((
1
,
0
),
1
,
0
)
self
.
cmp_ger
((
0
,
0
),
0
,
0
)
theano/tensor/tests/test_blas_c.py
浏览文件 @
1f6b0c73
import
sys
import
numpy
from
unittest
import
TestCase
import
theano
import
theano.tensor
as
tensor
...
...
@@ -14,8 +16,7 @@ from theano.tensor.blas import Gemv
from
theano.tests
import
unittest_tools
from
theano.tests.unittest_tools
import
TestOptimizationMixin
from
test_blas
import
TestCase
from
test_blas
import
BaseGemv
from
theano.tensor.tests.test_blas
import
BaseGemv
,
TestBlasStrides
mode_blas_opt
=
theano
.
compile
.
get_default_mode
()
.
including
(
'BlasOpt'
,
'specialize'
,
'InplaceBlasOpt'
,
'c_blas'
)
...
...
@@ -41,7 +42,8 @@ class TestCGer(TestCase, TestOptimizationMixin):
)
def
run_f
(
self
,
f
):
return
f
(
self
.
Aval
,
self
.
xval
,
self
.
yval
)
f
(
self
.
Aval
,
self
.
xval
,
self
.
yval
)
f
(
self
.
Aval
[::
-
1
,
::
-
1
],
self
.
xval
,
self
.
yval
)
def
b
(
self
,
bval
):
return
tensor
.
as_tensor_variable
(
numpy
.
asarray
(
bval
,
dtype
=
self
.
dtype
))
...
...
@@ -132,6 +134,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert
numpy
.
allclose
(
f
(
self
.
xval
,
self
.
Aval
),
numpy
.
dot
(
self
.
xval
,
self
.
Aval
))
# Test with negative strides on 2 dims
assert
numpy
.
allclose
(
f
(
self
.
xval
,
self
.
Aval
[::
-
1
,
::
-
1
]),
numpy
.
dot
(
self
.
xval
,
self
.
Aval
[::
-
1
,
::
-
1
]))
def
test_optimizations_mv
(
self
):
''' Test matrix dot vector '''
f
=
theano
.
function
([
self
.
A
,
self
.
y
],
...
...
@@ -145,6 +151,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
# Assert they produce the same output
assert
numpy
.
allclose
(
f
(
self
.
Aval
,
self
.
yval
),
numpy
.
dot
(
self
.
Aval
,
self
.
yval
))
# Test with negative strides on 2 dims
assert
numpy
.
allclose
(
f
(
self
.
Aval
[::
-
1
,
::
-
1
],
self
.
yval
),
numpy
.
dot
(
self
.
Aval
[::
-
1
,
::
-
1
],
self
.
yval
))
def
t_gemv1
(
self
,
m_shp
):
''' test vector2 + dot(matrix, vector1) '''
...
...
@@ -164,17 +174,28 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert
topo
==
[
CGemv
(
inplace
=
False
)],
topo
#test the inplace version
f
=
theano
.
function
([],
[],
g
=
theano
.
function
([],
[],
updates
=
{
v2
:
v2
+
theano
.
dot
(
m
,
v1
)},
mode
=
self
.
mode
)
# Assert they produce the same output
f
()
g
()
assert
numpy
.
allclose
(
v2
.
get_value
(),
numpy
.
dot
(
m
.
get_value
(),
v1
.
get_value
())
+
v2_orig
)
topo
=
[
n
.
op
for
n
in
f
.
maker
.
env
.
toposort
()]
topo
=
[
n
.
op
for
n
in
g
.
maker
.
env
.
toposort
()]
assert
topo
==
[
CGemv
(
inplace
=
True
)]
# Do the same tests with a matrix with strides in both dimensions
m
.
set_value
(
m
.
get_value
(
borrow
=
True
)[::
-
1
,
::
-
1
],
borrow
=
True
)
v2
.
set_value
(
v2_orig
)
assert
numpy
.
allclose
(
f
(),
numpy
.
dot
(
m
.
get_value
(),
v1
.
get_value
())
+
v2_orig
)
g
()
assert
numpy
.
allclose
(
v2
.
get_value
(),
numpy
.
dot
(
m
.
get_value
(),
v1
.
get_value
())
+
v2_orig
)
def
test_gemv1
(
self
):
self
.
t_gemv1
((
3
,
2
))
self
.
t_gemv1
((
0
,
2
))
...
...
@@ -200,6 +221,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
ones_6
=
numpy
.
ones
(
6
,
dtype
=
dtype
)
f
(
A_val
,
ones_3
,
ones_5
)
f
(
A_val
[::
-
1
,
::
-
1
],
ones_3
,
ones_5
)
self
.
assertRaises
(
ValueError
,
f
,
A_val
,
ones_4
,
ones_5
)
self
.
assertRaises
(
ValueError
,
f
,
A_val
,
ones_3
,
ones_6
)
self
.
assertRaises
(
ValueError
,
f
,
A_val
,
ones_4
,
ones_6
)
...
...
@@ -217,3 +239,6 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
dtype
=
'float64'
gemv
=
CGemv
(
inplace
=
False
)
gemv_inplace
=
CGemv
(
inplace
=
True
)
class
TestBlasStridesC
(
TestBlasStrides
):
mode
=
mode_blas_opt
theano/tensor/tests/test_blas_scipy.py
浏览文件 @
1f6b0c73
...
...
@@ -4,7 +4,7 @@ import theano
import
theano.tensor
as
tensor
from
theano.tensor.blas_scipy
import
ScipyGer
from
test_blas
import
TestCase
,
gemm_no_inplace
from
test_blas
import
TestCase
,
gemm_no_inplace
,
TestBlasStrides
from
theano.tests.unittest_tools
import
TestOptimizationMixin
class
TestScipyGer
(
TestCase
,
TestOptimizationMixin
):
...
...
@@ -30,6 +30,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
def
run_f
(
self
,
f
):
f
(
self
.
Aval
,
self
.
xval
,
self
.
yval
)
f
(
self
.
Aval
[::
-
1
,
::
-
1
],
self
.
xval
[::
-
1
],
self
.
yval
[::
-
1
])
def
b
(
self
,
bval
):
return
tensor
.
as_tensor_variable
(
numpy
.
asarray
(
bval
,
dtype
=
self
.
dtype
))
...
...
@@ -55,3 +56,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
0.2
*
self
.
A
+
0.1
*
tensor
.
outer
(
self
.
x
,
self
.
y
))
self
.
assertFunctionContains
(
f
,
gemm_no_inplace
)
self
.
run_f
(
f
)
#DebugMode tests correctness
class
TestBlasStridesScipy
(
TestBlasStrides
):
mode
=
theano
.
compile
.
get_default_mode
()
mode
=
mode
.
including
(
'fast_run'
)
.
excluding
(
'gpu'
,
'c_blas'
)
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论