Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
18708214
提交
18708214
authored
9月 21, 2012
作者:
lamblin
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #942 from nouiz/gpu_alloc2
Gpu alloc2
上级
c8934e50
a3987a12
显示空白字符变更
内嵌
并排
正在显示
9 个修改的文件
包含
208 行增加
和
135 行删除
+208
-135
__init__.py
theano/sandbox/cuda/__init__.py
+1
-1
basic_ops.py
theano/sandbox/cuda/basic_ops.py
+32
-9
cuda_ndarray.cu
theano/sandbox/cuda/cuda_ndarray.cu
+21
-105
cuda_ndarray.cuh
theano/sandbox/cuda/cuda_ndarray.cuh
+99
-11
opt.py
theano/sandbox/cuda/opt.py
+14
-0
test_basic_ops.py
theano/sandbox/cuda/tests/test_basic_ops.py
+2
-2
test_memory.py
theano/sandbox/cuda/tests/test_memory.py
+2
-2
test_opt.py
theano/sandbox/cuda/tests/test_opt.py
+33
-0
test_merge.py
theano/tensor/tests/test_merge.py
+4
-5
没有找到文件。
theano/sandbox/cuda/__init__.py
浏览文件 @
18708214
...
@@ -164,7 +164,7 @@ if compile_cuda_ndarray and cuda_available:
...
@@ -164,7 +164,7 @@ if compile_cuda_ndarray and cuda_available:
code
,
code
,
location
=
cuda_ndarray_loc
,
location
=
cuda_ndarray_loc
,
include_dirs
=
[
cuda_path
],
libs
=
[
'cublas'
],
include_dirs
=
[
cuda_path
],
libs
=
[
'cublas'
],
preargs
=
compiler
.
compile_args
())
preargs
=
[
'-O3'
]
+
compiler
.
compile_args
())
from
cuda_ndarray.cuda_ndarray
import
*
from
cuda_ndarray.cuda_ndarray
import
*
except
Exception
,
e
:
except
Exception
,
e
:
_logger
.
error
(
"Failed to compile cuda_ndarray.cu:
%
s"
,
str
(
e
))
_logger
.
error
(
"Failed to compile cuda_ndarray.cu:
%
s"
,
str
(
e
))
...
...
theano/sandbox/cuda/basic_ops.py
浏览文件 @
18708214
...
@@ -2255,20 +2255,28 @@ gpu_join = GpuJoin()
...
@@ -2255,20 +2255,28 @@ gpu_join = GpuJoin()
class
GpuAlloc
(
GpuOp
):
class
GpuAlloc
(
GpuOp
):
"""Implement Alloc on the gpu.
The memset_0 param is an optimization. When True, we call
cudaMalloc that is faster.
"""
"""
Implement Alloc on the gpu.
def
__init__
(
self
,
memset_0
=
False
):
"""
self
.
memset_0
=
memset_0
def
__init__
(
self
):
pass
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
return
type
(
self
)
==
type
(
other
)
and
self
.
memset_0
==
other
.
memset_0
def
__hash__
(
self
):
def
__hash__
(
self
):
return
hash
(
type
(
self
))
return
hash
(
type
(
self
))
^
hash
(
self
.
memset_0
)
def
__str__
(
self
):
def
__str__
(
self
):
return
self
.
__class__
.
__name__
#Hide the memset parameter when not used to prevent confusion.
if
self
.
memset_0
:
s
=
"
%
s{memset_0=
%
s}"
%
(
self
.
__class__
.
__name__
,
self
.
memset_0
)
else
:
s
=
self
.
__class__
.
__name__
return
s
def
make_node
(
self
,
value
,
*
shape
):
def
make_node
(
self
,
value
,
*
shape
):
#if their is unneeded transfert generated by the next line
#if their is unneeded transfert generated by the next line
...
@@ -2307,6 +2315,7 @@ class GpuAlloc(GpuOp):
...
@@ -2307,6 +2315,7 @@ class GpuAlloc(GpuOp):
value
=
inputs
[
0
]
value
=
inputs
[
0
]
shps
=
inputs
[
1
:]
shps
=
inputs
[
1
:]
nd
=
len
(
shps
)
nd
=
len
(
shps
)
memset_0
=
int
(
self
.
memset_0
)
str
=
"int dims[
%(nd)
s];
\n
"
%
locals
()
str
=
"int dims[
%(nd)
s];
\n
"
%
locals
()
for
idx
,
sh
in
enumerate
(
shps
):
for
idx
,
sh
in
enumerate
(
shps
):
str
+=
"dims[
%(idx)
s] = PyInt_AsLong((PyObject*)
%(sh)
s);
\n
"
%
locals
()
str
+=
"dims[
%(idx)
s] = PyInt_AsLong((PyObject*)
%(sh)
s);
\n
"
%
locals
()
...
@@ -2330,7 +2339,21 @@ class GpuAlloc(GpuOp):
...
@@ -2330,7 +2339,21 @@ class GpuAlloc(GpuOp):
%(fail)
s;
%(fail)
s;
}
}
}
}
if (CudaNdarray_CopyFromCudaNdarray(
%(out)
s,
%(value)
s, true))
if (
%(memset_0)
s)
{
if (cudaSuccess != cudaMemset(
%(out)
s->devdata, 0,
CudaNdarray_SIZE(
%(out)
s) * 4))
{
PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting
%%
d"
" bytes of device memory.",
CudaNdarray_SIZE(
%(out)
s) * 4);
Py_XDECREF(
%(out)
s);
%(out)
s = NULL;
%(fail)
s;
}
}
else if (CudaNdarray_CopyFromCudaNdarray(
%(out)
s,
%(value)
s, true))
{
{
// exception already set
// exception already set
Py_XDECREF(
%(out)
s);
Py_XDECREF(
%(out)
s);
...
@@ -2348,7 +2371,7 @@ class GpuAlloc(GpuOp):
...
@@ -2348,7 +2371,7 @@ class GpuAlloc(GpuOp):
return
[
None
for
i
in
inputs
]
return
[
None
for
i
in
inputs
]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
4
,)
return
(
5
,)
def
do_constant_folding
(
self
,
node
):
def
do_constant_folding
(
self
,
node
):
for
client
in
node
.
outputs
[
0
]
.
clients
:
for
client
in
node
.
outputs
[
0
]
.
clients
:
...
...
theano/sandbox/cuda/cuda_ndarray.cu
浏览文件 @
18708214
...
@@ -3133,24 +3133,6 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
...
@@ -3133,24 +3133,6 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
return
0
;
return
0
;
}
}
bool
CudaNdarray_is_c_contiguous
(
const
CudaNdarray
*
self
)
{
bool
c_contiguous
=
true
;
int
size
=
1
;
for
(
int
i
=
self
->
nd
-
1
;
(
i
>=
0
)
&&
c_contiguous
;
--
i
)
{
if
(
CudaNdarray_HOST_DIMS
(
self
)[
i
]
==
1
)
continue
;
if
(
CudaNdarray_HOST_STRIDES
(
self
)[
i
]
!=
size
)
{
c_contiguous
=
false
;
}
size
=
size
*
CudaNdarray_HOST_DIMS
(
self
)[
i
];
}
return
c_contiguous
;
}
PyObject
*
PyObject
*
CudaNdarray_new_nd
(
int
nd
)
CudaNdarray_new_nd
(
int
nd
)
{
{
...
@@ -4346,12 +4328,6 @@ CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
...
@@ -4346,12 +4328,6 @@ CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
return
self
->
host_structure
+
2
*
self
->
nd
;
return
self
->
host_structure
+
2
*
self
->
nd
;
}
}
void
cnda_mark_dev_structure_dirty
(
CudaNdarray
*
self
)
{
self
->
dev_structure_fresh
=
0
;
}
int
int
CudaNdarray_EqualAndIgnore
(
CudaNdarray
*
cnda1
,
CudaNdarray
*
cnda2
,
int
ignoreSync
,
int
ignoreBase
)
CudaNdarray_EqualAndIgnore
(
CudaNdarray
*
cnda1
,
CudaNdarray
*
cnda2
,
int
ignoreSync
,
int
ignoreBase
)
{
{
...
@@ -4406,43 +4382,33 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
...
@@ -4406,43 +4382,33 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
return
CudaNdarray_EqualAndIgnore
(
cnda1
,
cnda2
,
0
,
0
);
return
CudaNdarray_EqualAndIgnore
(
cnda1
,
cnda2
,
0
,
0
);
}
}
void
int
CudaNdarray_set_dim
(
CudaNdarray
*
self
,
int
idx
,
int
d
)
cnda_copy_structure_to_device
(
const
CudaNdarray
*
self
)
{
{
if
((
idx
>=
self
->
nd
)
||
(
idx
<
0
)
||
(
d
<
0
))
//If the device structure do not exists, create it.
//We allocate it here as we do not need it often.
//In fact, we need it so infrequently that we expect
//that most object won't need it. Not allocating it
//save a significant when creating object.
//This speed up a benchmark by 8% with the gc.
if
(
!
self
->
dev_structure
)
{
int
struct_size
=
cnda_structure_size
(
self
->
nd
);
if
(
struct_size
)
{
{
fprintf
(
stderr
,
"WARNING: probably bad CudaNdarray_set_dim arguments: %i %i
\n
"
,
idx
,
d
);
self
->
dev_structure
=
(
int
*
)
device_malloc
(
struct_size
*
sizeof
(
int
));
}
if
(
NULL
==
self
->
dev_structure
)
if
(
d
!=
self
->
host_structure
[
idx
])
{
{
self
->
host_structure
[
idx
]
=
d
;
return
-
1
;
int
log2d
=
(
int
)
log2
((
double
)
d
);
self
->
host_structure
[
idx
+
2
*
self
->
nd
]
=
(
d
==
(
1
<<
log2d
))
?
log2d
:
-
1
;
cnda_mark_dev_structure_dirty
(
self
);
}
}
}
void
CudaNdarray_set_stride
(
CudaNdarray
*
self
,
int
idx
,
int
s
)
{
if
((
idx
>=
self
->
nd
)
||
(
idx
<
0
))
{
fprintf
(
stderr
,
"WARNING: probably bad CudaNdarray_set_stride arguments: %i %i
\n
"
,
idx
,
s
);
}
}
if
(
s
!=
CudaNdarray_HOST_STRIDES
(
self
)[
idx
])
{
self
->
host_structure
[
idx
+
self
->
nd
]
=
s
;
cnda_mark_dev_structure_dirty
(
self
);
}
}
}
cublasSetVector
(
cnda_structure_size
(
self
->
nd
),
sizeof
(
int
),
self
->
host_structure
,
int
1
,
cnda_copy_structure_to_device
(
const
CudaNdarray
*
self
)
self
->
dev_structure
,
{
1
);
cublasSetVector
(
cnda_structure_size
(
self
->
nd
),
sizeof
(
int
),
self
->
host_structure
,
1
,
self
->
dev_structure
,
1
);
CNDA_THREAD_SYNC
;
CNDA_THREAD_SYNC
;
if
(
CUBLAS_STATUS_SUCCESS
!=
cublasGetError
())
if
(
CUBLAS_STATUS_SUCCESS
!=
cublasGetError
())
{
{
...
@@ -4510,56 +4476,6 @@ CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
...
@@ -4510,56 +4476,6 @@ CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
return
PyInt_FromLong
(
CudaNdarray_SIZE
(
self
));
return
PyInt_FromLong
(
CudaNdarray_SIZE
(
self
));
}
}
int
CudaNdarray_set_nd
(
CudaNdarray
*
self
,
const
int
nd
)
{
if
(
nd
!=
self
->
nd
)
{
if
(
self
->
dev_structure
)
{
if
(
device_free
(
self
->
dev_structure
))
{
return
-
1
;
}
self
->
dev_structure
=
NULL
;
}
if
(
self
->
host_structure
)
{
free
(
self
->
host_structure
);
self
->
host_structure
=
NULL
;
self
->
nd
=
-
1
;
}
if
(
nd
==
-
1
)
return
0
;
self
->
host_structure
=
(
int
*
)
malloc
(
cnda_structure_size
(
nd
)
*
sizeof
(
int
));
if
(
NULL
==
self
->
host_structure
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Failed to allocate dim or str"
);
return
-
1
;
}
//initialize all dimensions and strides to 0
for
(
int
i
=
0
;
i
<
cnda_structure_size
(
nd
);
++
i
)
{
self
->
host_structure
[
i
]
=
0
;
}
int
struct_size
=
cnda_structure_size
(
nd
);
if
(
struct_size
)
{
self
->
dev_structure
=
(
int
*
)
device_malloc
(
struct_size
*
sizeof
(
int
));
if
(
NULL
==
self
->
dev_structure
)
{
free
(
self
->
host_structure
);
self
->
host_structure
=
NULL
;
self
->
dev_structure
=
NULL
;
return
-
1
;
}
}
self
->
nd
=
nd
;
self
->
dev_structure_fresh
=
0
;
}
return
0
;
}
int
CudaNdarray_set_device_data
(
CudaNdarray
*
self
,
float
*
data
,
const
CudaNdarray
*
base
)
int
CudaNdarray_set_device_data
(
CudaNdarray
*
self
,
float
*
data
,
const
CudaNdarray
*
base
)
{
{
return
CudaNdarray_set_device_data
(
self
,
data
,
(
PyObject
*
)
base
);
return
CudaNdarray_set_device_data
(
self
,
data
,
(
PyObject
*
)
base
);
...
...
theano/sandbox/cuda/cuda_ndarray.cuh
浏览文件 @
18708214
...
@@ -82,8 +82,9 @@ struct CudaNdarray
...
@@ -82,8 +82,9 @@ struct CudaNdarray
//device pointers (allocated by cudaMalloc)
//device pointers (allocated by cudaMalloc)
mutable
int
dev_structure_fresh
;
mutable
int
dev_structure_fresh
;
//dev_structure should be accessed via macros, otherwise may not be synchronized
//dev_structure should be accessed via macros, otherwise may not be
int
*
dev_structure
;
//dim0, dim1, ..., stride0, stride1, ...
//synchronized. The macro will allocate it when needed.
mutable
int
*
dev_structure
;
//dim0, dim1, ..., stride0, stride1, ...
real
*
devdata
;
//pointer to data element [0,..,0].
real
*
devdata
;
//pointer to data element [0,..,0].
};
};
...
@@ -126,8 +127,12 @@ CudaNdarray_HOST_STRIDES(const CudaNdarray * self);
...
@@ -126,8 +127,12 @@ CudaNdarray_HOST_STRIDES(const CudaNdarray * self);
DllExport
const
int
*
DllExport
const
int
*
CudaNdarray_HOST_LOG2DIMS
(
const
CudaNdarray
*
self
);
CudaNdarray_HOST_LOG2DIMS
(
const
CudaNdarray
*
self
);
DllExport
void
DllExport
inline
void
__attribute__
((
always_inline
))
cnda_mark_dev_structure_dirty
(
CudaNdarray
*
self
);
cnda_mark_dev_structure_dirty
(
CudaNdarray
*
self
)
{
self
->
dev_structure_fresh
=
0
;
}
DllExport
int
DllExport
int
CudaNdarray_EqualAndIgnore
(
CudaNdarray
*
cnda1
,
CudaNdarray
*
cnda2
,
int
ignoreSync
,
int
ignoreBase
);
CudaNdarray_EqualAndIgnore
(
CudaNdarray
*
cnda1
,
CudaNdarray
*
cnda2
,
int
ignoreSync
,
int
ignoreBase
);
...
@@ -143,11 +148,38 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
...
@@ -143,11 +148,38 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
*
*
* Does not sync structure to host.
* Does not sync structure to host.
*/
*/
DllExport
void
DllExport
inline
void
__attribute__
((
always_inline
))
CudaNdarray_set_dim
(
CudaNdarray
*
self
,
int
idx
,
int
d
);
CudaNdarray_set_dim
(
CudaNdarray
*
self
,
int
idx
,
int
d
)
{
if
((
idx
>=
self
->
nd
)
||
(
idx
<
0
)
||
(
d
<
0
))
{
fprintf
(
stderr
,
"WARNING: probably bad CudaNdarray_set_dim arguments: %i %i
\n
"
,
idx
,
d
);
}
if
(
d
!=
self
->
host_structure
[
idx
])
{
self
->
host_structure
[
idx
]
=
d
;
int
log2d
=
(
int
)
log2
((
double
)
d
);
self
->
host_structure
[
idx
+
2
*
self
->
nd
]
=
(
d
==
(
1
<<
log2d
))
?
log2d
:
-
1
;
cnda_mark_dev_structure_dirty
(
self
);
}
}
DllExport
void
DllExport
inline
void
__attribute__
((
always_inline
))
CudaNdarray_set_stride
(
CudaNdarray
*
self
,
int
idx
,
int
s
);
CudaNdarray_set_stride
(
CudaNdarray
*
self
,
int
idx
,
int
s
)
{
if
((
idx
>=
self
->
nd
)
||
(
idx
<
0
))
{
fprintf
(
stderr
,
"WARNING: probably bad CudaNdarray_set_stride arguments: %i %i
\n
"
,
idx
,
s
);
}
if
(
s
!=
CudaNdarray_HOST_STRIDES
(
self
)[
idx
])
{
self
->
host_structure
[
idx
+
self
->
nd
]
=
s
;
cnda_mark_dev_structure_dirty
(
self
);
}
}
/***
/***
* Update dependent variables from the contents of CudaNdarray_HOST_DIMS(self) and CudaNdarray_HOST_STRIDES(self)
* Update dependent variables from the contents of CudaNdarray_HOST_DIMS(self) and CudaNdarray_HOST_STRIDES(self)
...
@@ -188,7 +220,46 @@ DllExport PyObject * CudaNdarray_new_nd(const int nd);
...
@@ -188,7 +220,46 @@ DllExport PyObject * CudaNdarray_new_nd(const int nd);
*
*
* Note: This does not allocate storage for data.
* Note: This does not allocate storage for data.
*/
*/
DllExport
int
CudaNdarray_set_nd
(
CudaNdarray
*
self
,
const
int
nd
);
DllExport
inline
int
__attribute__
((
always_inline
))
CudaNdarray_set_nd
(
CudaNdarray
*
self
,
const
int
nd
)
{
if
(
nd
!=
self
->
nd
)
{
if
(
self
->
dev_structure
)
{
if
(
device_free
(
self
->
dev_structure
))
{
return
-
1
;
}
self
->
dev_structure
=
NULL
;
}
if
(
self
->
host_structure
)
{
free
(
self
->
host_structure
);
self
->
host_structure
=
NULL
;
self
->
nd
=
-
1
;
}
if
(
nd
==
-
1
)
return
0
;
self
->
host_structure
=
(
int
*
)
malloc
(
cnda_structure_size
(
nd
)
*
sizeof
(
int
));
if
(
NULL
==
self
->
host_structure
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Failed to allocate dim or str"
);
return
-
1
;
}
//initialize all dimensions and strides to 0
for
(
int
i
=
0
;
i
<
cnda_structure_size
(
nd
);
++
i
)
{
self
->
host_structure
[
i
]
=
0
;
}
//The device structure will be created in cnda_copy_structure_to_device
//if needed.
self
->
nd
=
nd
;
self
->
dev_structure_fresh
=
0
;
}
return
0
;
}
/**
/**
* CudaNdarray_alloc_contiguous
* CudaNdarray_alloc_contiguous
...
@@ -218,7 +289,7 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
...
@@ -218,7 +289,7 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
size
=
size
*
dim
[
i
];
size
=
size
*
dim
[
i
];
}
}
if
(
CudaNdarray_is_c_contiguous
(
self
)
&&
(
self
->
data_allocated
==
size
))
if
(
(
self
->
data_allocated
==
size
)
&&
CudaNdarray_is_c_contiguous
(
self
))
{
{
return
0
;
return
0
;
}
}
...
@@ -333,7 +404,24 @@ CudaNdarray_ZEROS(int n, int * dims);
...
@@ -333,7 +404,24 @@ CudaNdarray_ZEROS(int n, int * dims);
/**
/**
* True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
* True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
*/
*/
DllExport
bool
CudaNdarray_is_c_contiguous
(
const
CudaNdarray
*
self
);
DllExport
inline
bool
__attribute__
((
always_inline
))
CudaNdarray_is_c_contiguous
(
const
CudaNdarray
*
self
)
{
bool
c_contiguous
=
true
;
int
size
=
1
;
for
(
int
i
=
self
->
nd
-
1
;
(
i
>=
0
)
&&
c_contiguous
;
--
i
)
{
if
(
CudaNdarray_HOST_DIMS
(
self
)[
i
]
==
1
)
continue
;
if
(
CudaNdarray_HOST_STRIDES
(
self
)[
i
]
!=
size
)
{
c_contiguous
=
false
;
}
size
=
size
*
CudaNdarray_HOST_DIMS
(
self
)[
i
];
}
return
c_contiguous
;
}
DllExport
PyObject
*
CudaNdarray_IS_C_Contiguous
(
CudaNdarray
*
self
);
DllExport
PyObject
*
CudaNdarray_IS_C_Contiguous
(
CudaNdarray
*
self
);
DllExport
int
CudaNdarray_gemm
(
float
alpha
,
const
CudaNdarray
*
A
,
const
CudaNdarray
*
B
,
float
beta
,
CudaNdarray
*
C
);
DllExport
int
CudaNdarray_gemm
(
float
alpha
,
const
CudaNdarray
*
A
,
const
CudaNdarray
*
B
,
float
beta
,
CudaNdarray
*
C
);
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
18708214
...
@@ -33,6 +33,7 @@ from theano.sandbox.cuda.nnet import (
...
@@ -33,6 +33,7 @@ from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmax1HotWithBiasDx
,
GpuCrossentropySoftmax1HotWithBiasDx
,
GpuSoftmax
,
GpuSoftmaxWithBias
)
GpuSoftmax
,
GpuSoftmaxWithBias
)
from
theano.sandbox.cuda.elemwise
import
SupportCodeError
from
theano.sandbox.cuda.elemwise
import
SupportCodeError
from
theano.sandbox.cuda.var
import
CudaNdarrayConstant
from
theano.scan_module
import
scan_utils
,
scan_op
from
theano.scan_module
import
scan_utils
,
scan_op
from
theano.tensor.blas
import
_is_real_vector
,
_is_real_matrix
from
theano.tensor.blas
import
_is_real_vector
,
_is_real_matrix
...
@@ -1337,6 +1338,19 @@ def local_gpualloc(node):
...
@@ -1337,6 +1338,19 @@ def local_gpualloc(node):
return
[
new_out
]
return
[
new_out
]
@register_opt
()
@local_optimizer
([
tensor
.
Alloc
])
def
local_gpualloc_memset_0
(
node
):
replace
=
False
if
isinstance
(
node
.
op
,
GpuAlloc
)
and
not
node
.
op
.
memset_0
:
inp
=
node
.
inputs
[
0
]
if
(
isinstance
(
inp
,
CudaNdarrayConstant
)
and
inp
.
data
.
size
==
1
and
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
new_out
=
GpuAlloc
(
memset_0
=
True
)(
*
node
.
inputs
)
return
[
new_out
]
def
safe_to_gpu
(
x
):
def
safe_to_gpu
(
x
):
if
(
isinstance
(
x
.
type
,
tensor
.
TensorType
)
and
if
(
isinstance
(
x
.
type
,
tensor
.
TensorType
)
and
x
.
type
.
dtype
==
'float32'
):
x
.
type
.
dtype
==
'float32'
):
...
...
theano/sandbox/cuda/tests/test_basic_ops.py
浏览文件 @
18708214
...
@@ -775,11 +775,11 @@ def test_gpujoin_gpualloc():
...
@@ -775,11 +775,11 @@ def test_gpujoin_gpualloc():
assert
sum
([
node
.
op
==
T
.
alloc
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
==
2
assert
sum
([
node
.
op
==
T
.
alloc
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
==
2
assert
sum
([
node
.
op
==
T
.
join
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
==
1
assert
sum
([
node
.
op
==
T
.
join
for
node
in
f
.
maker
.
fgraph
.
toposort
()])
==
1
assert
sum
([
node
.
op
==
B
.
gpu_alloc
assert
sum
([
isinstance
(
node
.
op
,
B
.
GpuAlloc
)
for
node
in
f_gpu
.
maker
.
fgraph
.
toposort
()])
==
2
for
node
in
f_gpu
.
maker
.
fgraph
.
toposort
()])
==
2
assert
sum
([
node
.
op
==
B
.
gpu_join
assert
sum
([
node
.
op
==
B
.
gpu_join
for
node
in
f_gpu
.
maker
.
fgraph
.
toposort
()])
==
1
for
node
in
f_gpu
.
maker
.
fgraph
.
toposort
()])
==
1
assert
sum
([
node
.
op
==
B
.
gpu_alloc
assert
sum
([
isinstance
(
node
.
op
,
B
.
GpuAlloc
)
for
node
in
f_gpu2
.
maker
.
fgraph
.
toposort
()])
==
2
for
node
in
f_gpu2
.
maker
.
fgraph
.
toposort
()])
==
2
assert
sum
([
node
.
op
==
B
.
gpu_join
assert
sum
([
node
.
op
==
B
.
gpu_join
for
node
in
f_gpu2
.
maker
.
fgraph
.
toposort
()])
==
1
for
node
in
f_gpu2
.
maker
.
fgraph
.
toposort
()])
==
1
...
...
theano/sandbox/cuda/tests/test_memory.py
浏览文件 @
18708214
...
@@ -62,7 +62,7 @@ def test_memory():
...
@@ -62,7 +62,7 @@ def test_memory():
# When dtype is float64, only the shared is on the gpu and it is transferd
# When dtype is float64, only the shared is on the gpu and it is transferd
# to the cpu for computation. So no extra alloc after compilation.
# to the cpu for computation. So no extra alloc after compilation.
# more_alloc1 if after the first compilation, more_alloc2 after the second.
# more_alloc1 if after the first compilation, more_alloc2 after the second.
for
dtype
,
more_alloc1
,
more_alloc2
in
[(
"float32"
,
2
,
9
),
for
dtype
,
more_alloc1
,
more_alloc2
in
[(
"float32"
,
1
,
4
),
(
"float64"
,
0
,
0
)]:
(
"float64"
,
0
,
0
)]:
print
dtype
print
dtype
test_params
=
np
.
asarray
(
np
.
random
.
randn
(
np
.
prod
(
shapes
)),
dtype
)
test_params
=
np
.
asarray
(
np
.
random
.
randn
(
np
.
prod
(
shapes
)),
dtype
)
...
@@ -129,7 +129,7 @@ def test_memory_lazy():
...
@@ -129,7 +129,7 @@ def test_memory_lazy():
# When dtype is float64, only the shared is on the gpu and it is transferd
# When dtype is float64, only the shared is on the gpu and it is transferd
# to the cpu for computation. So no extra alloc after compilation.
# to the cpu for computation. So no extra alloc after compilation.
# more_alloc1 if after the first compilation, more_alloc2 after the second.
# more_alloc1 if after the first compilation, more_alloc2 after the second.
for
dtype
,
more_alloc1
in
[(
"float32"
,
3
),
for
dtype
,
more_alloc1
in
[(
"float32"
,
2
),
(
"float64"
,
0
)]:
(
"float64"
,
0
)]:
print
dtype
print
dtype
test_params
=
np
.
asarray
(
np
.
random
.
randn
(
np
.
prod
(
shapes
)),
dtype
)
test_params
=
np
.
asarray
(
np
.
random
.
randn
(
np
.
prod
(
shapes
)),
dtype
)
...
...
theano/sandbox/cuda/tests/test_opt.py
浏览文件 @
18708214
...
@@ -70,6 +70,39 @@ def test_gpualloc():
...
@@ -70,6 +70,39 @@ def test_gpualloc():
assert
numpy
.
any
(
ininstance
(
x
.
op
,
cuda
.
GpuAlloc
)
for
x
in
l
)
assert
numpy
.
any
(
ininstance
(
x
.
op
,
cuda
.
GpuAlloc
)
for
x
in
l
)
def
test_alloc_memset_0
():
i
=
tensor
.
iscalar
()
z
=
numpy
.
zeros
((
1
,),
dtype
=
'float32'
)
o
=
numpy
.
ones
((
1
,),
dtype
=
'float32'
)
ones
=
numpy
.
ones
((
2
,),
dtype
=
'float32'
)
# Test with 0
a
=
basic_ops
.
gpu_alloc
(
cuda
.
gpu_from_host
(
tensor
.
constant
(
z
)),
i
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
1
assert
isinstance
(
topo
[
0
]
.
op
,
basic_ops
.
GpuAlloc
)
and
topo
[
0
]
.
op
.
memset_0
assert
(
numpy
.
asarray
(
f
(
6
))
==
0
)
.
all
()
# Test with 1
a
=
basic_ops
.
gpu_alloc
(
cuda
.
gpu_from_host
(
tensor
.
constant
(
o
)),
i
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
1
assert
isinstance
(
topo
[
0
]
.
op
,
basic_ops
.
GpuAlloc
)
assert
not
topo
[
0
]
.
op
.
memset_0
assert
(
numpy
.
asarray
(
f
(
6
))
==
1
)
.
all
()
# Test with 1, 1
a
=
basic_ops
.
gpu_alloc
(
cuda
.
gpu_from_host
(
tensor
.
constant
(
ones
)),
i
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
1
assert
isinstance
(
topo
[
0
]
.
op
,
basic_ops
.
GpuAlloc
)
assert
not
topo
[
0
]
.
op
.
memset_0
assert
(
numpy
.
asarray
(
f
(
2
))
==
1
)
.
all
()
def
test_gpuspecifyshape
():
def
test_gpuspecifyshape
():
x
=
cuda
.
shared_constructor
(
numpy
.
ones
(
3
,
dtype
=
'float32'
),
'x'
)
x
=
cuda
.
shared_constructor
(
numpy
.
ones
(
3
,
dtype
=
'float32'
),
'x'
)
m
=
theano
.
tensor
.
specify_shape
(
x
+
numpy
.
float32
(
1
),
(
3
,))
m
=
theano
.
tensor
.
specify_shape
(
x
+
numpy
.
float32
(
1
),
(
3
,))
...
...
theano/tensor/tests/test_merge.py
浏览文件 @
18708214
...
@@ -62,8 +62,8 @@ def test_merge_with_weird_eq():
...
@@ -62,8 +62,8 @@ def test_merge_with_weird_eq():
g
=
Env
([
x
,
y
],
[
x
+
y
])
g
=
Env
([
x
,
y
],
[
x
+
y
])
MergeOptimizer
()
.
optimize
(
g
)
MergeOptimizer
()
.
optimize
(
g
)
assert
len
(
g
.
nodes
)
==
1
assert
len
(
g
.
apply_
nodes
)
==
1
node
=
list
(
g
.
nodes
)[
0
]
node
=
list
(
g
.
apply_
nodes
)[
0
]
assert
len
(
node
.
inputs
)
==
2
assert
len
(
node
.
inputs
)
==
2
assert
node
.
inputs
[
0
]
is
node
.
inputs
[
1
]
assert
node
.
inputs
[
0
]
is
node
.
inputs
[
1
]
...
@@ -74,8 +74,7 @@ def test_merge_with_weird_eq():
...
@@ -74,8 +74,7 @@ def test_merge_with_weird_eq():
g
=
Env
([
x
,
y
],
[
x
+
y
])
g
=
Env
([
x
,
y
],
[
x
+
y
])
MergeOptimizer
()
.
optimize
(
g
)
MergeOptimizer
()
.
optimize
(
g
)
assert
len
(
g
.
nodes
)
==
1
assert
len
(
g
.
apply_
nodes
)
==
1
node
=
list
(
g
.
nodes
)[
0
]
node
=
list
(
g
.
apply_
nodes
)[
0
]
assert
len
(
node
.
inputs
)
==
2
assert
len
(
node
.
inputs
)
==
2
assert
node
.
inputs
[
0
]
is
node
.
inputs
[
1
]
assert
node
.
inputs
[
0
]
is
node
.
inputs
[
1
]
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论