Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
44f9d0f7
提交
44f9d0f7
authored
9月 09, 2013
作者:
Frédéric Bastien
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1001 from abergeron/compyte
Support for a new type based on compyte in theano
上级
93a7a5e3
d935ba06
隐藏空白字符变更
内嵌
并排
正在显示
9 个修改的文件
包含
1470 行增加
和
9 行删除
+1470
-9
__init__.py
theano/__init__.py
+4
-0
configdefaults.py
theano/configdefaults.py
+30
-9
__init__.py
theano/sandbox/gpuarray/__init__.py
+49
-0
basic_ops.py
theano/sandbox/gpuarray/basic_ops.py
+501
-0
elemwise.py
theano/sandbox/gpuarray/elemwise.py
+133
-0
opt.py
theano/sandbox/gpuarray/opt.py
+144
-0
__init__.py
theano/sandbox/gpuarray/tests/__init__.py
+0
-0
test_basic_ops.py
theano/sandbox/gpuarray/tests/test_basic_ops.py
+322
-0
type.py
theano/sandbox/gpuarray/type.py
+287
-0
没有找到文件。
theano/__init__.py
浏览文件 @
44f9d0f7
...
...
@@ -91,6 +91,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano
.
sandbox
.
cuda
.
tests
.
test_driver
.
test_nvidia_driver1
()
if
config
.
device
.
startswith
(
'cuda'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
\
config
.
gpuarray
.
init_device
!=
''
:
import
theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
import
numpy
...
...
theano/configdefaults.py
浏览文件 @
44f9d0f7
...
...
@@ -2,9 +2,8 @@ import os
import
logging
import
subprocess
from
theano.configparser
import
(
AddConfigVar
,
BoolParam
,
ConfigParam
,
EnumStr
,
IntParam
,
TheanoConfigParser
)
from
theano.configparser
import
(
AddConfigVar
,
BoolParam
,
ConfigParam
,
EnumStr
,
IntParam
,
StrParam
,
TheanoConfigParser
)
from
theano.misc.cpucount
import
cpuCount
from
theano.misc.windows
import
call_subprocess_Popen
...
...
@@ -44,20 +43,42 @@ AddConfigVar('int_division',
# gpu means let the driver select the gpu. Needed in case of gpu in
# exclusive mode.
# gpuX mean use the gpu number X.
class DeviceParam(ConfigParam):
    """Configuration parameter accepting Theano device strings.

    A value is valid when it starts with one of ``cpu``, ``gpu``,
    ``opencl`` or ``cuda`` (optionally followed by a device number,
    e.g. ``gpu0``).
    """

    def __init__(self, default, *options, **kwargs):
        self.default = default

        def _check(val):
            # Accept any value whose prefix names a known device family.
            for prefix in ('cpu', 'gpu', 'opencl', 'cuda'):
                if val.startswith(prefix):
                    return val
            raise ValueError(('Invalid value ("%s") for configuration '
                              'variable "%s". Valid options start with '
                              'one of "cpu", "gpu", "opencl", "cuda"'
                              % (val, self.fullname)))

        allow_override = kwargs.get("allow_override", True)
        super(DeviceParam, self).__init__(default, _check, allow_override)

    def __str__(self):
        # NOTE(review): trailing space reproduced from the original string.
        return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,)
AddConfigVar
(
'device'
,
(
"Default device for computations. If gpu*, change the default to try "
"to move computation to it and to put shared variable of float32 "
"on it. Do not use upper case letters, only lower case even if "
"NVIDIA use capital letters."
),
EnumStr
(
'cpu'
,
'gpu'
,
'gpu0'
,
'gpu1'
,
'gpu2'
,
'gpu3'
,
'gpu4'
,
'gpu5'
,
'gpu6'
,
'gpu7'
,
'gpu8'
,
'gpu9'
,
'gpu10'
,
'gpu11'
,
'gpu12'
,
'gpu13'
,
'gpu14'
,
'gpu15'
,
allow_override
=
False
),
DeviceParam
(
'cpu'
,
allow_override
=
False
),
in_c_key
=
False
,
)
AddConfigVar
(
'gpuarray.init_device'
,
"""
Device to initialize for gpuarray use without moving
computations automatically.
"""
,
StrParam
(
''
),
in_c_key
=
False
)
AddConfigVar
(
'init_gpu_device'
,
(
"Initialize the gpu device to use, works only if device=cpu. "
"Unlike 'device', setting this option will NOT move computations, "
...
...
theano/sandbox/gpuarray/__init__.py
0 → 100644
浏览文件 @
44f9d0f7
import
logging
import
theano
from
theano.configparser
import
config
from
theano.compile
import
optdb
_logger_name
=
'theano.sandbox.gpuarray'
_logger
=
logging
.
getLogger
(
_logger_name
)
_logger
.
setLevel
(
logging
.
WARNING
)
error
=
_logger
.
error
info
=
_logger
.
info
pygpu_activated
=
False
try
:
import
pygpu
import
pygpu.gpuarray
except
ImportError
:
pygpu
=
None
# This is for documentation not to depend on the availability of pygpu
from
type
import
(
GpuArrayType
,
GpuArrayVariable
,
GpuArrayConstant
,
GpuArraySharedVariable
,
gpuarray_shared_constructor
)
import
opt
def init_dev(dev):
    """Initialize pygpu on device *dev* and make its context the default.

    Sets the module-level ``pygpu_activated`` flag on success so callers
    can tell whether GPU support is live.
    """
    global pygpu_activated
    ctx = pygpu.init(dev)
    pygpu.set_default_context(ctx)
    pygpu_activated = True
if
pygpu
:
try
:
if
(
config
.
device
.
startswith
(
'cuda'
)
or
config
.
device
.
startswith
(
'opencl'
)):
init_dev
(
config
.
device
)
import
theano.compile
theano
.
compile
.
shared_constructor
(
gpuarray_shared_constructor
)
optdb
.
add_tags
(
'gpuarray_opt'
,
'fast_run'
,
'inplace'
)
elif
config
.
gpuarray
.
init_device
!=
''
:
init_dev
(
config
.
gpuarray
.
init_device
)
except
Exception
:
error
(
"Could not initialize pygpu, support disabled"
,
exc_info
=
True
)
else
:
if
(
config
.
gpuarray
.
init_device
!=
''
or
config
.
device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'cuda'
)):
error
(
"pygpu was configured but could not be imported"
,
exc_info
=
True
)
theano/sandbox/gpuarray/basic_ops.py
0 → 100644
浏览文件 @
44f9d0f7
import
os
import
numpy
import
theano
from
theano
import
Op
,
Type
,
Apply
,
Variable
,
Constant
from
theano
import
tensor
,
scalar
,
config
from
theano.scalar
import
Scalar
from
theano.gof.python25
import
all
,
any
try
:
import
pygpu
from
pygpu
import
gpuarray
,
elemwise
except
ImportError
:
pass
from
type
import
GpuArrayType
def
as_gpuarray_variable
(
x
):
if
hasattr
(
x
,
'_as_GpuArrayVariable'
):
return
x
.
_as_GpuArrayVariable
()
# TODO we need to have the cuda -> gpu path taken care of.
tensor_x
=
tensor
.
as_tensor_variable
(
x
)
return
gpu_from_host
(
tensor_x
)
def
as_gpuarray
(
x
):
return
gpuarray
.
array
(
x
,
copy
=
False
)
class
HostFromGpu
(
Op
):
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
def
__hash__
(
self
):
return
hash
(
type
(
self
))
def
__str__
(
self
):
return
'HostFromGpu(gpuarray)'
def
make_node
(
self
,
x
):
if
not
isinstance
(
x
.
type
,
GpuArrayType
):
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
[
tensor
.
TensorType
(
dtype
=
x
.
dtype
,
broadcastable
=
x
.
broadcastable
,)()])
def
perform
(
self
,
node
,
inp
,
out
):
x
,
=
inp
z
,
=
out
z
[
0
]
=
numpy
.
asarray
(
x
)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
GpuArray
%(name)
s_ga_s;
GpuArray *
%(name)
s_ga = NULL;
int
%(name)
serr;
PyArray_Descr *
%(name)
s_dtype;
if (!GpuArray_ISONESEGMENT(&
%(inp)
s->ga)) {
if (GpuArray_copy(&
%(name)
s_ga_s, &
%(inp)
s->ga, GA_C_ORDER) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Can't make contiguous copy");
%(fail)
s;
}
%(name)
s_ga = &
%(name)
s_ga_s;
} else {
%(name)
s_ga = &
%(inp)
s->ga;
}
%(name)
s_dtype = typecode_to_dtype(
%(inp)
s->ga.typecode);
Py_XDECREF(
%(out)
s);
// PyArray_Empty below steals a reference to the dtype we pass it
// so we need an extra one to spare.
Py_INCREF(
%(name)
s_dtype);
%(out)
s = (PyArrayObject *)PyArray_Empty(
%(inp)
s->ga.nd,
(npy_intp *)
%(inp)
s->ga.dimensions,
%(name)
s_dtype,
(
%(inp)
s->ga.flags & GA_F_CONTIGUOUS) &&
!(
%(inp)
s->ga.flags & GA_C_CONTIGUOUS));
if (
%(out)
s == NULL) {
if (
%(name)
s_ga == &
%(name)
s_ga_s) GpuArray_clear(
%(name)
s_ga);
%(fail)
s
}
%(name)
serr = GpuArray_read(PyArray_DATA(
%(out)
s),
PyArray_NBYTES(
%(out)
s),
%(name)
s_ga);
if (
%(name)
s_ga == &
%(name)
s_ga_s) GpuArray_clear(
%(name)
s_ga);
if (
%(name)
serr != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
%(fail)
s
}
"""
%
{
'name'
:
name
,
'fail'
:
sub
[
'fail'
],
'inp'
:
inputs
[
0
],
'out'
:
outputs
[
0
]}
def
c_code_cache_version
(
self
):
return
(
1
,)
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
return
[
gpu_from_host
(
gz
)]
def
R_op
(
self
,
inputs
,
eval_points
):
ev
,
=
eval_points
if
isinstance
(
ev
,
tensor
.
TensorType
):
return
[
gpu_from_host
(
ev
)]
else
:
return
[
ev
]
def
infer_shape
(
self
,
node
,
xshp
):
return
xshp
host_from_gpu
=
HostFromGpu
()
class
GpuFromHost
(
Op
):
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
def
__hash__
(
self
):
return
hash
(
type
(
self
))
def
__str__
(
self
):
return
'GpuFromHost(gpuarray)'
def
make_node
(
self
,
x
):
if
not
isinstance
(
x
.
type
,
tensor
.
TensorType
):
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
broadcastable
=
x
.
broadcastable
,
dtype
=
x
.
dtype
)()])
def
perform
(
self
,
node
,
inp
,
out
):
x
,
=
inp
z
,
=
out
type
=
node
.
outputs
[
0
]
.
type
z
[
0
]
=
gpuarray
.
array
(
x
)
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
return
[
host_from_gpu
(
as_gpuarray_variable
(
gz
))]
def
R_op
(
self
,
inputs
,
eval_points
):
ev
,
=
eval_points
if
isintance
(
ev
,
GpuArrayType
):
return
[
host_from_gpu
(
ev
)]
else
:
return
ev
def
infer_shape
(
self
,
node
,
xshp
):
return
xshp
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
PyArrayObject *
%(name)
s_tmp;
int
%(name)
serr;
%(name)
s_tmp = PyArray_GETCONTIGUOUS(
%(inp)
s);
if (
%(name)
s_tmp == NULL) {
// PyArray_GETCONTIGUOUS sets an error message if it fails
%(fail)
s
}
Py_XDECREF(
%(out)
s);
%(out)
s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
if (
%(out)
s == NULL) {
Py_DECREF(
%(name)
s_tmp);
// new_GpuArray calls __new__ which will set an error message
// if it returns NULL.
%(fail)
s
}
%(name)
serr = GpuArray_empty(&
%(out)
s->ga,
GpuArray_default_context()->ops,
GpuArray_default_context()->ctx,
get_typecode((PyObject *)PyArray_DESCR(
%(name)
s_tmp)),
PyArray_NDIM(
%(inp)
s),
(size_t *)PyArray_DIMS(
%(inp)
s),
GA_C_ORDER);
if (
%(name)
serr != GA_NO_ERROR) {
Py_DECREF(
%(name)
s_tmp);
Py_DECREF(
%(out)
s);
%(out)
s = NULL;
PyErr_SetString(PyExc_MemoryError, "Can't allocate device memory for result.");
%(fail)
s
}
%(name)
serr = GpuArray_write(&
%(out)
s->ga, PyArray_DATA(
%(name)
s_tmp),
PyArray_NBYTES(
%(name)
s_tmp));
Py_DECREF(
%(name)
s_tmp);
if (
%(name)
serr != GA_NO_ERROR) {
Py_DECREF(
%(out)
s);
PyErr_SetString(PyExc_RuntimeError, "Could not copy array data to device");
%(fail)
s
}
"""
%
{
'name'
:
name
,
'inp'
:
inputs
[
0
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
return
(
1
,)
gpu_from_host
=
GpuFromHost
()
class
GpuFromCuda
(
Op
):
view_map
=
{
0
:
[
0
]}
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
def
__hash__
(
self
):
return
hash
(
type
(
self
))
def
__str__
(
self
):
return
'GpuFromCuda'
def
make_node
(
self
,
x
):
from
theano.sandbox.cuda
import
CudaNdarrayType
if
not
isinstance
(
x
.
type
,
CudaNdarrayType
):
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
broadcastable
=
x
.
broadcastable
,
dtype
=
x
.
dtype
)()])
def
perform
(
self
,
node
,
inp
,
out
):
x
,
=
inp
z
,
=
out
z
[
0
]
=
gpuarray
.
array
(
numpy
.
asarray
(
x
))
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
return
[
cuda_from_gpu
(
gz
)]
def
R_op
(
self
,
inputs
,
eval_points
):
ev
,
=
eval_points
if
isintance
(
ev
,
GpuArrayType
):
return
[
cuda_from_gpu
(
ev
)]
else
:
return
ev
def
infer_shape
(
self
,
node
,
xshp
):
return
xshp
def
c_headers
(
self
):
return
[
'<cuda_ndarray.cuh>'
,
'<compyte/extension.h>'
,
'<compyte/types.h>'
,
'<cuda.h>'
]
def
c_header_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'include'
))
return
ret
def
c_lib_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'lib'
))
return
ret
def
c_libraries
(
self
):
return
[
'cudart'
,
'cublas'
,
'cuda'
]
def
c_support_code
(
self
):
return
"""
CUcontext (*cuda_get_ctx)(void *ctx);
gpudata *(*cuda_make_buf)(void *c, CUdeviceptr p, size_t sz);
"""
def
c_init_code
(
self
):
return
[
'cuda_get_ctx = (CUcontext (*)(void *))compyte_get_extension("cuda_get_ctx");'
,
'cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))compyte_get_extension("cuda_make_buf");'
]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
int
%(name)
serr;
gpudata *
%(name)
sdata;
CUcontext
%(name)
scur;
size_t *
%(name)
sdims;
ssize_t *
%(name)
sstr;
cuCtxGetCurrent(&
%(name)
scur);
if (
%(name)
scur != cuda_get_ctx(GpuArray_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)
s
}
%(name)
sdims = (size_t *)calloc(
%(in)
s->nd, sizeof(size_t));
if (
%(name)
sdims == NULL) {
PyErr_SetString(PyExc_MemoryError, "Can't allocate dimensions.");
%(fail)
s
}
%(name)
sstr = (ssize_t *)calloc(
%(in)
s->nd, sizeof(ssize_t));
if (
%(name)
sstr == NULL) {
free(
%(name)
sdims);
PyErr_SetString(PyExc_MemoryError, "Can't allocate strides.");
%(fail)
s
}
for (unsigned int i = 0; i <
%(in)
s->nd; i++) {
%(name)
sdims[i] = (size_t)CudaNdarray_HOST_DIMS(
%(in)
s)[i];
%(name)
sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(
%(in)
s)[i]*4;
}
Py_XDECREF(
%(out)
s);
%(out)
s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
if (
%(out)
s == NULL) {
free(
%(name)
sdims);
free(
%(name)
sstr);
%(fail)
s
}
%(name)
sdata = cuda_make_buf(GpuArray_default_context()->ctx,
(CUdeviceptr)
%(in)
s->devdata,
((size_t)
%(in)
s->data_allocated)*4);
if (
%(name)
sdata == NULL) {
Py_DECREF(
%(out)
s);
free(
%(name)
sdims);
free(
%(name)
sstr);
PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
%(fail)
s
}
%(name)
serr = GpuArray_fromdata(&
%(out)
s->ga,
GpuArray_default_context()->ops,
%(name)
sdata, 0, GA_FLOAT,
%(in)
s->nd,
%(name)
sdims,
%(name)
sstr, 1);
free(
%(name)
sdims);
free(
%(name)
sstr);
if (
%(name)
serr != GA_NO_ERROR) {
Py_DECREF(
%(out)
s);
PyErr_SetString(PyExc_MemoryError, "Could not allocate GpuArray structure.");
%(fail)
s
}
Py_INCREF(
%(in)
s);
%(out)
s->base = (PyObject *)
%(in)
s;
"""
%
{
'name'
:
name
,
'in'
:
inputs
[
0
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
return
(
1
,)
gpu_from_cuda
=
GpuFromCuda
()
class
CudaFromGpu
(
Op
):
view_map
=
{
0
:
[
0
]}
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
def
__hash__
(
self
):
return
hash
(
type
(
self
))
def
__str__
(
self
):
return
'CudaFromGpu'
def
make_node
(
self
,
x
):
from
theano.sandbox.cuda
import
CudaNdarrayType
if
not
isinstance
(
x
.
type
,
GpuArrayType
):
raise
TypeError
(
x
)
if
x
.
type
.
dtype
!=
'float32'
:
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
[
CudaNdarrayType
(
broadcastable
=
x
.
broadcastable
)()])
def
perform
(
self
,
node
,
inp
,
out
):
from
theano.sandbox.cuda
import
filter
as
cuda_filter
x
,
=
inp
z
,
=
out
z
[
0
]
=
cuda_filter
(
theano
.
_asarray
(
x
,
dtype
=
'float32'
),
tuple
([
0
]
*
x
.
ndim
),
0
,
z
[
0
])
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
return
[
gpu_from_cuda
(
gz
)]
def
R_op
(
self
,
inputs
,
eval_points
):
from
theano.sandbox.cuda
import
CudaNdArrayType
ev
,
=
eval_points
if
(
isinstance
(
ev
,
CudaNdarrayType
)):
return
[
gpu_from_cuda
(
ev
)]
else
:
return
[
ev
]
def
infer_shape
(
self
,
node
,
shp
):
return
shp
def
c_headers
(
self
):
return
[
'<cuda_ndarray.cuh>'
,
'<compyte/extension.h>'
,
'<cuda.h>'
]
def
c_header_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'include'
))
return
ret
def
c_lib_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'lib'
))
return
ret
def
c_libraries
(
self
):
return
[
'cudart'
,
'cublas'
,
'cuda'
]
def
c_support_code
(
self
):
return
"""
CUcontext (*cuda_get_ctx)(void *ctx);
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
"""
def
c_init_code
(
self
):
return
[
'cuda_get_ctx = (CUcontext (*)(void *ctx))compyte_get_extension("cuda_get_ctx");'
,
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'
]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
int
%(name)
serr = 0,
%(name)
si;
CUcontext
%(name)
scur;
cuCtxGetCurrent(&
%(name)
scur);
if (
%(name)
scur != cuda_get_ctx(GpuArray_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)
s
}
Py_XDECREF(
%(out)
s);
%(out)
s = (CudaNdarray *)CudaNdarray_new_nd(
%(inp)
s->ga.nd);
if (!
%(out)
s) {
%(fail)
s
}
for (
%(name)
si = 0;
%(name)
si <
%(inp)
s->ga.nd;
%(name)
si++) {
CudaNdarray_set_dim(
%(out)
s,
%(name)
si,
%(inp)
s->ga.dimensions[
%(name)
si]);
CudaNdarray_set_stride(
%(out)
s,
%(name)
si,
%(inp)
s->ga.strides[
%(name)
si]/4);
}
%(name)
serr = CudaNdarray_set_device_data(
%(out)
s,
(float *)(((char *)cuda_get_ptr(
%(inp)
s->ga.data))+
%(inp)
s->ga.offset),
(PyObject *)
%(inp)
s);
if (
%(name)
serr) {
%(fail)
s
}
"""
%
{
'name'
:
name
,
'inp'
:
inputs
[
0
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
return
(
1
,)
cuda_from_gpu
=
CudaFromGpu
()
class
GpuAlloc
(
Op
):
def
__str__
(
self
):
return
'GpuAlloc'
def
__hash__
(
self
):
return
hash
(
type
(
self
))
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
def
make_node
(
self
,
value
,
*
shape
):
v
=
as_gpuarray_variable
(
value
)
sh
=
[
tensor
.
as_tensor_variable
(
s
)
for
s
in
shape
]
bcast
=
[]
if
v
.
ndim
>
len
(
shape
):
raise
TypeError
(
'GpuAlloc value has more dimensions than arguments'
,
value
.
ndim
,
len
(
shape
))
for
i
,
s
in
enumerate
(
sh
):
if
s
.
type
.
dtype
[:
3
]
not
in
(
'int'
,
'uint'
):
raise
TypeError
(
'Shape arguments must be integers'
,
s
)
try
:
const_shp
=
tensor
.
get_scalar_constant_value
(
s
)
except
tensor
.
NotScalarConstantError
:
const_shp
=
None
bcast
.
append
(
numpy
.
all
(
1
==
const_shp
))
otype
=
GpuArrayType
(
dtype
=
v
.
dtype
,
broadcastable
=
bcast
)
return
Apply
(
self
,
[
v
]
+
sh
,
[
otype
()])
def
perform
(
self
,
node
,
inputs
,
outs
):
out
,
=
outs
v
=
inputs
[
0
]
sh
=
tuple
(
map
(
int
,
inputs
[
1
:]))
if
out
[
0
]
is
None
or
out
[
0
]
.
shape
!=
sh
:
out
[
0
]
=
gpuarray
.
empty
(
sh
,
dtype
=
v
.
dtype
)
out
[
0
][
...
]
=
v
def
infer_shape
(
self
,
node
,
input_shapes
):
return
[
node
.
inputs
[
1
:]]
def
grad
(
self
,
input
,
grads
):
return
[
None
for
i
in
inputs
]
def
do_constant_folding
(
self
,
node
):
if
not
getattr
(
node
.
ouputs
[
0
],
'clients'
,
[]):
return
False
for
client
in
node
.
outputs
[
0
]
.
clients
:
if
client
[
0
]
==
'output'
:
return
False
return
True
gpu_alloc
=
GpuAlloc
()
theano/sandbox/gpuarray/elemwise.py
0 → 100644
浏览文件 @
44f9d0f7
import
numpy
from
theano
import
Op
,
Apply
,
scalar
try
:
from
pygpu.tools
import
ScalarArg
,
ArrayArg
from
pygpu.elemwise
import
ElemwiseKernel
except
ImportError
:
pass
from
basic_ops
import
as_gpuarray_variable
from
type
import
GpuArrayType
from
theano.gof.utils
import
MethodNotDefined
def
_is_scalar
(
v
):
False
def make_argument(v, name):
    """Build a pygpu kernel-argument descriptor for variable *v*.

    Scalars map to ScalarArg, everything else to ArrayArg; the numpy
    dtype is derived from the variable's Theano dtype string.
    """
    np_dtype = numpy.dtype(v.type.dtype)
    arg_cls = ScalarArg if _is_scalar(v) else ArrayArg
    return arg_cls(np_dtype, name)
def ensure_out(o, ref):
    """Return *o*, or a fresh array shaped like *ref* when *o* is None."""
    return ref._empty_like_me() if o is None else o
class
GpuElemwise
(
Op
):
nin
=
property
(
lambda
self
:
self
.
scalar_op
.
nin
)
nout
=
property
(
lambda
self
:
self
.
scalar_op
.
nout
)
def
__init__
(
self
,
scalar_op
):
self
.
scalar_op
=
scalar_op
self
.
destroy_map
=
{}
def
__getstate__
(
self
):
d
=
copy
.
copy
(
self
.
__dict__
)
d
.
pop
(
'__epydoc_asRoutine'
,
None
)
d
.
pop
(
'_hashval'
)
return
d
def
__setstate__
(
self
,
d
):
self
.
__dict__
.
update
(
d
)
self
.
_rehash
()
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
and
self
.
scalar_op
==
other
.
scalar_op
)
def
__hash__
(
self
):
return
hash
(
type
(
self
))
^
hash
(
self
.
scalar_op
)
def
__str__
(
self
):
return
"GpuElemwise{
%
s}(gpuarray)"
%
(
self
.
scalar_op
,)
def
make_node
(
self
,
*
inputs
):
_inputs
=
[
as_gpuarray_variable
(
i
)
for
i
in
inputs
]
if
self
.
nin
>
0
and
len
(
_inputs
)
!=
self
.
nin
:
raise
TypeError
(
"Wrong argument count"
,
(
self
.
nin
,
len
(
_inputs
)))
for
i
in
_inputs
[
1
:]:
if
i
.
type
.
ndim
!=
inputs
[
0
]
.
type
.
ndim
:
raise
TypeError
(
'mismatched rank amongst inputs'
)
broadcastable
=
[]
for
d
in
xrange
(
_inputs
[
0
]
.
type
.
ndim
):
bcast_d
=
True
for
i
in
_inputs
:
if
not
i
.
type
.
broadcastable
[
d
]:
bcast_d
=
False
break
broadcastable
.
append
(
bcast_d
)
assert
len
(
broadcastable
)
==
_inputs
[
0
]
.
type
.
ndim
assert
self
.
nout
>
0
inps
=
[
make_argument
(
i
,
'i
%
d'
%
(
n
,))
for
n
,
i
in
enumerate
(
inputs
)]
scal_ins
=
[
scalar
.
Scalar
(
i
.
dtype
)
for
i
in
inputs
]
res
=
Apply
(
self
,
_inputs
,
[
GpuArrayType
(
o
.
dtype
,
broadcastable
)()
for
o
in
self
.
scalar_op
.
output_types
(
scal_ins
)])
outs
=
[
make_argument
(
o
,
'o
%
d'
%
(
n
,))
for
n
,
o
in
enumerate
(
res
.
outputs
)]
scal_out
=
[
scalar
.
Scalar
(
o
.
dtype
)
for
o
in
res
.
outputs
]
fake_node
=
Apply
(
self
.
scalar_op
,
[
i
()
for
i
in
scal_ins
],
[
o
()
for
o
in
scal_out
])
kcode
=
self
.
scalar_op
.
c_code
(
fake_node
,
'kcode'
,
[
i
.
expr
()
for
i
in
inps
],
[
o
.
expr
()
for
o
in
outs
],
sub
=
dict
(
fail
=
'return;'
))
res
.
tag
.
kcode
=
kcode
try
:
code
=
self
.
scalar_op
.
c_support_code_apply
(
fake_node
,
'kcode'
)
if
code
:
raise
SupportCodeError
()
except
MethodNotDefined
:
pass
support_code
=
""
try
:
support_code
+=
self
.
scalar_op
.
c_support_code
()
except
MethodNotDefined
:
pass
if
support_code
!=
"#define THEANO_MACRO_MOD(x,y) (x
%
y)"
:
# Avoid the C++ complex struct
raise
SupportCodeError
()
k
=
ElemwiseKernel
(
None
,
inps
+
outs
,
kcode
,
preamble
=
support_code
)
res
.
tag
.
kernel
=
k
return
res
def
perform
(
self
,
node
,
inps
,
out
):
k
=
node
.
tag
.
kernel
outs
=
[
ensure_out
(
o
[
0
],
inps
[
0
])
for
o
in
out
]
# the dict call is there to avoid syntax error in python <= 2.5
k
(
*
(
inps
+
outs
),
**
dict
(
broadcast
=
True
))
for
o
,
og
in
zip
(
out
,
outs
):
o
[
0
]
=
og
class
SupportCodeError
(
Exception
):
"""
We do not support certain things (such as the C++ complex struct)
"""
theano/sandbox/gpuarray/opt.py
0 → 100644
浏览文件 @
44f9d0f7
import
theano
,
numpy
from
theano
import
tensor
from
theano.compile
import
optdb
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
SequenceDB
,
ProxyDB
,
Optimizer
,
toolbox
,
DestroyHandler
,
InconsistencyError
,
EquilibriumOptimizer
)
from
theano.gof.python25
import
all
,
any
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
basic_ops
import
host_from_gpu
,
gpu_from_host
,
gpu_alloc
from
elemwise
import
GpuElemwise
,
_is_scalar
gpu_optimizer
=
EquilibriumDB
()
gpu_cut_copies
=
EquilibriumDB
()
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
.
register
(
'gpuarray_local_optimiziations'
,
gpu_optimizer
,
1
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
gpu_seqopt
.
register
(
'gpuarray_cut_transfers'
,
gpu_cut_copies
,
2
,
'fast_run'
,
'gpuarray'
)
# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb
.
register
(
'gpuarray_opt'
,
gpu_seqopt
,
optdb
.
__position__
.
get
(
'add_destroy_handler'
,
49.5
)
-
1
,
'gpuarray'
)
def
register_opt
(
*
tags
,
**
kwargs
):
def
f
(
local_opt
):
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
gpu_optimizer
.
register
(
name
,
local_opt
,
'fast_run'
,
'gpuarray'
,
*
tags
)
return
local_opt
return
f
register_opt
()(
theano
.
tensor
.
opt
.
local_track_shape_i
)
class
InputToGpuOptimizer
(
Optimizer
):
"Transfer the input to the gpu to start the rolling wave."
def
add_requirements
(
self
,
fgraph
):
fgraph
.
attach_feature
(
toolbox
.
ReplaceValidate
())
fgraph
.
attach_feature
(
DestroyHandler
())
def
apply
(
self
,
fgraph
):
for
input
in
fgraph
.
inputs
:
if
isinstance
(
input
.
type
,
GpuArrayType
):
continue
if
(
len
(
input
.
clients
)
==
1
and
(
input
.
clients
[
0
][
0
]
==
'output'
or
input
.
clients
[
0
][
0
]
.
op
==
gpu_from_host
)):
continue
try
:
new_input
=
host_from_gpu
(
gpu_from_host
(
input
))
fgraph
.
replace_validate
(
input
,
new_input
,
"InputToGpuOptimizer"
)
except
TypeError
,
e
:
# This could fail if the inputs are not TensorTypes
pass
gpu_seqopt
.
register
(
'InputToGpuArrayOptimizer'
,
InputToGpuOptimizer
(),
0
,
'fast_run'
,
'fast_compile'
,
'merge'
)
@local_optimizer
([])
def
local_cut_gpu_host_gpu
(
node
):
if
tensor
.
opt
.
opt
.
check_chain
(
node
,
gpu_from_host
,
host_from_gpu
):
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
if
tensor
.
opt
.
opt
.
check_chain
(
node
,
host_from_gpu
,
gpu_from_host
):
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
return
False
gpu_cut_copies
.
register
(
'cut_gpua_host_transfers'
,
local_cut_gpu_host_gpu
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
gpu_cut_copies
.
register
(
'cut_gpua_constant_transfers'
,
tensor
.
opt
.
constant_folding
,
'fast_run'
,
'gpuarray'
)
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpua_host_gpua'
,
local_cut_gpu_host_gpu
,
'fast_run'
,
'gpuarray'
)
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc(node):
    """Move a ``tensor.alloc`` to the GPU when its value or its clients
    already live there.

    Fix in this revision: the original called
    ``tensor.patternbroadcast(new_out.old_out.broadcastable)`` — an
    attribute-access typo (``new_out`` has no ``old_out``) and a
    single-argument call; ``patternbroadcast`` takes the variable and the
    broadcastable pattern separately.
    """
    replace = False
    if node.op == tensor.alloc:
        if node.inputs[0].owner and node.inputs[0].owner.op == host_from_gpu:
            # The value is already a transfer from the GPU.
            replace = True
        elif all([c != 'output' and c.op == gpu_from_host
                  for c, idx in node.outputs[0].clients]):
            # Every client sends the result straight back to the GPU.
            replace = True
        elif all([c != 'output' and c.op == tensor.join and
                  all([i.owner and
                       i.owner.op in [host_from_gpu, tensor.alloc]
                       for i in c.inputs[1:]])
                  for c, idx in node.outputs[0].clients]):
            # Every client is a join whose other inputs are GPU transfers
            # or further allocs.
            replace = True
    if replace:
        val = node.inputs[0]
        shp = node.inputs[1:]
        old_out = node.outputs[0]
        # NOTE(review): val2 was computed but unused in the original —
        # possibly intended as the value passed to gpu_alloc; kept as-is
        # to preserve behavior, confirm intent upstream.
        val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
        new_out = host_from_gpu(gpu_alloc(val, *shp))
        if new_out.type != old_out.type:
            assert new_out.type.ndim == old_out.type.ndim
            assert new_out.type.dtype == old_out.type.dtype
            # The replacement may only add broadcastable dimensions,
            # never lose one.
            for b_old, b_new in zip(old_out.type.broadcastable,
                                    new_out.type.broadcastable):
                assert b_new or (not b_old)
            new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
        return [new_out]
@register_opt
()
@local_optimizer
([])
def
local_gpu_elemwise
(
node
):
do_replace
=
False
gpu_out
=
False
# check for gpu_from_host(Elemwise)) and extract the Elemwise node
if
node
.
op
==
gpu_from_host
:
host_i
,
=
node
.
inputs
if
(
host_i
.
owner
and
isinstance
(
host_i
.
owner
.
op
,
tensor
.
Elemwise
)
and
len
(
host_i
.
clients
)
==
1
):
node
=
host_i
.
owner
do_replace
=
True
gpu_out
=
True
# check for elemwise(..., host_from_gpu, ...)
if
isinstance
(
node
.
op
,
tensor
.
Elemwise
):
if
numpy
.
any
([
i
.
owner
and
i
.
owner
.
op
==
host_from_gpu
for
i
in
node
.
inputs
]):
do_replace
=
True
if
numpy
.
all
([
_is_scalar
(
i
)
for
i
in
node
.
inputs
]):
do_replace
=
False
if
do_replace
:
new_op
=
GpuElemwise
(
node
.
op
.
scalar_op
)
gpu_elemwise
=
new_op
(
*
(
gpu_from_host
(
i
)
for
i
in
node
.
inputs
))
if
gpu_out
:
return
[
gpu_elemwise
]
else
:
return
[
host_from_gpu
(
gpu_elemwise
)]
else
:
return
False
theano/sandbox/gpuarray/tests/__init__.py
0 → 100644
浏览文件 @
44f9d0f7
theano/sandbox/gpuarray/tests/test_basic_ops.py
0 → 100644
浏览文件 @
44f9d0f7
import
unittest
from
itertools
import
izip
from
copy
import
copy
,
deepcopy
import
numpy
import
theano
import
theano.tensor
as
T
from
theano.compile
import
DeepCopyOp
from
theano.tensor.tests.test_basic
import
safe_make_node
from
theano.tests.unittest_tools
import
SkipTest
from
numpy.testing.noseclasses
import
KnownFailureTest
import
theano.sandbox.gpuarray
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
and
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
if
not
cuda_ndarray
.
use
.
device_number
:
cuda_ndarray
.
use
(
'gpu'
)
theano
.
sandbox
.
gpuarray
.
init_dev
(
'cuda'
)
if
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
raise
SkipTest
(
"pygpu disabled"
)
from
theano.sandbox.gpuarray.type
import
(
GpuArrayType
,
gpuarray_shared_constructor
)
from
theano.sandbox.gpuarray.basic_ops
import
(
host_from_gpu
,
gpu_from_host
,
gpu_alloc
,
gpu_from_cuda
,
cuda_from_gpu
)
from
theano.tests
import
unittest_tools
as
utt
utt
.
seed_rng
()
rng
=
numpy
.
random
.
RandomState
(
seed
=
utt
.
fetch_seed
())
from
pygpu
import
gpuarray
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpuarray'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpuarray'
\
)
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpuarray'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpuarray'
)
def may_fail(msg, EClass):
    """Decorator: re-raise exceptions of type *EClass* as KnownFailureTest.

    Marks a test that requires very specific conditions to work, so the
    expected failure mode is reported as known instead of as an error.
    Other exceptions propagate unchanged.

    Fix in this revision: the Python-2-only ``except Exception, e`` syntax
    is replaced by the ``as`` form (valid on Python 2.6+ and 3); catching
    ``EClass`` directly replaces the catch-everything-then-isinstance
    pattern with identical behavior.
    """
    def test_decorator(f):
        def wrapper():
            try:
                f()
            except EClass as e:
                raise KnownFailureTest(msg, e)
        wrapper.__name__ = f.__name__
        return wrapper
    return test_decorator
def
inplace_func
(
inputs
,
outputs
,
mode
=
None
,
allow_input_downcast
=
False
,
on_unused_input
=
'raise'
,
name
=
None
):
if
mode
is
None
:
mode
=
mode_with_gpu
return
theano
.
function
(
inputs
,
outputs
,
mode
=
mode
,
allow_input_downcast
=
allow_input_downcast
,
accept_inplace
=
True
,
on_unused_input
=
on_unused_input
,
name
=
name
)
def fake_shared(value, name=None, strict=False, allow_downcast=None,
                **kwargs):
    """Build a shared variable with the first constructor accepting *value*.

    The gpuarray constructor is tried first so tests exercise the GPU type
    whenever possible, then the regular tensor and scalar constructors.

    Fix in this revision: the original silently fell off the end of the
    loop and returned None when every constructor raised TypeError; now a
    TypeError is raised so the failure surfaces at the call site instead
    of as a confusing downstream crash.
    """
    from theano.tensor.sharedvar import tensor_constructor, scalar_constructor
    for ctor in (gpuarray_shared_constructor, tensor_constructor,
                 scalar_constructor):
        try:
            return ctor(value, name=name, strict=strict,
                        allow_downcast=allow_downcast, **kwargs)
        except TypeError:
            continue
    raise TypeError('No suitable shared constructor for value %r' % (value,))
def rand_gpuarray(*shape, **kwargs):
    """Return a random GPU array of *shape* with values in (-1, 1).

    Only the keyword argument ``dtype`` is accepted (default:
    ``theano.config.floatX``).

    Fix in this revision: the stray-keyword error passed the key as a
    second positional TypeError argument instead of formatting it into the
    message, and ``kwargs.keys()[0]`` is not subscriptable on Python 3;
    both replaced with an explicit %-formatted message.
    """
    r = rng.rand(*shape) * 2 - 1
    dtype = kwargs.pop('dtype', theano.config.floatX)
    if kwargs:
        raise TypeError('Unexpected argument %s' % (sorted(kwargs)[0],))
    return gpuarray.array(r, dtype=dtype)
def
makeTester
(
name
,
op
,
expected
,
good
=
None
,
bad_build
=
None
,
checks
=
None
,
bad_runtime
=
None
,
mode
=
None
,
skip
=
False
,
eps
=
1e-10
):
if
good
is
None
:
good
=
{}
if
bad_build
is
None
:
bad_build
=
{}
if
bad_runtime
is
None
:
bad_runtime
=
{}
if
checks
is
None
:
checks
=
{}
_op
=
op
_expected
=
expected
_good
=
good
_bad_build
=
bad_build
_bad_runtime
=
bad_runtime
_skip
=
skip
_checks
=
checks
class
Checker
(
unittest
.
TestCase
):
op
=
staticmethod
(
_op
)
expected
=
staticmethod
(
_expected
)
good
=
_good
bad_build
=
_bad_build
bad_runtime
=
_bad_runtime
skip
=
_skip
checks
=
_checks
def
setUp
(
self
):
eval
(
self
.
__class__
.
__module__
+
'.'
+
self
.
__class__
.
__name__
)
def
test_good
(
self
):
if
skip
:
raise
SkipTest
(
skip
)
for
testname
,
inputs
in
good
.
items
():
inputs
=
[
copy
(
input
)
for
input
in
inputs
]
inputrs
=
[
fake_shared
(
input
)
for
input
in
inputs
]
try
:
node
=
safe_make_node
(
self
.
op
,
*
inputrs
)
except
Exception
,
exc
:
err_msg
=
(
"Test
%
s::
%
s: Error occured while making "
"a node with inputs
%
s"
)
%
(
self
.
op
,
testname
,
inputs
)
exc
.
args
+=
(
err_msg
,)
raise
try
:
f
=
inplace_func
([],
node
.
outputs
,
mode
=
mode
,
name
=
'test_good'
)
except
Exception
,
exc
:
err_msg
=
(
"Test
%
s::
%
s: Error occured while trying to "
"make a Function"
)
%
(
self
.
op
,
testname
)
exc
.
args
+=
(
err_msg
,)
raise
if
isinstance
(
self
.
expected
,
dict
)
and
\
testname
in
self
.
expected
:
expecteds
=
self
.
expected
[
testname
]
else
:
expecteds
=
self
.
expected
(
*
inputs
)
if
not
isinstance
(
expecteds
,
(
list
,
tuple
)):
expecteds
=
(
expecteds
,)
try
:
variables
=
f
()
except
Exception
,
exc
:
err_msg
=
(
"Test
%
s::
%
s: Error occured while calling "
"the Function on the inputs
%
s"
)
%
(
self
.
op
,
testname
,
inputs
)
exc
.
args
+=
(
err_msg
,)
raise
for
i
,
(
variable
,
expected
)
in
\
enumerate
(
izip
(
variables
,
expecteds
)):
if
variable
.
dtype
!=
expected
.
dtype
or
\
variable
.
shape
!=
expected
.
shape
or
\
not
GpuArrayType
.
values_eq_approx
(
variable
,
expected
):
self
.
fail
((
"Test
%
s::
%
s: Output
%
s gave the wrong "
"value. With inputs
%
s, expected
%
s "
"(dtype
%
s), got
%
s (dtype
%
s)."
)
%
(
self
.
op
,
testname
,
i
,
inputs
,
expected
,
expected
.
dtype
,
variable
,
variable
.
dtype
))
for
description
,
check
in
self
.
checks
.
items
():
if
not
check
(
inputs
,
variables
):
self
.
fail
((
"Test
%
s::
%
s: Failed check:
%
s "
"(inputs were
%
s, ouputs were
%
s)"
)
%
(
self
.
op
,
testname
,
description
,
inputs
,
variables
))
def
test_bad_build
(
self
):
if
skip
:
raise
SkipTest
(
skip
)
for
testname
,
inputs
in
self
.
bad_build
.
items
():
inputs
=
[
copy
(
input
)
for
input
in
inputs
]
inputrs
=
[
fake_shared
(
input
)
for
input
in
inputs
]
self
.
assertRaises
(
Exception
,
safe_make_node
,
self
.
op
,
*
inputrs
)
def
test_bad_runtime
(
self
):
if
skip
:
raise
SkipTest
(
skip
)
for
testname
,
inputs
in
self
.
bad_runtime
.
items
():
inputrs
=
[
fake_shared
(
input
)
for
input
in
inputs
]
try
:
node
=
safe_make_node
(
self
.
op
,
*
inputrs
)
except
Exception
,
exc
:
err_msg
=
(
"Test
%
s::
%
s: Error occured while trying to "
"make a node with inputs
%
s"
)
%
(
self
.
op
,
testname
,
inputs
)
exc
.
args
+=
(
err_msg
,)
raise
try
:
f
=
inplace_func
([],
node
.
outputs
,
mode
=
mode
,
name
=
"test_bad_runtime"
)
except
Exception
,
exc
:
err_msg
=
(
"Test
%
s::
%
s: Error occured while trying to "
"make a Function"
)
%
(
self
.
op
,
testname
)
exc
.
args
+=
(
err_msg
,)
raise
self
.
assertRaises
(
Exception
,
f
,
[])
Checker
.
__name__
=
name
return
Checker
def
test_transfer_cpu_gpu
():
a
=
T
.
fmatrix
(
'a'
)
g
=
GpuArrayType
(
dtype
=
'float32'
,
broadcastable
=
(
False
,
False
))(
'g'
)
av
=
numpy
.
asarray
(
rng
.
rand
(
5
,
4
),
dtype
=
'float32'
)
gv
=
gpuarray
.
array
(
av
)
f
=
theano
.
function
([
a
],
gpu_from_host
(
a
))
fv
=
f
(
av
)
assert
GpuArrayType
.
values_eq
(
fv
,
gv
)
f
=
theano
.
function
([
g
],
host_from_gpu
(
g
))
fv
=
f
(
gv
)
assert
numpy
.
all
(
fv
==
av
)
def
test_transfer_strided
():
# This is just to ensure that it works in theano
# compyte has a much more comprehensive suit of tests to ensure correctness
a
=
T
.
fmatrix
(
'a'
)
g
=
GpuArrayType
(
dtype
=
'float32'
,
broadcastable
=
(
False
,
False
))(
'g'
)
av
=
numpy
.
asarray
(
rng
.
rand
(
5
,
8
),
dtype
=
'float32'
)
gv
=
gpuarray
.
array
(
av
)
av
=
av
[:,::
2
]
gv
=
gv
[:,::
2
]
f
=
theano
.
function
([
a
],
gpu_from_host
(
a
))
fv
=
f
(
av
)
assert
GpuArrayType
.
values_eq
(
fv
,
gv
)
f
=
theano
.
function
([
g
],
host_from_gpu
(
g
))
fv
=
f
(
gv
)
assert
numpy
.
all
(
fv
==
av
)
@may_fail("Op fails if both contexts are not the same and it's rare "
          "that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
    """Transfer values between the old CudaNdarray type and GpuArray.

    Exercises both directions (cuda -> gpuarray, gpuarray -> cuda) on a
    contiguous array and on a negatively-strided view of it.  Skipped
    when cuda is not available.
    """
    import theano.sandbox.cuda as cuda_ndarray
    # Fixed anti-idiom `cuda_available == False`: `not` also treats an
    # uninitialized (None) flag as "not available" instead of silently
    # proceeding.
    if not cuda_ndarray.cuda_available:
        raise SkipTest("Can't test interaction with cuda if cuda not present")
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
    c = cuda_ndarray.CudaNdarrayType((False, False))('c')

    av = theano._asarray(rng.rand(5, 4), dtype='float32')
    gv = gpuarray.array(av)
    cv = cuda_ndarray.CudaNdarray(av)
    # Negatively-strided views exercise the non-contiguous code path.
    gvs = gv[:, ::-2]
    cvs = cv[:, ::-2]

    # cuda -> gpuarray
    f = theano.function([c], gpu_from_cuda(c))
    fv = f(cv)
    assert GpuArrayType.values_eq_approx(fv, gv)

    fvs = f(cvs)
    assert GpuArrayType.values_eq_approx(fvs, gvs)

    # gpuarray -> cuda
    f = theano.function([g], cuda_from_gpu(g))
    fv = f(gv)
    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fv, cv)

    fvs = f(gvs)
    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fvs, cvs)
def gpu_alloc_expected(x, *shp):
    """Reference behaviour for GpuAlloc: broadcast ``x`` into shape ``shp``."""
    out = gpuarray.empty(shp, dtype=x.dtype)
    out[:] = x
    return out
# Generated test-case class for gpu_alloc.  Each "good" entry is a tuple of
# (fill value, *shape dims); the case names appear to encode the number of
# input and output dimensions (e.g. correct12: 1-d input, 2-d output) --
# TODO confirm against makeTester's naming convention.
GpuAllocTester = makeTester(
    name="GpuAllocTester",
    op=gpu_alloc,
    expected=gpu_alloc_expected,
    good=dict(
        correct01=(rand_gpuarray(), numpy.int32(7)),
        correct01_bcast=(rand_gpuarray(1), numpy.int32(7)),
        correct02=(rand_gpuarray(), numpy.int32(4), numpy.int32(7)),
        correct12=(rand_gpuarray(7), numpy.int32(4), numpy.int32(7)),
        correct13=(rand_gpuarray(7), numpy.int32(2), numpy.int32(4),
                   numpy.int32(7)),
        correct23=(rand_gpuarray(4, 7), numpy.int32(2), numpy.int32(4),
                   numpy.int32(7))
    ),
    # A length-7 vector cannot be broadcast into a trailing dim of 5.
    bad_runtime=dict(
        bad_shape12=(rand_gpuarray(7), numpy.int32(7), numpy.int32(5)),
    )
)
def test_deep_copy():
    """An identity function on a GpuArray must compile to a DeepCopyOp."""
    val = rand_gpuarray(20, dtype='float32')
    g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')

    fn = theano.function([g], g)

    # The compiled graph should start with a DeepCopyOp node...
    topo = fn.maker.fgraph.toposort()
    assert isinstance(topo[0].op, DeepCopyOp)

    # ...and the copy must compare equal to the input.
    out = fn(val)
    assert GpuArrayType.values_eq(out, val)
theano/sandbox/gpuarray/type.py
0 → 100644
View file @ 44f9d0f7
import
numpy
import
theano
from
theano
import
Type
,
Variable
,
Constant
,
tensor
,
config
,
scalar
from
theano.compile
import
SharedVariable
# Make sure this is importable even if pygpu is absent
# (it will not work though)
try
:
import
pygpu
from
pygpu
import
gpuarray
from
pygpu.elemwise
import
compare
,
elemwise2
except
ImportError
:
pass
class GpuArrayType(Type):
    """Theano Type whose values are pygpu ``GpuArray`` objects.

    Parameters
    ----------
    dtype : str
        numpy dtype name of the elements.
    broadcastable : sequence of bool
        One flag per dimension; True means that dimension must be 1.
    name : str, optional
        Name used when printing.

    Raises
    ------
    TypeError
        If the dtype has no compyte typecode.
    """
    def __init__(self, dtype, broadcastable, name=None):
        # In case this was not provided and no global value is available
        self.dtype = str(dtype)
        self.broadcastable = tuple(bool(b) for b in broadcastable)
        self.ndim = len(self.broadcastable)
        self.name = name
        try:
            self.typecode = gpuarray.dtype_to_typecode(self.dtype)
        except gpuarray.GpuArrayException:
            raise TypeError("Unsupported dtype for %s: %s" %
                            (self.__class__.__name__, self.dtype))

    def filter(self, data, strict=False, allow_downcast=None):
        """Return ``data`` as a GpuArray acceptable for this type.

        In strict mode ``data`` must already be a GpuArray with exactly
        this typecode.  Otherwise it is converted, allowing only upcasts
        unless ``allow_downcast`` is true.  ndim and the broadcastable
        pattern are checked in every mode.
        """
        if strict:
            if not isinstance(data, gpuarray.GpuArray):
                raise TypeError("%s expected a GpuArray object." % self,
                                data, type(data))
            if self.typecode != data.typecode:
                raise TypeError("%s expected typecode %d (dtype %s), "
                                "got %d (dtype %s)." %
                                (self, self.typecode, self.dtype,
                                 data.typecode, str(data.dtype)))
            # fallthrough to ndim check
        elif allow_downcast:
            data = gpuarray.array(data, dtype=self.typecode, copy=False,
                                  ndmin=len(self.broadcastable))
        else:
            # Only accept conversions that cannot lose precision.
            up_dtype = scalar.upcast(self.dtype, data.dtype)
            if up_dtype == self.dtype:
                data = gpuarray.array(data, dtype=self.typecode, copy=False)
            else:
                raise TypeError("%s cannot store a value of dtype %s "
                                "without risking loss of precision." %
                                (self, data.dtype))

        if self.ndim != data.ndim:
            raise TypeError("Wrong number of dimensions: expected %s, "
                            "got %s with shape %s." %
                            (self.ndim, data.ndim, data.shape), data)

        shp = data.shape
        for i, b in enumerate(self.broadcastable):
            if b and shp[i] != 1:
                raise TypeError("Non-unit value on shape on a broadcastable"
                                " dimension.", shp, self.broadcastable)
        return data

    def filter_variable(self, other):
        """Convert the variable (or value) ``other`` to this type.

        Non-variables are wrapped in a Constant; host TensorType
        variables of matching dtype/ndim/broadcastable get a
        ``gpu_from_host`` transfer inserted.
        """
        if hasattr(other, '_as_GpuArrayVariable'):
            other = other._as_GpuArrayVariable()

        if not isinstance(other, Variable):
            other = self.Constant(type=self, data=other)

        if other.type == self:
            return other

        if not isinstance(other.type, tensor.TensorType):
            raise TypeError('Incompatible type', (self, other.type))
        if (other.type.dtype != self.dtype):
            raise TypeError('Incompatible dtype',
                            (self.dtype, other.type.dtype))
        if other.type.ndim != self.ndim:
            raise TypeError('Incompatible number of dimensions.'
                            ' Expected %d, got %d.' % (self.ndim, other.ndim))
        if other.type.broadcastable != self.broadcastable:
            raise TypeError('Incompatible broadcastable dimensions.'
                            ' Expected %s, got %s.' %
                            (str(other.type.broadcastable),
                             str(self.broadcastable)))

        return theano.sandbox.gpuarray.basic_ops.gpu_from_host(other)

    @staticmethod
    def values_eq(a, b):
        """Exact elementwise equality of two GpuArrays (runs on device)."""
        if a.shape != b.shape:
            return False
        if a.typecode != b.typecode:
            return False
        return numpy.asarray(compare(a, '==', b)).all()

    @staticmethod
    def values_eq_approx(a, b):
        """Approximate equality: |a - b| < 1e-8 + 1e-5 * |b| elementwise.

        Integer arrays are compared exactly.
        """
        if a.shape != b.shape or a.dtype != b.dtype:
            return False
        if 'int' in str(a.dtype):
            return GpuArrayType.values_eq(a, b)
        else:
            # BUG FIX: the difference must be taken in absolute value.
            # The previous template compared (a - b) < tol, so any a
            # much *smaller* than b always passed the test.
            res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'),
                            op_tmpl="res[i] = (fabs(%(a)s - %(b)s) <"
                                    " (1e-8 + 1e-5 * fabs(%(b)s)))")
            return numpy.asarray(res).all()

    def value_zeros(self, shape):
        """Return a zero-filled GpuArray of the given shape and this dtype."""
        return pygpu.gpuarray.zeros(shape, dtype=self.typecode)

    def make_variable(self, name=None):
        """Return a fresh Variable of this type."""
        return self.Variable(self, name=name)

    def __eq__(self, other):
        return (type(self) == type(other) and
                self.typecode == other.typecode and
                self.broadcastable == other.broadcastable)

    def __ne__(self, other):
        # Needed under Python 2: without it, `!=` would fall back to
        # identity comparison even though __eq__ is defined.
        return not self == other

    def __hash__(self):
        return (hash(self.typecode) ^ hash(self.broadcastable))

    def __str__(self):
        return "GpuArray<%s>" % (self.dtype,)

    def get_shape_info(self, obj):
        return obj.shape

    def get_size(self, shape_info):
        # Size in bytes of a value with the given shape; a scalar
        # (empty shape_info) still occupies one element.
        if shape_info:
            return numpy.prod(shape_info) * numpy.dtype(self.dtype).itemsize
        else:
            return numpy.dtype(self.dtype).itemsize

    def c_declare(self, name, sub):
        return "GpuArrayObject *%s;" % (name,)

    def c_init(self, name, sub):
        return "%s = NULL;" % (name,)

    def c_extract(self, name, sub):
        # TODO I don't check broadcast stuff for now.
        return """
        %(name)s = NULL;
        if (py_%(name)s == Py_None) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
            %(fail)s
        }
        /* First check if we are the base type exactly (the most common case),
           then do the full subclass check if needed. */
        if (py_%(name)s->ob_type != &GpuArrayType &&
            !PyObject_TypeCheck(py_%(name)s, &GpuArrayType)) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
            %(fail)s
        }
        %(name)s = (GpuArrayObject *)py_%(name)s;
        Py_INCREF(%(name)s);
        """ % {'name': name, 'fail': sub['fail']}

    def c_cleanup(self, name, sub):
        return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name}

    def c_sync(self, name, sub):
        return """
        if (!%(name)s) {
            Py_XDECREF(py_%(name)s);
            Py_INCREF(Py_None);
            py_%(name)s = Py_None;
        } else if ((void *)py_%(name)s != (void *)%(name)s) {
            Py_XDECREF(py_%(name)s);
            py_%(name)s = (PyObject *)%(name)s;
            Py_INCREF(py_%(name)s);
        }
        """ % {'name': name}

    def c_init_code(self):
        # We don't actually need the numpy API except in
        # HostFromGpu and GpuFromHost and those case will be covered
        # by the TensorType parameter
        return ['import_pygpu__gpuarray();']

    def c_headers(self):
        # We need arrayobject for the PyArrayDescr struct def
        # (even if we just use a pointer to it in a function def)
        return ['<compyte/array.h>', '<compyte/kernel.h>',
                '<compyte/error.h>', '<numpy/arrayobject.h>',
                '<gpuarray_api.h>']

    def c_header_dirs(self):
        return [pygpu.get_include(), numpy.get_include()]

    def c_libraries(self):
        return ['compyte']

    def c_code_cache_version(self):
        return (1,)
class _operators(tensor.basic._tensor_py_operators):
    """Mixin adding the standard tensor operator protocol to GpuArray
    variables, plus host/device conversion hooks."""

    def _as_TensorVariable(self):
        # Conversion hook used by tensor code: transfer back to the host.
        from basic_ops import host_from_gpu
        return host_from_gpu(self)

    def _as_GpuArrayVariable(self):
        # Already a GpuArray variable: no transfer needed.
        return self

    # Mirror the metadata of the underlying GpuArrayType.
    dtype = property(lambda s: s.type.dtype)
    broadcastable = property(lambda s: s.type.broadcastable)
    ndim = property(lambda s: s.type.ndim)
class GpuArrayVariable(_operators, Variable):
    """Graph variable whose runtime value is a GpuArray."""
    pass


GpuArrayType.Variable = GpuArrayVariable
class GpuArraySignature(tensor.basic.TensorConstantSignature):
    """Hashable signature for GpuArray constants (host-side comparison)."""
    pass
    # might do something better if we can run the sum on the
    # GPU, but for now this will suffice.
class GpuArrayConstant(_operators, Constant):
    """Graph constant holding a GpuArray value."""

    def signature(self):
        # The signature is computed on a host copy of the data.
        return GpuArraySignature((self.type, numpy.asarray(self.data)))

    def __str__(self):
        if self.name is not None:
            return self.name
        # Print a host copy of the data when there is no name.
        return "GpuArrayConstant{%s}" % numpy.asarray(self.data)


GpuArrayType.Constant = GpuArrayConstant
class GpuArraySharedVariable(_operators, SharedVariable):
    """Shared variable whose container holds a GpuArray on the device."""

    def get_value(self, borrow=False, return_internal_type=False):
        # With return_internal_type, hand out the device array itself
        # (aliased when borrow=True, otherwise a device copy); by
        # default, return a host ndarray copy.
        if return_internal_type:
            if borrow:
                return self.container.value
            else:
                return self.container.value.copy()
        else:
            return numpy.asarray(self.container.value)

    def set_value(self, value, borrow=False):
        # Store a device array; copy unless the caller lets us borrow.
        self.container.value = pygpu.gpuarray.array(value,
                                                    copy=(not borrow))

    def __getitem__(self, *args):
        # Delegate to the operator mixin so indexing builds a graph.
        return _operators.__getitem__(self, *args)


GpuArrayType.SharedVariable = GpuArraySharedVariable
def gpuarray_shared_constructor(value, name=None, strict=False,
                                allow_downcast=None, borrow=False,
                                broadcastable=None):
    """SharedVariable constructor for GpuArrayType"""
    if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)):
        raise TypeError('ndarray or GpuArray required')

    # Default to a fully non-broadcastable pattern matching value's rank.
    if broadcastable is None:
        broadcastable = (False,) * value.ndim

    gtype = GpuArrayType(value.dtype, broadcastable)
    deviceval = pygpu.gpuarray.array(value, copy=(not borrow))
    return GpuArraySharedVariable(type=gtype, value=deviceval,
                                  name=name, strict=strict)
# C implementation of ViewOp for GpuArrayType: a view is just a new
# reference to the same Python object.
theano.compile.register_view_op_c_code(GpuArrayType, """
    Py_XDECREF(%(oname)s);
    %(oname)s = %(iname)s;
    Py_XINCREF(%(oname)s);
""", version=(0,))
# C implementation of DeepCopyOp for GpuArrayType: allocate a fresh
# GpuArray in the default context and copy the device data into it.
theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
    Py_XDECREF(%(oname)s);
    %(oname)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
    if (!%(oname)s) { %(fail)s }
    int err;
    err = GpuArray_copy(&%(oname)s->ga, &%(iname)s->ga, GA_ANY_ORDER);
    if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError, "Error during copy");
        %(fail)s
    }
""", version=(1,))
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论