testgroup / pytensor · Commits · c84ef1d8

Commit c84ef1d8, authored Mar 16, 2011 by Olivier Breuleux

    merge

Parents: 8ca1bf09, bb724b38

Showing 9 changed files with 621 additions and 101 deletions (+621 −101).
theano/compile/debugmode.py              +3    −1
theano/sandbox/cuda/__init__.py          +5    −1
theano/sandbox/cuda/cuda_ndarray.cu      +1    −8
theano/sandbox/cuda/var.py               +2    −2
theano/sandbox/multinomial.py            +341  −66
theano/sandbox/rng_mrg.py                +15   −7
theano/sandbox/test_multinomial.py       +74   −0
theano/tensor/raw_random.py              +134  −16
theano/tensor/tests/test_raw_random.py   +46   −0
theano/compile/debugmode.py (+3 −1)

```diff
@@ -381,6 +381,7 @@ class InvalidValueError(DebugModeError):
         client_node = self.client_node
         hint = self.hint
         specific_hint = self.specific_hint
+        context = debugprint(r, prefix='  ', depth=12, file=StringIO()).getvalue()
         return """InvalidValueError
         type(variable) = %(type_r)s
         variable       = %(r)s
@@ -393,7 +394,8 @@ class InvalidValueError(DebugModeError):
         isfinite       = %(v_isfinite)s
         client_node    = %(client_node)s
         hint           = %(hint)s
-        specific_hint  = %(specific_hint)s
+        specific_hint  = %(specific_hint)s
+        context        = ...\n%(context)s
         """ % locals()

 ########################
```
theano/sandbox/cuda/__init__.py (+5 −1)

```diff
@@ -156,7 +156,11 @@ def use(device, force=False, default_to_move_computation_to_gpu = True,
             raise EnvironmentError("You forced use of device %s, but CUDA initialization failed "
                                    "with error:\n%s" % (device, cuda_initialization_error_message))
         if not cuda_available:
-            warning('CUDA is installed, but device %s is not available' % device)
+            if cuda_initialization_error_message:
+                error_addendum = " (error: %s)" % cuda_initialization_error_message
+            else:
+                error_addendum = ""
+            warning('CUDA is installed, but device %s is not available%s' % (device, error_addendum))
             return
     if device == 'gpu':
```
theano/sandbox/cuda/cuda_ndarray.cu (+1 −8)

```diff
@@ -2427,6 +2427,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
     if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other) && size == size_source)
     {
         cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
+        CNDA_THREAD_SYNC;
         if (CUBLAS_STATUS_SUCCESS != cublasGetError())
         {
             PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
@@ -2442,14 +2443,6 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
         {
             // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS
             assert(0);
-            assert (size==1);
-            cublasScopy(1, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
-            CNDA_THREAD_SYNC;
-            if (CUBLAS_STATUS_SUCCESS != cublasGetError())
-            {
-                PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
-                return -1;
-            }
         }; break;
         case 1: // vector
         {
```
theano/sandbox/cuda/var.py (+2 −2)

```diff
@@ -132,9 +132,9 @@ class CudaNdarraySharedVariable(SharedVariable, _operators):
             return other._as_CudaNdarrayVariable()
         if not isinstance(other.type, tensor.TensorType):
-            raise TypeError('Incompatible type', other.type)
+            raise TypeError('Incompatible type', (self, (self.type, other.type)))
         if (other.type.dtype != self.dtype):
-            raise TypeError('Incompatible dtype', (self.dtype, other.type.dtype))
+            raise TypeError('Incompatible dtype', (self, (self.dtype, other.type.dtype)))
         if (other.type.broadcastable != self.broadcastable):
             raise TypeError('Incompatible broadcastable', (self, (self.broadcastable, other.type.broadcastable)))
```
theano/sandbox/multinomial.py (+341 −66)

```diff
@@ -10,30 +10,44 @@ if cuda_available:
     from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
 
 class Multinomial(Op):
+    def __init__(self, odtype):
+        self.odtype = odtype
     def __eq__(self, other):
-        return type(self) == type(other)
+        return type(self) == type(other) and self.odtype == other.odtype
     def __hash__(self):
-        return hash(type(self))
+        return hash((type(self), self.odtype))
     def __str__(self):
-        return self.__class__.__name__
+        return '%s{%s}' % (self.__class__.__name__, self.odtype)
+    def __setstate__(self, dct):
+        self.__dict__.update(dct)
+        try:
+            self.odtype
+        except:
+            self.odtype = 'auto'
     def make_node(self, pvals, unis):
         pvals = T.as_tensor_variable(pvals)
         unis = T.as_tensor_variable(unis)
         #assert pvals.dtype == 'float32'
         #assert unis.dtype == 'float32'
-        return Apply(self, [pvals, unis], [pvals.type()])
-    def grad(self, inp, grads):
-        pvals, unis = inp
-        gz, = grads
+        if pvals.ndim != 2:
+            raise NotImplementedError('pvals ndim', pvals.ndim)
+        if unis.ndim != 1:
+            raise NotImplementedError('unis ndim', unis.ndim)
+        if self.odtype == 'auto':
+            odtype = pvals.dtype
+        else:
+            odtype = self.odtype
+        return Apply(self, [pvals, unis], [T.matrix(dtype=odtype)])
+    def grad(self, ins, outs):
+        pvals, unis = ins
+        (gz,) = outs
         return [None, None]
     def c_code_cache_version(self):
-        return (3,)
-    def c_code(self, node, name, inp, out, sub):
-        pvals, unis = inp
-        z, = out
+        return (5,)
+    def c_code(self, node, name, ins, outs, sub):
+        (pvals, unis) = ins
+        (z,) = outs
         fail = sub['fail']
         return """
@@ -48,9 +62,9 @@ class Multinomial(Op):
             %(fail)s;
         }
-        if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1])
+        if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[0])
         {
-            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
+            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
             %(fail)s;
         }
@@ -60,15 +74,10 @@ class Multinomial(Op):
             )
         {
             Py_XDECREF(%(z)s);
-            npy_intp dims[2];
-            dims[0] = (%(pvals)s->dimensions)[0];
-            dims[1] = (%(pvals)s->dimensions)[1];
             %(z)s = (PyArrayObject*) PyArray_ZEROS(2,
-                dims,
-                type_num_%(pvals)s,
+                %(pvals)s->dimensions,
+                type_num_%(z)s,
                 0);
             if (!%(z)s)
             {
                 PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
@@ -78,32 +87,37 @@ class Multinomial(Op):
         { // NESTED SCOPE
-        const int nb_outcomes = %(pvals)s->dimensions[0];
-        const int nb_multi = %(pvals)s->dimensions[1];
+        const int nb_multi = %(pvals)s->dimensions[0];
+        const int nb_outcomes = %(pvals)s->dimensions[1];
 
         //
         // For each multinomials, loop over each possible outcome
         //
         for (int n = 0; n < nb_multi; ++n)
         {
+            int waiting = 1;
             dtype_%(pvals)s cummul = 0.;
             const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
             for (int m = 0; m < nb_outcomes; ++m)
             {
-                dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n);
-                const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n);
+                dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, n,m);
+                const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, n,m);
                 cummul += *pvals_nm;
-                if (*unis_n < cummul)
+                if (waiting && (cummul > *unis_n))
                 {
                     *z_nm = 1.;
-                    break;
+                    waiting = 0;
                 }
+                else
+                {
+                    // if we re-used old z pointer, we have to clear it out.
+                    *z_nm = 0.;
+                }
             }
         }
         } // END NESTED SCOPE
         """ % locals()
 
-multinomial = Multinomial()
+#multinomial = Multinomial()
 
 class GpuMultinomial(Multinomial):
@@ -115,11 +129,16 @@ class GpuMultinomial(Multinomial):
             raise TypeError('pvals must be cudandarray', pvals)
         if not isinstance(unis.type, CudaNdarrayType):
             raise TypeError('unis must be cudandarray', unis)
+        if self.odtype == 'auto':
+            odtype = pvals.dtype
+        else:
+            odtype = self.odtype
+        if odtype != pvals.dtype:
+            raise NotImplementedError()
         return Apply(self, [pvals, unis], [pvals.type()])
     def c_code_cache_version(self):
         #return ()
-        return (super(GpuMultinomial, self).c_code_cache_version(), 2)
+        return (6,)
     def c_support_code_apply(self, node, nodename):
         return """
@@ -128,28 +147,32 @@ class GpuMultinomial(Multinomial):
         const int nb_outcomes,
         const int pvals_row_strides,
         const int pvals_col_strides,
+        const int unis_stride,
         float * global_pvals,
         float * global_unis,
         float * global_outs
     )
     {
+        // each thread takes care of one multinomial draw
         int n = blockDim.x*blockIdx.x + threadIdx.x;
         if (n < nb_multi)
         {
             float cummul = 0.;
             bool done = false;
+            const float unis_n = global_unis[n*unis_stride];
             for (int m = 0; m < nb_outcomes; ++m)
             {
-                cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
                 float current_out = 0.;
-                if (!done && global_unis[n] < cummul)
+                if (!done)
                 {
-                    current_out = 1.;
-                    done = true;
+                    cummul += global_pvals[m * pvals_col_strides + n * pvals_row_strides];
+                    if (unis_n < cummul)
+                    {
+                        current_out = 1.;
+                        done = true;
+                    }
                 }
+                //write out transposed for speed.
                 global_outs[n + m * nb_multi] = current_out;
             }
         }
@@ -158,12 +181,12 @@ class GpuMultinomial(Multinomial):
     """ % locals()
 
-    def c_code(self, node, name, inp, out, sub):
-        pvals, unis = inp
-        z, = out
+    def c_code(self, node, name, ins, outs, sub):
+        (pvals, unis) = ins
+        (z,) = outs
         fail = sub['fail']
         return """
         if (%(pvals)s->nd != 2)
         {
             PyErr_Format(PyExc_TypeError, "pvals wrong rank");
@@ -174,28 +197,21 @@ class GpuMultinomial(Multinomial):
             PyErr_Format(PyExc_TypeError, "unis wrong rank");
             %(fail)s;
         }
-        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
-        {
-            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
-            %(fail)s;
-        }
-        if (!CudaNdarray_is_c_contiguous(%(unis)s))
+        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
         {
-            PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous");
+            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
             %(fail)s;
         }
-        // Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
-        // since we are working on pvals.T here
+        //N.B. that the output is TRANSPOSED compared with pvals
         if ((NULL == %(z)s)
-            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
-            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]))
+            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
+            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]))
         {
             Py_XDECREF(%(z)s);
             npy_intp dims[2];
-            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
-            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
+            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
+            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
             %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
             if (!%(z)s)
             {
@@ -205,9 +221,8 @@ class GpuMultinomial(Multinomial):
         }
         { // NESTED SCOPE
-            int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
-            int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
+            int nb_multi = CudaNdarray_HOST_DIMS(%(pvals)s)[0];
+            int nb_outcomes = CudaNdarray_HOST_DIMS(%(pvals)s)[1];
             //TODO : change this for a beautiful constant
             int max_nb_blocks = 2<<15 - 1;
             int nb_blocks = max_nb_blocks + 1;
@@ -226,20 +241,21 @@ class GpuMultinomial(Multinomial):
             // TODO : next line is a bit hardcoded...
             if (nb_threads > 512)
             {
-                PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
+                PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi);
                 %(fail)s;
             }
             dim3 n_blocks(nb_blocks,1,1);
             dim3 n_threads(nb_threads,1,1);
             int n_shared = 0;
+            assert(nb_blocks*nb_threads >= nb_multi);
             k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                 CudaNdarray_HOST_DIMS(%(z)s)[1],
                 CudaNdarray_HOST_DIMS(%(z)s)[0],
                 CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
                 CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
+                CudaNdarray_HOST_STRIDES(%(unis)s)[0],
                 CudaNdarray_DEV_DATA(%(pvals)s),
                 CudaNdarray_DEV_DATA(%(unis)s),
                 CudaNdarray_DEV_DATA(%(z)s)
@@ -262,12 +278,271 @@ class GpuMultinomial(Multinomial):
         } // END NESTED SCOPE
         """ % locals()
 
 gpu_multinomial = GpuMultinomial()
 
 @local_optimizer()
 def use_gpu_multinomial(node):
     if node.op == multinomial:
-        return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]
+        p, u = node.inputs
+        m, = node.outputs
+        if p.dtype == u.dtype == m.dtype == 'float32':
+            gpu_op = GpuMultinomial(op.odtype)
+            return [host_from_gpu(gpu_op(*[gpu_from_host(i) for i in node.inputs]))]
 if cuda_enabled: #theano.config.device.startswith('gpu'):
     register_specialize(use_gpu_multinomial)
+
+if 0:
+    # I hate you hg rebase, I hate you so very, very much.
+    class Multinomial(Op):
+        def __eq__(self, other):
+            return type(self) == type(other)
+        def __hash__(self):
+            return hash(type(self))
+        def __str__(self):
+            return self.__class__.__name__
+        def make_node(self, pvals, unis):
+            pvals = T.as_tensor_variable(pvals)
+            unis = T.as_tensor_variable(unis)
+            #assert pvals.dtype == 'float32'
+            #assert unis.dtype == 'float32'
+            return Apply(self, [pvals, unis], [pvals.type()])
+        def grad(self, inp, grads):
+            pvals, unis = inp
+            gz, = grads
+            return [None, None]
+        def c_code_cache_version(self):
+            return (3,)
+        def c_code(self, node, name, inp, out, sub):
+            pvals, unis = inp
+            z, = out
+            fail = sub['fail']
+            return """
+        if (%(pvals)s->nd != 2)
+        {
+            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
+            %(fail)s;
+        }
+        if (%(unis)s->nd != 1)
+        {
+            PyErr_Format(PyExc_TypeError, "unis wrong rank");
+            %(fail)s;
+        }
+
+        if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1])
+        {
+            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
+            %(fail)s;
+        }
+
+        if ((NULL == %(z)s)
+            || ((%(z)s->dimensions)[0] != (%(pvals)s->dimensions)[0])
+            || ((%(z)s->dimensions)[1] != (%(pvals)s->dimensions)[1])
+        )
+        {
+            Py_XDECREF(%(z)s);
+            npy_intp dims[2];
+            dims[0] = (%(pvals)s->dimensions)[0];
+            dims[1] = (%(pvals)s->dimensions)[1];
+            %(z)s = (PyArrayObject*) PyArray_ZEROS(2,
+                dims,
+                type_num_%(pvals)s,
+                0);
+            if (!%(z)s)
+            {
+                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
+                %(fail)s;
+            }
+        }
+
+        { // NESTED SCOPE
+        const int nb_outcomes = %(pvals)s->dimensions[0];
+        const int nb_multi = %(pvals)s->dimensions[1];
+        //
+        // For each multinomials, loop over each possible outcome
+        //
+        for (int n = 0; n < nb_multi; ++n)
+        {
+            dtype_%(pvals)s cummul = 0.;
+            const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
+            for (int m = 0; m < nb_outcomes; ++m)
+            {
+                dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n);
+                const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n);
+                cummul += *pvals_nm;
+                if (*unis_n < cummul)
+                {
+                    *z_nm = 1.;
+                    break;
+                }
+            }
+        }
+        } // END NESTED SCOPE
+        """ % locals()
+
+    multinomial = Multinomial()
+
+    class GpuMultinomial(Multinomial):
+        def make_node(self, pvals, unis):
+            assert pvals.dtype == 'float32'
+            assert unis.dtype == 'float32'
+            if not isinstance(pvals.type, CudaNdarrayType):
+                raise TypeError('pvals must be cudandarray', pvals)
+            if not isinstance(unis.type, CudaNdarrayType):
+                raise TypeError('unis must be cudandarray', unis)
+            return Apply(self, [pvals, unis], [pvals.type()])
+        def c_code_cache_version(self):
+            #return ()
+            return (super(GpuMultinomial, self).c_code_cache_version(), 2)
+        def c_support_code_apply(self, node, nodename):
+            return """
+        static __global__ void k_multi_warp_%(nodename)s(
+            const int nb_multi,
+            const int nb_outcomes,
+            const int pvals_row_strides,
+            const int pvals_col_strides,
+            float * global_pvals,
+            float * global_unis,
+            float * global_outs
+        )
+        {
+            int n = blockDim.x*blockIdx.x + threadIdx.x;
+            if (n < nb_multi)
+            {
+                float cummul = 0.;
+                bool done = false;
+                for (int m = 0; m < nb_outcomes; ++m)
+                {
+                    cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
+                    float current_out = 0.;
+                    if (!done && global_unis[n] < cummul)
+                    {
+                        current_out = 1.;
+                        done = true;
+                    }
+                    global_outs[n + m * nb_multi] = current_out;
+                }
+            }
+        }
+        """ % locals()
+        def c_code(self, node, name, inp, out, sub):
+            pvals, unis = inp
+            z, = out
+            fail = sub['fail']
+            return """
+        if (%(pvals)s->nd != 2)
+        {
+            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
+            %(fail)s;
+        }
+        if (%(unis)s->nd != 1)
+        {
+            PyErr_Format(PyExc_TypeError, "unis wrong rank");
+            %(fail)s;
+        }
+        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
+        {
+            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
+            %(fail)s;
+        }
+        if (!CudaNdarray_is_c_contiguous(%(unis)s))
+        {
+            PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous");
+            %(fail)s;
+        }
+        // Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
+        // since we are working on pvals.T here
+        if ((NULL == %(z)s)
+            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
+            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]))
+        {
+            Py_XDECREF(%(z)s);
+            npy_intp dims[2];
+            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
+            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
+            %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
+            if (!%(z)s)
+            {
+                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
+                %(fail)s;
+            }
+        }
+        { // NESTED SCOPE
+            int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
+            int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
+            //TODO : change this for a beautiful constant
+            int max_nb_blocks = 2<<15 - 1;
+            int nb_blocks = max_nb_blocks + 1;
+            int nb_threads=16; // so it really starts at 32, because of the *2
+            do
+            {
+                nb_threads*=2;
+                if (nb_multi %% nb_threads == 0)
+                    nb_blocks = nb_multi/nb_threads;
+                else
+                    nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
+            } while (nb_blocks > max_nb_blocks);
+            //printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
+            // TODO : next line is a bit hardcoded...
+            if (nb_threads > 512)
+            {
+                PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
+                %(fail)s;
+            }
+            dim3 n_blocks(nb_blocks,1,1);
+            dim3 n_threads(nb_threads,1,1);
+            int n_shared = 0;
+            k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
+                CudaNdarray_HOST_DIMS(%(z)s)[1],
+                CudaNdarray_HOST_DIMS(%(z)s)[0],
+                CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
+                CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
+                CudaNdarray_DEV_DATA(%(pvals)s),
+                CudaNdarray_DEV_DATA(%(unis)s),
+                CudaNdarray_DEV_DATA(%(z)s)
+            );
+            CNDA_THREAD_SYNC;
+            cudaError_t sts = cudaGetLastError();
+            if (cudaSuccess != sts)
+            {
+                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
+                    "k_multi_warp_%(name)s",
+                    cudaGetErrorString(sts),
+                    n_blocks.x,
+                    n_blocks.y,
+                    n_threads.x,
+                    n_threads.y,
+                    n_threads.z,
+                    n_shared);
+                %(fail)s;
+            }
+        } // END NESTED SCOPE
+        """ % locals()
+    gpu_multinomial = GpuMultinomial()
```
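Among the changes above, the rewritten CPU kernel (hunk @@ -78,32 +87,37 @@) flips `pvals` to a row-per-draw layout (`PyArray_GETPTR2(..., n, m)` instead of `m, n`) and replaces the early `break` with a `waiting` flag so that every cell of a possibly reused output buffer is written. A minimal NumPy sketch of the same inverse-CDF draw (the function name and standalone setting are illustrative, not part of the commit):

```python
import numpy as np

def multinomial_onehot(pvals, unis):
    # pvals: (nb_multi, nb_outcomes), each row summing to 1
    # unis:  (nb_multi,) uniform samples in [0, 1)
    nb_multi, nb_outcomes = pvals.shape
    z = np.empty_like(pvals)
    for n in range(nb_multi):
        cummul = 0.0
        waiting = True
        for m in range(nb_outcomes):
            cummul += pvals[n, m]
            if waiting and cummul > unis[n]:
                z[n, m] = 1.0   # first outcome whose cumulative mass passes unis[n]
                waiting = False
            else:
                z[n, m] = 0.0   # cleared explicitly: the buffer may be reused
    return z

# Reproduces a deterministic case from theano/sandbox/test_multinomial.py:
print(multinomial_onehot(np.array([[.2, .8], [.3, .7]]),
                         np.array([.31, .31])))
# [[0. 1.]
#  [0. 1.]]
```

Keeping the loop running after the draw is what lets the Op reuse a previously allocated `z` without a separate zero-fill pass.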
theano/sandbox/rng_mrg.py (+15 −7)

```diff
@@ -9,13 +9,14 @@ import sys
 import numpy
 
 from theano import Op, Apply, shared, config, Variable
-from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast, opt
+from theano.tensor import (raw_random, TensorType, as_tensor_variable,
+        get_vector_length, cast, opt)
 from theano.tensor import zeros_like, sqrt, log, sin, cos, join, prod
 from theano.compile import optdb
 from theano.gof import local_optimizer
 from theano.gof.python25 import all
-from multinomial import multinomial
+import multinomial
 
 from theano.sandbox.cuda import cuda_available, cuda_enabled
 if cuda_available:
@@ -83,10 +84,12 @@ def mrg_next_value(rstate, new_rstate):
     x11, x12, x13, x21, x22, x23 = rstate
     assert type(x11) == numpy.int32
 
-    i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
+    i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i)
+            for i in (0, 7, 9, 15, 16, 22, 24)]
+
     #first component
-    y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
+    y1 = (((x12 & MASK12) << i22) + (x12 >> i9)
+            + ((x13 & MASK13) << i7) + (x13 >> i24))
 
     assert type(y1) == numpy.int32
     if (y1 < 0 or y1 >= M1):     #must also check overflow
```
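The second hunk only rewraps the first component of the MRG31k3p recurrence in parentheses, which keeps the whole expression in `numpy.int32` so the overflow check that follows it stays meaningful. A worked sketch of that component; the mask and modulus constants are defined elsewhere in rng_mrg.py, so the values here are assumptions for illustration:

```python
import numpy

# Assumed module-level constants (not shown in this hunk).
M1 = numpy.int32(2147483647)      # 2**31 - 1
MASK12 = numpy.int32(511)         # 2**9 - 1
MASK13 = numpy.int32(16777215)    # 2**24 - 1

x12, x13 = numpy.int32(12345), numpy.int32(67890)
i7, i9, i22, i24 = [numpy.int32(i) for i in (7, 9, 22, 24)]

# First component, parenthesized as in the new code; int32 in, int32 out.
y1 = (((x12 & MASK12) << i22) + (x12 >> i9)
        + ((x13 & MASK13) << i7) + (x13 >> i24))
assert type(y1) == numpy.int32
if y1 < 0 or y1 >= M1:    # must also check overflow
    y1 -= M1              # int32 wrap-around makes this a mod-M1 reduction
```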
```diff
@@ -741,10 +744,15 @@ class MRG_RandomStreams(object):
             raise TypeError("You have to specify pvals")
         pvals = as_tensor_variable(pvals)
         if n == 1 and pvals.ndim == 2:
-            unis = self.uniform(size=pvals.shape[0:1], ndim=1)
-            return cast(multinomial(pvals.T, unis).T, dtype)
+            ndim, size, bcast = raw_random._infer_ndim_bcast(ndim, size,
+                    n, pvals[:,0])
+            bcast = bcast + (pvals.type.broadcastable[-1],)
+            unis = self.uniform(size=size, ndim=1)
+            op = multinomial.Multinomial(dtype)
+            return op(pvals, unis)
         else:
-            raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")
+            raise NotImplementedError(("MRG_RandomStreams.multinomial only"
+                " implemented with n == 1 and pvals.ndim = 2"))
 
     def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX):
         """
```
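With `import multinomial` replacing the old `from multinomial import multinomial`, the `n == 1` branch above now instantiates `multinomial.Multinomial(dtype)` on `pvals` directly instead of transposing twice through `cast(...)`. A hypothetical usage sketch of that branch; the seeding interface and defaults are assumed from the surrounding class, not shown in this diff:

```python
import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(12345)        # constructor seed: assumed interface
pvals = T.matrix('pvals')              # (batch, L), rows summing to 1
draw = srng.multinomial(pvals=pvals)   # n=1: one one-hot row per batch row
f = theano.function([pvals], draw)

p = numpy.asarray([[.2, .8], [.9, .1]], dtype=theano.config.floatX)
print(f(p))   # e.g. [[0 1], [1 0]]; each row sums to 1
```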
theano/sandbox/test_multinomial.py (new file, mode 100644, +74 −0)

```python
import numpy

from theano import tensor, shared, function
import multinomial


def test_multimomial_0():
    # This tests the multinomial Op directly, not going through the
    # multinomial() call in GPU random generation.
    p = tensor.matrix()
    u = tensor.vector()

    m = multinomial.Multinomial('auto')(p, u)

    #the m*2 allows the multinomial to reuse output
    f = function([p, u], m*2, allow_input_downcast=True)

    # test that both first and second samples can be drawn
    assert numpy.allclose(f([[1, 0], [0, 1]], [.1, .1]), [[2, 0], [0, 2]])

    # test that both second labels can be drawn
    r = f([[.2, .8], [.3, .7]], [.31, .31])
    assert numpy.allclose(r, [[0, 2], [0, 2]]), r

    # test that both first labels can be drawn
    r = f([[.2, .8], [.3, .7]], [.21, .21])
    assert numpy.allclose(r, [[0, 2], [2, 0]]), r

    #change the size to make sure output gets reallocated ok
    # and also make sure that the GPU version doesn't screw up the
    # transposed-ness
    r = f([[.2, .8]], [.25])
    assert numpy.allclose(r, [[0, 2]]), r

#TODO: check a bigger example (make sure blocking on GPU is handled correctly)


def test_multinomial_large():
    # DEBUG_MODE will test this on GPU
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.Multinomial('auto')(p, u)
    f = function([p, u], m*2, allow_input_downcast=True)

    pval = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = numpy.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)

    assert mval.shape == pval.shape
    assert mval.dtype == pval.dtype
    assert numpy.allclose(mval.sum(axis=1), 2)
    asdf = numpy.asarray([0, 0, 2, 0]) + 0*pval
    assert numpy.allclose(mval, asdf)  #broadcast over all rows


def test_multinomial_dtypes():
    p = tensor.dmatrix()
    u = tensor.dvector()
    m = multinomial.Multinomial('auto')(p, u)
    assert m.dtype == 'float64', m.dtype

    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.Multinomial('auto')(p, u)
    assert m.dtype == 'float32', m.dtype

    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.Multinomial('float64')(p, u)
    assert m.dtype == 'float64', m.dtype
```
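The deterministic expectations in `test_multimomial_0` follow from the inverse-CDF rule the Op implements: the drawn label in each row is the first index where the cumulative probability exceeds that row's uniform sample. A NumPy check of the expected labels:

```python
import numpy as np

p = np.array([[.2, .8], [.3, .7]])
for u in (.31, .21):
    # first index per row where the cumulative mass exceeds u
    labels = (np.cumsum(p, axis=1) > u).argmax(axis=1)
    print(u, labels)
# 0.31 -> [1 1]: one-hot [[0,1],[0,1]], so m*2 == [[0,2],[0,2]]
# 0.21 -> [1 0]: one-hot [[0,1],[1,0]], so m*2 == [[0,2],[2,0]]
```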
theano/tensor/raw_random.py (+134 −16)

```diff
@@ -236,6 +236,10 @@ class RandomFunction(gof.Op):
 
 def _infer_ndim(ndim, shape, *args):
+    ndim, ivec, bcast = _infer_ndim_bcast(ndim, shape, *args)
+    return ndim, ivec
+
+def _infer_ndim_bcast(ndim, shape, *args):
     """
     Infer the number of dimensions from the shape or the other arguments.
@@ -255,7 +259,11 @@ def _infer_ndim(ndim, shape, *args):
     else:
         args_ndim = 0
 
-    if isinstance(shape, (tuple, list)):
+    # there is a convention that -1 means the corresponding shape of a
+    # potentially-broadcasted symbolic arg
+    if (isinstance(shape, (tuple, list))
+            and numpy.all(numpy.asarray(shape) >= 0)):
+        bcast = [(s == 1) for s in shape]
         v_shape = tensor.TensorConstant(type=tensor.lvector, data=theano._asarray(shape, dtype='int64'))
         shape_ndim = len(shape)
         if ndim is None:
@@ -265,6 +273,53 @@ def _infer_ndim(ndim, shape, *args):
             raise ValueError('ndim should be equal to len(shape), but\n',
                         'ndim = %s, len(shape) = %s, shape = %s'
                         % (ndim, shape_ndim, shape))
+    elif isinstance(shape, (tuple, list)):
+        # there is a convention that -1 means the corresponding shape of a
+        # potentially-broadcasted symbolic arg
+        #
+        # This case combines together symbolic and non-symbolic shape
+        # information
+        if ndim is None:
+            ndim = args_ndim
+        else:
+            ndim = max(args_ndim, ndim)
+        ndim = max(args_ndim, len(shape))
+        shape = [-1] * (ndim - len(shape)) + list(shape)
+        bcast = []
+        pre_v_shape = []
+        for i, s in enumerate(shape):
+            if hasattr(s, 'type'):
+                # s is symbolic
+                bcast.append(False)  # todo - introspect further
+                pre_v_shape.append(s)
+            else:
+                if s >= 0:
+                    pre_v_shape.append(tensor.as_tensor_variable(s))
+                    bcast.append((s == 1))
+                elif s == -1:
+                    n_a_i = 0
+                    for a in args:
+                        # ndim: _   _   _   _   _   _
+                        # ashp:         s0  s1  s2  s3
+                        #           i
+                        if i >= ndim - a.ndim:
+                            n_a_i += 1
+                            a_i = i + a.ndim - ndim
+                            if not a.broadcastable[a_i]:
+                                pre_v_shape.append(a.shape[a_i])
+                                bcast.append(False)
+                                break
+                    else:
+                        if n_a_i == 0:
+                            raise ValueError(('Auto-shape of -1 must overlap'
+                                'with the shape of one of the broadcastable'
+                                'inputs'))
+                        else:
+                            pre_v_shape.append(tensor.as_tensor_variable(1))
+                            bcast.append(True)
+                else:
+                    ValueError('negative shape', s)
+        # post-condition: shape may still contain both symbolic and non-symbolic things
+        v_shape = tensor.stack(*pre_v_shape)
+
     elif shape is None:
         # The number of drawn samples will be determined automatically,
@@ -272,20 +327,23 @@ def _infer_ndim(ndim, shape, *args):
         v_shape = tensor.constant([], dtype='int64')
         if ndim is None:
             ndim = args_ndim
+        bcast = [False] * ndim
+        #TODO: retrieve broadcasting patterns of arguments
     else:
         v_shape = tensor.as_tensor_variable(shape)
         if ndim is None:
             ndim = tensor.get_vector_length(v_shape)
+        bcast = [False] * ndim
 
     if not (v_shape.dtype.startswith('int') or v_shape.dtype.startswith('uint')):
-        raise TypeError('shape must be an integer vector or list')
+        raise TypeError('shape must be an integer vector or list', v_shape.dtype)
 
     if args_ndim > ndim:
         raise ValueError('ndim should be at least as big as required by args value',
                     (ndim, args_ndim), args)
 
-    return ndim, v_shape
+    assert ndim == len(bcast)
+    return ndim, tensor.cast(v_shape, 'int32'), tuple(bcast)
 
 def _generate_broadcasting_indices(out_shape, *shapes):
     '''
@@ -549,29 +607,89 @@ def multinomial_helper(random_state, n, pvals, size):
     out = numpy.ndarray(out_size)
     broadcast_ind = _generate_broadcasting_indices(size, n.shape, pvals.shape[:-1])
     # Iterate over these indices, drawing from one multinomial at a time from numpy
+    assert pvals.min() >= 0
     for mi, ni, pi in zip(*broadcast_ind):
-        out[mi] = random_state.multinomial(n=n[ni], pvals=pvals[pi])
+        pvi = pvals[pi]
+
+        # This might someday be fixed upstream
+        # Currently numpy raises an exception in this method if the sum
+        # of probabilities meets or exceeds 1.0.
+        # In perfect arithmetic this would be correct, but in float32 or
+        # float64 it is too strict.
+        pisum = numpy.sum(pvi)
+        if 1.0 < pisum < 1.0 + 1e-5:
+            #correct if we went a little over
+            # because mtrand.pyx has a ValueError that will trigger if
+            # sum(pvals[:-1]) > 1.0
+            pvi = pvi * (1.0 - 5e-5)
+            pisum = numpy.sum(pvi)
+        elif pvi[-1] < 5e-5:
+            #will this even work?
+            pvi = pvi * (1.0 - 5e-5)
+            pisum = numpy.sum(pvi)
+        assert pisum <= 1.0, pisum
+        out[mi] = random_state.multinomial(n=n[ni], pvals=pvi.astype('float64'))
     return out
 
-def multinomial(random_state, size=None, n=1, pvals=[0.5, 0.5], ndim=None, dtype='int64'):
+def multinomial(random_state, size=None, n=1, pvals=[0.5, 0.5], ndim=None,
+        dtype='int64'):
     """
-    Sample n times from a multinomial distribution defined by
-    probabilities pvals, as many times as required by size. For
-    instance, if size=(p,q), p*q samples will be drawn, and the output
-    shape will be (p,q,len(pvals)).
+    Sample from one or more multinomial distributions defined by
+    one-dimensional slices in pvals.
+
+    Theano tries to infer the number of dimensions from the length of
+    the size argument and the shapes of n and pvals, but you may always
+    specify it with the `ndim` parameter.
+
+    :param pvals: a tensor of shape "nmulti+(L,)" describing each multinomial
+        distribution.  This tensor must have the property that
+        numpy.allclose(pvals.sum(axis=-1), 1) is true.
+
+    :param size: a vector of shape information for the output; this can also
+        specify the "nmulti" part of pvals' shape.  A -1 in the k'th position
+        from the right means to borrow the k'th position from the
+        right in nmulti. (See examples below.)
+        Default ``None`` means size=nmulti.
+
+    :param n: the number of experiments to simulate for each multinomial. This
+        can be a scalar, or tensor, it will be broadcasted to have shape "nmulti".
+
+    :param dtype: the dtype of the return value (which will represent counts)
+
    :returns: tensor of len(size)+1 dimensions, and shape[-1]==L, with the
        specified ``dtype``, with the experiment counts.  See examples to
        understand the shape of the return value, which is derived from both
        size and pvals.shape.  In return value rval,
        "numpy.allclose(rval.sum(axis=-1), n)" will be true.
+
+    For example, to simulate n experiments from each multinomial in a batch of
+    size B:
+
+        size=None, pvals.shape=(B,L) --> rval.shape=[B,L]
+
+        rval[i,j] is the count of possibility j in the i'th distribution (row)
+        in pvals.
+
+    Using size:
+
+        size=(1,-1), pvals.shape=(A,B,L)
+        --> rval.shape=[1,B,L], and requires that A==1.
+
+        rval[k,i,j] is the count of possibility j in the distribution specified
+        by pvals[k,i].
+
+    Using size for broadcasting of pvals:
+
+        size=(10, 1, -1), pvals.shape=(A,B,L)
+        --> rval.shape=[10,1,B,L], and requires that A==1.
+
+        rval[l,k,i,j] is the count of possibility j in the distribution
+        specified by pvals[k,i], in the l'th of 10 draws.
 
     .. note::
         Note that the output will then be of dimension ndim+1.
     """
     n = tensor.as_tensor_variable(n)
     pvals = tensor.as_tensor_variable(pvals)
-    ndim, size = _infer_ndim(ndim, size, n, pvals[0])
+    # until ellipsis is implemented (argh)
+    tmp = pvals.T[0].T
+    ndim, size, bcast = _infer_ndim_bcast(ndim, size, n, tmp)
+    bcast = bcast + (pvals.type.broadcastable[-1],)
     op = RandomFunction(multinomial_helper,
-            tensor.TensorType(dtype='int64', broadcastable=(False,)*(ndim+1)),
+            tensor.TensorType(dtype=dtype, broadcastable=bcast),
             ndim_added=1)
     return op(random_state, size, n, pvals)
```
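The `multinomial_helper` change above works around a strictness issue in NumPy: `RandomState.multinomial` raises a `ValueError` whenever `sum(pvals[:-1]) > 1.0`, which float32 rounding can trigger even for rows that are validly normalized. A standalone sketch of the guard; the function name is illustrative, since the in-tree code inlines this inside the broadcasting loop:

```python
import numpy

def draw_counts(random_state, n, pvi):
    pisum = numpy.sum(pvi)
    if 1.0 < pisum < 1.0 + 1e-5:
        # Went a little over 1.0: scale down slightly so that mtrand's
        # sum(pvals[:-1]) > 1.0 check cannot fire.
        pvi = pvi * (1.0 - 5e-5)
        pisum = numpy.sum(pvi)
    elif pvi[-1] < 5e-5:
        # Tiny last entry: scale down as well, mirroring the in-tree branch.
        pvi = pvi * (1.0 - 5e-5)
        pisum = numpy.sum(pvi)
    assert pisum <= 1.0, pisum
    return random_state.multinomial(n=n, pvals=pvi.astype('float64'))

rng = numpy.random.RandomState(0)
row = numpy.asarray([0.3, 0.3, 0.4], dtype='float32')
print(draw_counts(rng, 9, row))   # three counts summing to 9
```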
theano/tensor/tests/test_raw_random.py (+46 −0)

```diff
@@ -732,6 +732,52 @@ class T_random_function(unittest.TestCase):
         assert numpy.all(val2 == numpy_val2)
 
         self.assertRaises(ValueError, g, rng2, n_val[:-1], pvals_val[:-1])
 
+    def test_multinomial_tensor3_a(self):
+        # Test the examples given in the multinomial documentation regarding
+        # tensor3 objects
+        rng_R = random_state_type()
+        n = 9
+        pvals = tensor.dtensor3()
+        post_r, out = multinomial(rng_R, n=n, pvals=pvals, size=(1, -1))
+
+        assert out.ndim == 3
+        assert out.broadcastable == (True, False, False)
+
+        f = compile.function([rng_R, pvals], [post_r, out], accept_inplace=True)
+
+        rng = numpy.random.RandomState(utt.fetch_seed())
+        numpy_rng = numpy.random.RandomState(utt.fetch_seed())
+        pvals_val = numpy.asarray([[[.1, .9], [.2, .8], [.3, .7]]])
+        assert pvals_val.shape == (1, 3, 2)
+
+        new_rng, draw = f(rng, pvals_val)
+        assert draw.shape == (1, 3, 2)
+        assert numpy.allclose(draw.sum(axis=2), 9)
+
+    def test_multinomial_tensor3_b(self):
+        # Test the examples given in the multinomial documentation regarding
+        # tensor3 objects
+        rng_R = random_state_type()
+        n = 9
+        pvals = tensor.dtensor3()
+        post_r, out = multinomial(rng_R, n=n, pvals=pvals, size=(10, 1, -1))
+
+        assert out.ndim == 4
+        assert out.broadcastable == (False, True, False, False)
+
+        f = compile.function([rng_R, pvals], [post_r, out], accept_inplace=True)
+
+        rng = numpy.random.RandomState(utt.fetch_seed())
+        numpy_rng = numpy.random.RandomState(utt.fetch_seed())
+        pvals_val = numpy.asarray([[[.1, .9], [.2, .8], [.3, .7]]])
+        assert pvals_val.shape == (1, 3, 2)
+
+        out_rng, draw = f(rng, pvals_val)
+        assert draw.shape == (10, 1, 3, 2)
+        assert numpy.allclose(draw.sum(axis=3), 9)
+
     def test_dtype(self):
         rng_R = random_state_type()
         low = tensor.lscalar()
```
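The two tensor3 tests exercise the `size` examples added to the `multinomial` docstring. A NumPy mimic of the second one shows the intended shape semantics: `size=(10, 1, -1)` with `pvals.shape == (1, 3, 2)` means ten independent draws from each of the three distributions, the `-1` borrowing that axis from `pvals`:

```python
import numpy

rng = numpy.random.RandomState(42)
pvals = numpy.asarray([[[.1, .9], [.2, .8], [.3, .7]]])      # (1, 3, 2)
n = 9
draw = numpy.array([[[rng.multinomial(n, p) for p in block]  # B=3 distributions
                     for block in pvals]                     # the A=1 leading axis
                    for _ in range(10)])                     # l'th of 10 draws
assert draw.shape == (10, 1, 3, 2)
assert numpy.all(draw.sum(axis=3) == n)
```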