Commit aed3540c authored by Frederic

Add GpuEye

Parent 522eb45e
...@@ -3106,3 +3106,95 @@ def profile_printer(fct_name, compile_time, fct_call_time, fct_call, ...@@ -3106,3 +3106,95 @@ def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
for i in node.inputs]), for i in node.inputs]),
print str([getattr(i, 'dtype', None) print str([getattr(i, 'dtype', None)
for i in node.outputs]) for i in node.outputs])
class GpuEye(GpuOp):
    """Build a 2-D identity matrix on the GPU.

    GPU equivalent of ``tensor.eye`` restricted to float32 output and a
    zero diagonal offset (k == 0).
    """
    def __init__(self, dtype=None):
        if dtype is None:
            dtype = config.floatX
        # The CUDA kernel writes float values, so only float32 output is
        # supported.
        assert dtype == 'float32'
        self.dtype = dtype

    def make_node(self, n, m, k):
        """Return an Apply node computing an n x m identity matrix.

        n, m and k must be scalar (0-d) tensors.  k is the diagonal
        offset; only the constant 0 is implemented on the GPU, so k is
        verified here and then dropped from the node inputs.
        """
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0

        # k != 0 isn't implemented on the GPU yet.
        assert tensor.get_scalar_constant_value(k) == 0
        return Apply(self, [n, m], [matrix(dtype=self.dtype)])

    def infer_shape(self, node, in_shapes):
        # The output shape is exactly the two scalar inputs (n, m).
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # The gradient is undefined w.r.t. every (integer) input.
        # Iterate over the actual inputs: the node only has two (n, m)
        # because k is dropped in make_node, so a hard-coded xrange(3)
        # would index past the end of inp.
        return [grad_undefined(self, i, inp[i]) for i in xrange(len(inp))]

    def __eq__(self, other):
        return type(self) == type(other) and self.dtype == other.dtype

    def __hash__(self):
        return hash(self.dtype) ^ hash(type(self))

    def c_support_code_apply(self, node, nodename):
        # Kernel: set the min(n, m) diagonal entries of a zeroed n x m
        # buffer to 1.  Only 1 block is used.
        return """
        __global__ void kEye(float* a, int n, int m) {
            int nb_elem = min(n, m);
            for (unsigned int i = threadIdx.x; i < nb_elem; i += blockDim.x) {
                a[i*m + i] = 1;
            }
        }"""

    def c_code(self, node, name, inp, out, sub):
        n, m = inp
        z, = out
        fail = sub['fail']
        s = """
        int dims[] = {0, 0};

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        int total_size = dims[0] * dims[1] * sizeof(float);
        cudaError_t sts;
        void * orig_z = %(z)s;

        if (CudaNdarray_prep_output(&%(z)s, 2, dims))
        {
            %(fail)s;
        }

        sts = cudaMemset(CudaNdarray_DEV_DATA(%(z)s), 0, total_size);
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_MemoryError,
                         "GpuEye: Error in memset %%d bytes of device memory.",
                         total_size);
            // Only drop the output if we allocated it ourselves above.
            if(orig_z == NULL)
                Py_XDECREF(%(z)s);
            %(fail)s;
        }

        kEye<<<1, 256>>>(CudaNdarray_DEV_DATA(%(z)s), dims[0], dims[1]);
        CNDA_THREAD_SYNC;
        sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "Cuda error: kEye: %%s. n=%%d, m=%%d.",
                         cudaGetErrorString(sts),
                         dims[0], dims[1]);
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        return (2,)

# Singleton instance used by the graph optimizations below.
gpu_eye = GpuEye(dtype='float32')
...@@ -1354,6 +1354,27 @@ def local_gpualloc_memset_0(node): ...@@ -1354,6 +1354,27 @@ def local_gpualloc_memset_0(node):
return [new_out] return [new_out]
@register_opt()
@local_optimizer([])
def local_gpu_eye(node):
    """Move a float32 Eye (identity matrix) op to the GPU.

    gpu_from_host(eye) -> gpueye(gpu_from_host)
    eye(host_from_gpu) -> host_from_gpu(gpueye)

    Only float32 is handled, since GpuEye only supports float32 output.
    Returns the replacement outputs, or False when the rewrite does not
    apply.
    """
    if node.op == gpu_from_host:
        # Case 1: the result of a float32 Eye is transferred to the GPU;
        # build it directly on the GPU instead.
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, tensor.Eye) and
            host_input.owner.op.dtype == "float32"):
            return [gpu_eye(*host_input.owner.inputs)]
    if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
        # Case 2: some input of a float32 Eye already lives on the GPU;
        # compute on the GPU and transfer the result back.
        # Builtin any() on a generator instead of numpy.any on a
        # temporary list; both treat a None owner as falsy.
        if any(i.owner and i.owner.op == host_from_gpu
               for i in node.inputs):
            return [host_from_gpu(gpu_eye(*node.inputs))]
    return False
def safe_to_gpu(x): def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
......
...@@ -1137,6 +1137,35 @@ def test_shared_cudandarray(): ...@@ -1137,6 +1137,35 @@ def test_shared_cudandarray():
assert isinstance(a.type, tcn.CudaNdarrayType) assert isinstance(a.type, tcn.CudaNdarrayType)
def test_gpueye():
    """Yield checks that T.eye is replaced by GpuEye on the GPU."""
    def check(dtype, N, M_=None):
        # Theano does not accept None as a tensor, so substitute a real
        # value.  DebugMode also does not currently support None inputs
        # even where they would otherwise be allowed.
        M = N if M_ is None else M_

        N_symb = T.iscalar()
        M_symb = T.iscalar()
        k_symb = numpy.asarray(0)
        out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
        f = theano.function([N_symb, M_symb],
                            B.as_cuda_ndarray_variable(out),
                            mode=mode_with_gpu)

        result = numpy.asarray(f(N, M))
        # Values and dtype must match the CPU reference implementation.
        assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype))
        assert result.dtype == numpy.dtype(dtype)
        # The optimization must have introduced a GpuEye node.
        assert any(isinstance(node.op, B.GpuEye)
                   for node in f.maker.fgraph.toposort())

    for dtype in ['float32']:
        yield check, dtype, 3
        # Rectangular cases (M != N), k = 0.
        yield check, dtype, 3, 5
        yield check, dtype, 5, 3
class test_size(unittest.TestCase): class test_size(unittest.TestCase):
""" """
......
Markdown format supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Please finish editing this comment first!
Register or sign in to comment