Commit 98c61e15, authored by abergeron

Merge pull request #1590 from nouiz/gpua_eye

[MRG]Gpuarray eye
......@@ -347,8 +347,6 @@ class NVCC_compiler(object):
cmd.append(os.path.split(cppfilename)[-1])
cmd.extend(['-L%s' % ldir for ldir in lib_dirs])
cmd.extend(['-l%s' % l for l in libs])
if module_name != 'cuda_ndarray':
cmd.append("-lcuda_ndarray")
if sys.platform == 'darwin':
cmd.extend(darwin_python_lib.split())
......
......@@ -400,7 +400,7 @@ class CudaNdarrayType(Type):
def c_libraries(self):
# returning cublas because the cuda_ndarray.cuh header
# includes calls to SetVector and cublasGetError
return ['cudart', 'cublas']
return ['cudart', 'cublas', 'cuda_ndarray']
def c_support_code(cls):
return ""
......
......@@ -11,6 +11,7 @@ from theano.tensor.basic import Alloc
from theano.gof.python25 import all, any
from theano.gof.utils import MethodNotDefined
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try:
import pygpu
from pygpu import gpuarray, elemwise
......@@ -559,3 +560,115 @@ class GpuReshape(HideC, tensor.Reshape):
else:
raise ValueError("total size of new array must be unchanged")
out[0] = x.reshape(tuple(shp))
class GpuEye(Op):
    """Eye (identity-like) matrix for the gpuarray back-end.

    Produces a 2-D GpuArray of the given dtype with ones on the main
    diagonal and zeros elsewhere.  Only ``k == 0`` (the main diagonal)
    is supported on the GPU for now; this is enforced in ``make_node``.
    """

    def __init__(self, dtype=None):
        # Default to the globally configured float dtype, mirroring
        # tensor.basic.Eye.
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype

    def make_node(self, n, m, k):
        """Build the Apply node for eye(n, m, k).

        ``k`` must be a constant 0; it is validated here and then
        dropped, so the resulting node has only two inputs (n, m).
        """
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(dtype=self.dtype,
                             broadcastable=(False, False))

        # k != 0 isn't implemented on the GPU yet.
        assert tensor.get_scalar_constant_value(k) == 0
        return Apply(self, [n, m], [otype()])

    def infer_shape(self, node, in_shapes):
        # The output shape is exactly (n, m), the two symbolic inputs.
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # The output is not differentiable w.r.t. the integer shape
        # inputs.  NOTE: the node has only 2 inputs (n, m) -- k was
        # validated and dropped in make_node -- so iterate over the
        # actual inputs instead of a hard-coded range(3), which would
        # raise IndexError on inp[2].
        return [grad_undefined(self, i, inp[i])
                for i in xrange(len(inp))]

    def __eq__(self, other):
        # Two GpuEye ops are interchangeable iff they produce the same
        # dtype.
        return type(self) == type(other) and self.dtype == other.dtype

    def __hash__(self):
        # Must stay consistent with __eq__.
        return hash(self.dtype) ^ hash(type(self))

    def c_headers(self):
        return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>']

    def c_support_code(self):
        dtype = self.dtype
        return """
        CUdeviceptr (*cuda_get_ptr)(gpudata *g);

        /* Write 1 on the main diagonal of an n x m C-contiguous array
           that has already been zeroed out.
           TODO OPT: Only 1 block is used. */
        __global__ void kEye_%(dtype)s(npy_%(dtype)s* a, int n, int m) {
            int nb_elem = min(n, m);
            for (unsigned int i = threadIdx.x; i < nb_elem; i += blockDim.x) {
                a[i*m + i] = 1;
            }
        }""" % locals()

    def c_init_code(self):
        # Fetch the extension that exposes the raw CUDA device pointer
        # of a gpudata buffer.
        return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']

    def c_code(self, node, name, inp, out, sub):
        # TODO assert that the back-end is cuda!
        n, m = inp
        z, = out
        fail = sub['fail']
        dtype = self.dtype
        typecode = pygpu.gpuarray.dtype_to_typecode(dtype)
        sync = bool(config.gpuarray.sync)
        s = """
        npy_%(dtype)s* ptr;
        size_t dims[] = {0, 0};
        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        /* Use the element size of the actual dtype, not sizeof(float):
           with 8-byte dtypes a float-sized memset would only clear half
           of the buffer. */
        size_t total_size = dims[0] * dims[1] * sizeof(npy_%(dtype)s);
        cudaError_t sts;

        Py_CLEAR(%(z)s);
        %(z)s = pygpu_empty(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            pygpu_default_context(), Py_None);
        if (!%(z)s) {
            %(fail)s
        }
        ptr = (npy_%(dtype)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
                               %(z)s->ga.offset);

        /* Zero the whole array, then let the kernel set the diagonal. */
        sts = cudaMemset(ptr, 0, total_size);
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_MemoryError,
                         "GpuEye: Error in memset %%lu bytes of device memory.",
                         (unsigned long)total_size);
            %(fail)s;
        }

        kEye_%(dtype)s<<<1, 256>>>(ptr, dims[0], dims[1]);

        if(%(sync)d)
            GpuArray_sync(&%(z)s->ga);
        sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
               PyErr_Format(PyExc_RuntimeError,
                            "Cuda error: kEye: %%s. n=%%ld, m=%%ld.",
                            cudaGetErrorString(sts),
                            (long int)dims[0], (long int)dims[1]);
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        # Bumped: memset size now depends on the dtype's element size.
        return (2,)

    def c_compiler(self):
        # The generated code contains a CUDA kernel, so it must go
        # through nvcc.
        return NVCC_compiler
......@@ -10,8 +10,10 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, GpuReshape)
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
gpu_from_host,
gpu_alloc, GpuReshape,
GpuEye)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
......@@ -232,3 +234,9 @@ def local_gpua_gemv2(node):
@op_lifter(tensor.blas.Gemm)
def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace)
@register_opt()
@op_lifter(tensor.basic.Eye)
def local_gpua_eye(node):
    # Lift tensor.basic.Eye to the gpuarray back-end, preserving dtype.
    # NOTE(review): GpuEye.make_node asserts k == 0 -- presumably
    # op_lifter/register_opt abort the replacement when that fails;
    # TODO confirm non-zero-k graphs are left on the CPU.
    return GpuEye(dtype=node.op.dtype)
......@@ -35,7 +35,8 @@ from theano.sandbox.gpuarray.type import (GpuArrayType,
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape)
GpuFromHost, GpuReshape,
GpuEye)
from theano.tests import unittest_tools as utt
utt.seed_rng()
......@@ -306,3 +307,32 @@ class G_reshape(T_reshape):
theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector))
assert self.op == GpuReshape
def test_gpueye():
    """Compare GpuEye's output against numpy.eye and verify that the
    GPU op was actually used in the compiled graph."""
    def check(dtype, N, M_=None):
        # Theano does not accept None as a tensor input, so a concrete
        # value must be substituted (DebugMode also rejects None even
        # where it would otherwise be allowed).
        M = N if M_ is None else M_

        n_sym = T.iscalar()
        m_sym = T.iscalar()
        # k stays fixed at 0: only the main diagonal is supported on
        # the GPU.
        k_val = numpy.asarray(0)
        graph_out = T.eye(n_sym, m_sym, k_val, dtype=dtype)
        fn = theano.function([n_sym, m_sym],
                             graph_out,
                             mode=mode_with_gpu)
        res = numpy.asarray(fn(N, M))

        assert numpy.allclose(res, numpy.eye(N, M_, dtype=dtype))
        assert res.dtype == numpy.dtype(dtype)
        assert any(isinstance(node.op, GpuEye)
                   for node in fn.maker.fgraph.toposort())

    for dtype in ['float32', 'int32']:
        # square case
        yield check, dtype, 3
        # rectangular cases, still k == 0
        yield check, dtype, 3, 5
        yield check, dtype, 5, 3
......@@ -48,7 +48,7 @@ class GpuArrayType(Type):
else:
up_dtype = scalar.upcast(self.dtype, data.dtype)
if up_dtype == self.dtype:
data = gpuarray.array(data, dtype=self.typecode, copy=False)
data = gpuarray.array(data, dtype=self.dtype, copy=False)
else:
raise TypeError("%s cannot store a value of dtype %s "
"without risking loss of precision." %
......
......@@ -172,7 +172,7 @@ class test_Broadcast(unittest.TestCase):
yv = rand_val(ysh)
zv = xv + yv
self.assertTrue((f(xv, yv) == zv).all())
unittest_tools.assert_allclose(f(xv, yv), zv)
#test Elemwise.infer_shape
#the Shape op don't implement c_code!
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论