Commit 98c61e15, authored by abergeron

Merge pull request #1590 from nouiz/gpua_eye

[MRG]Gpuarray eye
...@@ -347,8 +347,6 @@ class NVCC_compiler(object): ...@@ -347,8 +347,6 @@ class NVCC_compiler(object):
cmd.append(os.path.split(cppfilename)[-1]) cmd.append(os.path.split(cppfilename)[-1])
cmd.extend(['-L%s' % ldir for ldir in lib_dirs]) cmd.extend(['-L%s' % ldir for ldir in lib_dirs])
cmd.extend(['-l%s' % l for l in libs]) cmd.extend(['-l%s' % l for l in libs])
if module_name != 'cuda_ndarray':
cmd.append("-lcuda_ndarray")
if sys.platform == 'darwin': if sys.platform == 'darwin':
cmd.extend(darwin_python_lib.split()) cmd.extend(darwin_python_lib.split())
......
...@@ -400,7 +400,7 @@ class CudaNdarrayType(Type): ...@@ -400,7 +400,7 @@ class CudaNdarrayType(Type):
def c_libraries(self): def c_libraries(self):
# returning cublas because the cuda_ndarray.cuh header # returning cublas because the cuda_ndarray.cuh header
# includes calls to SetVector and cublasGetError # includes calls to SetVector and cublasGetError
return ['cudart', 'cublas'] return ['cudart', 'cublas', 'cuda_ndarray']
def c_support_code(cls): def c_support_code(cls):
return "" return ""
......
...@@ -11,6 +11,7 @@ from theano.tensor.basic import Alloc ...@@ -11,6 +11,7 @@ from theano.tensor.basic import Alloc
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
import pygpu import pygpu
from pygpu import gpuarray, elemwise from pygpu import gpuarray, elemwise
...@@ -559,3 +560,115 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -559,3 +560,115 @@ class GpuReshape(HideC, tensor.Reshape):
else: else:
raise ValueError("total size of new array must be unchanged") raise ValueError("total size of new array must be unchanged")
out[0] = x.reshape(tuple(shp)) out[0] = x.reshape(tuple(shp))
class GpuEye(Op):
    """Eye for gpuarray: allocate a 2-D identity matrix on the GPU.

    Only the main diagonal is supported: `make_node` asserts that the
    diagonal offset `k` is the constant 0, and `k` is then dropped from
    the node's inputs.
    """

    def __init__(self, dtype=None):
        # Fall back to the globally configured float dtype.
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype

    def make_node(self, n, m, k):
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(dtype=self.dtype,
                             broadcastable=(False, False))

        # k != 0 isn't implemented on the GPU yet.
        assert tensor.get_scalar_constant_value(k) == 0
        # `k` is validated above and not forwarded: the node has exactly
        # two inputs, (n, m).
        return Apply(self, [n, m], [otype()])

    def infer_shape(self, node, in_shapes):
        # The output shape is simply (n, m).
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # The output is not differentiable w.r.t. its (integer) inputs.
        # BUGFIX: the node only has len(inp) == 2 inputs since `k` is
        # dropped in make_node; the previous hard-coded xrange(3)
        # indexed past the end of `inp` (IndexError on inp[2]).
        return [grad_undefined(self, i, inp[i]) for i in xrange(len(inp))]

    def __eq__(self, other):
        return type(self) == type(other) and self.dtype == other.dtype

    def __hash__(self):
        # Keep __hash__ consistent with __eq__: type and dtype.
        return hash(self.dtype) ^ hash(type(self))

    def c_headers(self):
        return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>']

    def c_support_code(self):
        # CUDA kernel that writes 1 on the main diagonal.  The output is
        # allocated C-contiguous (GA_C_ORDER in c_code), hence the
        # `i*m + i` flat index.
        dtype = self.dtype
        return """
CUdeviceptr (*cuda_get_ptr)(gpudata *g);

//TODO OPT: Only 1 block is used.
__global__ void kEye_%(dtype)s(npy_%(dtype)s* a, int n, int m) {
    int nb_elem = min(n, m);
    for (unsigned int i = threadIdx.x; i < nb_elem; i += blockDim.x) {
        a[i*m + i] = 1;
    }
}""" % locals()

    def c_init_code(self):
        # Fetch the backend extension mapping a gpudata buffer to its
        # raw CUDA device pointer.
        return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']

    def c_code(self, node, name, inp, out, sub):
        #TODO assert that the back-end is cuda!
        n, m = inp
        z, = out
        fail = sub['fail']
        dtype = self.dtype
        typecode = pygpu.gpuarray.dtype_to_typecode(dtype)
        sync = bool(config.gpuarray.sync)
        s = """
        npy_%(dtype)s* ptr;
        size_t dims[] = {0, 0};
        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        /* BUGFIX: size the memset on the actual dtype, not on float:
           sizeof(float) under-cleared 8-byte dtypes such as float64. */
        int total_size = dims[0] * dims[1] * sizeof(npy_%(dtype)s);
        cudaError_t sts;

        Py_CLEAR(%(z)s);
        %(z)s = pygpu_empty(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            pygpu_default_context(), Py_None);
        if (!%(z)s) {
            %(fail)s
        }

        /* Zero the whole buffer, then let the kernel set the diagonal. */
        ptr = (npy_%(dtype)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
                               %(z)s->ga.offset);
        sts = cudaMemset(ptr, 0, total_size);
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_MemoryError,
                         "GpuEye: Error in memset %%d bytes of device memory.",
                         total_size);
            %(fail)s;
        }

        kEye_%(dtype)s<<<1, 256>>>(ptr, dims[0], dims[1]);

        if(%(sync)d)
            GpuArray_sync(&%(z)s->ga);

        sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "Cuda error: kEye: %%s. n=%%ld, m=%%ld.",
                         cudaGetErrorString(sts),
                         (long int)dims[0], (long int)dims[1]);
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        # Bumped to (2,) for the memset sizing fix in c_code.
        return (2,)

    def c_compiler(self):
        return NVCC_compiler
...@@ -10,8 +10,10 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, ...@@ -10,8 +10,10 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host, from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
gpu_alloc, GpuReshape) gpu_from_host,
gpu_alloc, GpuReshape,
GpuEye)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuSubtensor
...@@ -232,3 +234,9 @@ def local_gpua_gemv2(node): ...@@ -232,3 +234,9 @@ def local_gpua_gemv2(node):
@op_lifter(tensor.blas.Gemm) @op_lifter(tensor.blas.Gemm)
def local_gpua_gemm(node): def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
@register_opt()
@op_lifter(tensor.basic.Eye)
def local_gpua_eye(node):
    """Lift a host-side Eye op to its GPU counterpart, preserving dtype."""
    host_op = node.op
    return GpuEye(dtype=host_op.dtype)
...@@ -35,7 +35,8 @@ from theano.sandbox.gpuarray.type import (GpuArrayType, ...@@ -35,7 +35,8 @@ from theano.sandbox.gpuarray.type import (GpuArrayType,
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host, from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, gpu_from_cuda, gpu_alloc, gpu_from_cuda,
cuda_from_gpu, HostFromGpu, cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape) GpuFromHost, GpuReshape,
GpuEye)
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
utt.seed_rng() utt.seed_rng()
...@@ -306,3 +307,32 @@ class G_reshape(T_reshape): ...@@ -306,3 +307,32 @@ class G_reshape(T_reshape):
theano.tensor.opt.Shape_i, theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector)) theano.tensor.opt.MakeVector))
assert self.op == GpuReshape assert self.op == GpuReshape
def test_gpueye():
    """Yield checks that T.eye is lifted to GpuEye and matches numpy.eye."""
    def check(dtype, N, M_=None):
        # Theano does not accept None as a tensor, so we must use a real
        # value; currently DebugMode does not support None as inputs
        # even if this is allowed.
        M = N if M_ is None else M_

        n_sym = T.iscalar()
        m_sym = T.iscalar()
        k_sym = numpy.asarray(0)
        expr = T.eye(n_sym, m_sym, k_sym, dtype=dtype)
        fn = theano.function([n_sym, m_sym],
                             expr,
                             mode=mode_with_gpu)

        got = numpy.asarray(fn(N, M))
        assert numpy.allclose(got, numpy.eye(N, M_, dtype=dtype))
        assert got.dtype == numpy.dtype(dtype)
        graph_ops = [node.op for node in fn.maker.fgraph.toposort()]
        assert any(isinstance(op, GpuEye) for op in graph_ops)

    for dtype in ['float32', 'int32']:
        # First a square case, then M != N in both directions (k = 0).
        for args in [(3,), (3, 5), (5, 3)]:
            yield (check, dtype) + args
...@@ -48,7 +48,7 @@ class GpuArrayType(Type): ...@@ -48,7 +48,7 @@ class GpuArrayType(Type):
else: else:
up_dtype = scalar.upcast(self.dtype, data.dtype) up_dtype = scalar.upcast(self.dtype, data.dtype)
if up_dtype == self.dtype: if up_dtype == self.dtype:
data = gpuarray.array(data, dtype=self.typecode, copy=False) data = gpuarray.array(data, dtype=self.dtype, copy=False)
else: else:
raise TypeError("%s cannot store a value of dtype %s " raise TypeError("%s cannot store a value of dtype %s "
"without risking loss of precision." % "without risking loss of precision." %
......
...@@ -172,7 +172,7 @@ class test_Broadcast(unittest.TestCase): ...@@ -172,7 +172,7 @@ class test_Broadcast(unittest.TestCase):
yv = rand_val(ysh) yv = rand_val(ysh)
zv = xv + yv zv = xv + yv
self.assertTrue((f(xv, yv) == zv).all()) unittest_tools.assert_allclose(f(xv, yv), zv)
#test Elemwise.infer_shape #test Elemwise.infer_shape
#the Shape op don't implement c_code! #the Shape op don't implement c_code!
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论