提交 0cd038b6 authored 作者: Frederic's avatar Frederic

First version of GpuEye for cuda only.

上级 85aede29
...@@ -11,6 +11,7 @@ from theano.tensor.basic import Alloc ...@@ -11,6 +11,7 @@ from theano.tensor.basic import Alloc
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
import pygpu import pygpu
from pygpu import gpuarray, elemwise from pygpu import gpuarray, elemwise
...@@ -561,11 +562,10 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -561,11 +562,10 @@ class GpuReshape(HideC, tensor.Reshape):
out[0] = x.reshape(tuple(shp)) out[0] = x.reshape(tuple(shp))
class GpuEye(GpuOp): class GpuEye(Op):
def __init__(self, dtype=None): def __init__(self, dtype=None):
if dtype is None: if dtype is None:
dtype = config.floatX dtype = config.floatX
assert dtype == 'float32'
self.dtype = dtype self.dtype = dtype
def make_node(self, n, m, k): def make_node(self, n, m, k):
...@@ -575,10 +575,12 @@ class GpuEye(GpuOp): ...@@ -575,10 +575,12 @@ class GpuEye(GpuOp):
assert n.ndim == 0 assert n.ndim == 0
assert m.ndim == 0 assert m.ndim == 0
assert k.ndim == 0 assert k.ndim == 0
otype = GpuArrayType(dtype=self.dtype,
broadcastable=(False, False))
# k != 0 isn't implemented on the GPU yet. # k != 0 isn't implemented on the GPU yet.
assert tensor.get_scalar_constant_value(k) == 0 assert tensor.get_scalar_constant_value(k) == 0
return Apply(self, [n, m], [matrix(dtype=self.dtype)]) return Apply(self, [n, m], [otype()])
def infer_shape(self, node, in_shapes): def infer_shape(self, node, in_shapes):
out_shape = [node.inputs[0], node.inputs[1]] out_shape = [node.inputs[0], node.inputs[1]]
...@@ -593,55 +595,71 @@ class GpuEye(GpuOp): ...@@ -593,55 +595,71 @@ class GpuEye(GpuOp):
def __hash__(self): def __hash__(self):
return hash(self.dtype) ^ hash(type(self)) return hash(self.dtype) ^ hash(type(self))
def c_headers(self):
return ['cuda.h', '<compyte/extension.h>']
def c_support_code(self): def c_support_code(self):
dtype = self.dtype
return """ return """
//Only 1 block is used. CUdeviceptr (*cuda_get_ptr)(gpudata *g);
__global__ void kEye(float* a, int n, int m) {
//TODO OPT: Only 1 block is used.
__global__ void kEye_%(dtype)s(npy_%(dtype)s* a, int n, int m) {
int nb_elem = min(n, m); int nb_elem = min(n, m);
for (unsigned int i = threadIdx.x; i < nb_elem; i += blockDim.x) { for (unsigned int i = threadIdx.x; i < nb_elem; i += blockDim.x) {
a[i*m + i] = 1; a[i*m + i] = 1;
} }
}""" }""" % locals()
def c_init_code(self):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
#TODO assert that the back-end is cuda!
n, m = inp n, m = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
dtype = self.dtype
typecode = numpy.dtype(self.dtype).num
sync = bool(config.gpuarray.sync)
s = """ s = """
int dims[] = {0, 0}; npy_%(dtype)s* ptr;
size_t dims[] = {0, 0};
dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0]; dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0]; dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
int total_size = dims[0] * dims[1] * sizeof(float); int total_size = dims[0] * dims[1] * sizeof(float);
cudaError_t sts; cudaError_t sts;
void * orig_z = %(z)s; Py_XDECREF(%(z)s);
%(z)s = pygpu_empty(2, dims,
if (CudaNdarray_prep_output(&%(z)s, 2, dims)) %(typecode)d, GA_C_ORDER,
{ pygpu_default_context(), Py_None);
%(fail)s; if (!%(z)s) {
%(fail)s
} }
ptr = (npy_%(dtype)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
sts = cudaMemset(CudaNdarray_DEV_DATA(%(z)s), 0, total_size); %(z)s->ga.offset);
sts = cudaMemset(ptr, 0, total_size);
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
PyErr_Format(PyExc_MemoryError, PyErr_Format(PyExc_MemoryError,
"GpuEye: Error in memset %%d bytes of device memory.", "GpuEye: Error in memset %%d bytes of device memory.",
total_size); total_size);
if(orig_z == NULL)
Py_XDECREF(%(z)s);
%(fail)s; %(fail)s;
} }
kEye<<<1, 256>>>(CudaNdarray_DEV_DATA(%(z)s), dims[0], dims[1]); kEye_%(dtype)s<<<1, 256>>>(ptr, dims[0], dims[1]);
CNDA_THREAD_SYNC;
if(%(sync)d)
GpuArray_sync(&%(z)s->ga);
sts = cudaGetLastError(); sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Cuda error: kEye: %%s. n=%%d, m=%%d.", "Cuda error: kEye: %%s. n=%%ld, m=%%ld.",
cudaGetErrorString(sts), cudaGetErrorString(sts),
dims[0], dims[1]); (long int)dims[0], (long int)dims[1]);
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
...@@ -649,5 +667,7 @@ __global__ void kEye(float* a, int n, int m) { ...@@ -649,5 +667,7 @@ __global__ void kEye(float* a, int n, int m) {
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (1,)
gpu_eye = GpuEye(dtype='float32')
def c_compiler(self):
return NVCC_compiler
...@@ -10,8 +10,10 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, ...@@ -10,8 +10,10 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host, from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
gpu_alloc, GpuReshape) gpu_from_host,
gpu_alloc, GpuReshape,
GpuEye)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuSubtensor
...@@ -232,3 +234,9 @@ def local_gpua_gemv2(node): ...@@ -232,3 +234,9 @@ def local_gpua_gemv2(node):
@op_lifter(tensor.blas.Gemm) @op_lifter(tensor.blas.Gemm)
def local_gpua_gemm(node): def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
@register_opt()
@op_lifter(tensor.basic.Eye)
def local_gpua_eye(node):
return GpuEye(dtype=node.op.dtype)
...@@ -35,7 +35,8 @@ from theano.sandbox.gpuarray.type import (GpuArrayType, ...@@ -35,7 +35,8 @@ from theano.sandbox.gpuarray.type import (GpuArrayType,
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host, from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, gpu_from_cuda, gpu_alloc, gpu_from_cuda,
cuda_from_gpu, HostFromGpu, cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape) GpuFromHost, GpuReshape,
GpuEye)
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
utt.seed_rng() utt.seed_rng()
...@@ -322,15 +323,15 @@ def test_gpueye(): ...@@ -322,15 +323,15 @@ def test_gpueye():
k_symb = numpy.asarray(0) k_symb = numpy.asarray(0)
out = T.eye(N_symb, M_symb, k_symb, dtype=dtype) out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
f = theano.function([N_symb, M_symb], f = theano.function([N_symb, M_symb],
B.as_cuda_ndarray_variable(out), out,
mode=mode_with_gpu) mode=mode_with_gpu)
result = numpy.asarray(f(N, M)) result = numpy.asarray(f(N, M))
assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype)) assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype))
assert result.dtype == numpy.dtype(dtype) assert result.dtype == numpy.dtype(dtype)
assert any([isinstance(node.op, B.GpuEye) assert any([isinstance(node.op, GpuEye)
for node in f.maker.fgraph.toposort()]) for node in f.maker.fgraph.toposort()])
for dtype in ['float32']: for dtype in ['float32', 'int32']:
yield check, dtype, 3 yield check, dtype, 3
# M != N, k = 0 # M != N, k = 0
yield check, dtype, 3, 5 yield check, dtype, 3, 5
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论