提交 bd9e0243 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Change GPUA_mrg over to use GpuKernelBase. (I couldn't get it to work otherwise).

上级 c7c2a019
...@@ -25,6 +25,9 @@ if cuda_available: ...@@ -25,6 +25,9 @@ if cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType, from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor) float32_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
def matVecModM(A, s, m): def matVecModM(A, s, m):
# return (A * s) % m # return (A * s) % m
...@@ -608,7 +611,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -608,7 +611,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
return (7,) return (7,)
class GPUA_mrg_uniform(mrg_uniform_base): class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
#GpuArray version #GpuArray version
@classmethod @classmethod
...@@ -620,12 +623,9 @@ class GPUA_mrg_uniform(mrg_uniform_base): ...@@ -620,12 +623,9 @@ class GPUA_mrg_uniform(mrg_uniform_base):
return op(rstate, cast(v_size, 'int32')) return op(rstate, cast(v_size, 'int32'))
def c_headers(self): def c_headers(self):
return ["<compyte/ext_cuda.h>"] return GpuKernelBase.c_headers(self) + ['numpy_compat.h']
def c_init_code(self): def c_kernel_code(self, node):
return ["setup_ext_cuda();"]
def c_support_code_apply(self, node, nodename):
if self.output_type.dtype == 'float32': if self.output_type.dtype == 'float32':
otype = 'float' otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65)) NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
...@@ -635,32 +635,28 @@ class GPUA_mrg_uniform(mrg_uniform_base): ...@@ -635,32 +635,28 @@ class GPUA_mrg_uniform(mrg_uniform_base):
otype = 'double' otype = 'double'
NORM = '4.656612873077392578125e-10' NORM = '4.656612873077392578125e-10'
return """ return """
static int %(nodename)s_printed_warning = 0; KERNEL void mrg_uniform(
%(otype)s *sample_data,
static __global__ void %(nodename)s_mrg_uniform( ga_int *state_data,
%(otype)s*sample_data, const ga_uint Nsamples,
npy_int32*state_data, const ga_uint Nstreams_used)
const int Nsamples,
const int Nstreams_used)
{ {
const npy_int32 i0 = 0; const ga_int i7 = 7;
const npy_int32 i7 = 7; const ga_int i9 = 9;
const npy_int32 i9 = 9; const ga_int i15 = 15;
const npy_int32 i15 = 15; const ga_int i16 = 16;
const npy_int32 i16 = 16; const ga_int i22 = 22;
const npy_int32 i22 = 22; const ga_int i24 = 24;
const npy_int32 i24 = 24;
const ga_int M1 = 2147483647; //2^31 - 1
const ga_int M2 = 2147462579; //2^31 - 21069
const ga_int MASK12 = 511; //2^9 - 1
const ga_int MASK13 = 16777215; //2^24 - 1
const ga_int MASK2 = 65535; //2^16 - 1
const ga_int MULT2 = 21069;
const npy_int32 M1 = 2147483647; //2^31 - 1
const npy_int32 M2 = 2147462579; //2^31 - 21069
const npy_int32 MASK12 = 511; //2^9 - 1
const npy_int32 MASK13 = 16777215; //2^24 - 1
const npy_int32 MASK2 = 65535; //2^16 - 1
const npy_int32 MULT2 = 21069;
const unsigned int numThreads = blockDim.x * gridDim.x;
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
npy_int32 y1, y2, x11, x12, x13, x21, x22, x23; ga_int y1, y2, x11, x12, x13, x21, x22, x23;
if (idx < Nstreams_used) if (idx < Nstreams_used)
{ {
...@@ -714,6 +710,15 @@ class GPUA_mrg_uniform(mrg_uniform_base): ...@@ -714,6 +710,15 @@ class GPUA_mrg_uniform(mrg_uniform_base):
""" % locals() """ % locals()
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_BUFFER", "GA_UINT", "GA_UINT"]
def c_kernel_name(self):
return "mrg_uniform"
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.output_type.dtype, 'int32')
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp rstate, size = inp
o_rstate, o_sample = out o_rstate, o_sample = out
...@@ -721,18 +726,21 @@ class GPUA_mrg_uniform(mrg_uniform_base): ...@@ -721,18 +726,21 @@ class GPUA_mrg_uniform(mrg_uniform_base):
ndim = self.output_type.ndim ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail'] fail = sub['fail']
kname = self.c_kernel_obj(nodename)
if self.output_type.dtype == 'float32': if self.output_type.dtype == 'float32':
otype = 'float' otype = 'float'
otypecode = 'GA_FLOAT'
else: else:
otype = 'double' otype = 'double'
otypecode = 'GA_DOUBLE'
return """ return """
//////// <code generated by mrg_uniform> //////// <code generated by mrg_uniform>
size_t odims[%(ndim)s]; size_t odims[%(ndim)s];
unsigned int n_elements = 1; unsigned int n_elements = 1;
unsigned int n_streams, n_streams_used_in_this_call; unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| !pygpu_GpuArray_Check(py_%(o_sample)s) || !pygpu_GpuArray_Check(py_%(o_sample)s)
|| !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS) || !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
...@@ -745,7 +753,7 @@ class GPUA_mrg_uniform(mrg_uniform_base): ...@@ -745,7 +753,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
} }
if (PyArray_DIMS(%(size)s)[0] != %(ndim)s) if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
{ {
PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)", PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
%(ndim)s, PyArray_DIMS(%(size)s)[0]); %(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(fail)s %(fail)s
} }
...@@ -756,7 +764,7 @@ class GPUA_mrg_uniform(mrg_uniform_base): ...@@ -756,7 +764,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
} }
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(ndim)s; ++i)
{ {
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0]; odims[i] = ((npy_int32 *)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
n_elements *= odims[i]; n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample must_alloc_sample = (must_alloc_sample
|| PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]); || PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
...@@ -764,7 +772,7 @@ class GPUA_mrg_uniform(mrg_uniform_base): ...@@ -764,7 +772,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
if (must_alloc_sample) if (must_alloc_sample)
{ {
Py_XDECREF(%(o_sample)s); Py_XDECREF(%(o_sample)s);
%(o_sample)s = pygpu_empty(%(ndim)s, odims, GA_FLOAT, GA_C_ORDER, %(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if(!%(o_sample)s) if(!%(o_sample)s)
{ {
...@@ -785,7 +793,7 @@ class GPUA_mrg_uniform(mrg_uniform_base): ...@@ -785,7 +793,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
} }
else else
{ {
%(o_rstate)s = pygpu_copy(%(rstate)s); %(o_rstate)s = pygpu_copy(%(rstate)s, GA_ANY_ORDER);
} }
if (PyGpuArray_NDIM(%(o_rstate)s) != 1) if (PyGpuArray_NDIM(%(o_rstate)s) != 1)
...@@ -799,44 +807,28 @@ class GPUA_mrg_uniform(mrg_uniform_base): ...@@ -799,44 +807,28 @@ class GPUA_mrg_uniform(mrg_uniform_base):
%(fail)s; %(fail)s;
} }
n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0]/6; n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0]/6;
n_streams_used_in_this_call = std::min(n_streams, n_elements); if (n_streams > n_elements)
n_streams = n_elements;
{ {
unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); void *args[4];
unsigned int n_blocks = std::min(ceil_intdiv((unsigned int)n_streams_used_in_this_call, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS); args[0] = &%(o_sample)s->ga;
args[1] = &%(o_rstate)s->ga;
if (threads_per_block * n_blocks < n_streams) args[2] = &n_elements;
{ args[3] = &n_streams;
if (! %(nodename)s_printed_warning) int err = GpuKernel_call(&%(kname)s, n_elements, 0, 0, args);
fprintf(stderr, "WARNING: unused streams above %%i (Tune GPU_mrg get_n_streams)\\n", threads_per_block * n_blocks ); if (err != GA_NO_ERROR) {
%(nodename)s_printed_warning = 1; PyErr_Format(PyExc_RuntimeError, "GpuKernel_call: %%s\\n",
} GpuKernel_error(&%(kname)s, err));
cuda_enter(pygpu_default_context()->ctx); %(fail)s
%(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
cuda_get_ptr(%(o_sample)s),
cuda_get_ptr(%(o_rstate)s),
n_elements, n_streams_used_in_this_call);
/* We need the full sync since we just modified libgpu
objects without informing it */
cudaDeviceSynchronize();
} }
cudaError_t err = cudaGetLastError();
cuda_exit(pygpu_default_context()->ctx);
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "mrg_uniform", cudaGetErrorString(err));
%(fail)s;
} }
//////// </ code generated by mrg_uniform> //////// </ code generated by mrg_uniform>
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1, self.GpuKernelBase_version)
def guess_n_streams(size, warn=True): def guess_n_streams(size, warn=True):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论