Commit bd9e0243, authored by Arnaud Bergeron

Change GPUA_mrg over to use GpuKernelBase. (I couldn't get it to work otherwise).

Parent commit: c7c2a019
......@@ -25,6 +25,9 @@ if cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
def matVecModM(A, s, m):
# return (A * s) % m
......@@ -608,7 +611,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
return (7,)
class GPUA_mrg_uniform(mrg_uniform_base):
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
#GpuArray version
@classmethod
......@@ -620,12 +623,9 @@ class GPUA_mrg_uniform(mrg_uniform_base):
return op(rstate, cast(v_size, 'int32'))
def c_headers(self):
return ["<compyte/ext_cuda.h>"]
return GpuKernelBase.c_headers(self) + ['numpy_compat.h']
def c_init_code(self):
return ["setup_ext_cuda();"]
def c_support_code_apply(self, node, nodename):
def c_kernel_code(self, node):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
......@@ -635,32 +635,28 @@ class GPUA_mrg_uniform(mrg_uniform_base):
otype = 'double'
NORM = '4.656612873077392578125e-10'
return """
static int %(nodename)s_printed_warning = 0;
static __global__ void %(nodename)s_mrg_uniform(
%(otype)s*sample_data,
npy_int32*state_data,
const int Nsamples,
const int Nstreams_used)
KERNEL void mrg_uniform(
%(otype)s *sample_data,
ga_int *state_data,
const ga_uint Nsamples,
const ga_uint Nstreams_used)
{
const npy_int32 i0 = 0;
const npy_int32 i7 = 7;
const npy_int32 i9 = 9;
const npy_int32 i15 = 15;
const npy_int32 i16 = 16;
const npy_int32 i22 = 22;
const npy_int32 i24 = 24;
const npy_int32 M1 = 2147483647; //2^31 - 1
const npy_int32 M2 = 2147462579; //2^31 - 21069
const npy_int32 MASK12 = 511; //2^9 - 1
const npy_int32 MASK13 = 16777215; //2^24 - 1
const npy_int32 MASK2 = 65535; //2^16 - 1
const npy_int32 MULT2 = 21069;
const ga_int i7 = 7;
const ga_int i9 = 9;
const ga_int i15 = 15;
const ga_int i16 = 16;
const ga_int i22 = 22;
const ga_int i24 = 24;
const ga_int M1 = 2147483647; //2^31 - 1
const ga_int M2 = 2147462579; //2^31 - 21069
const ga_int MASK12 = 511; //2^9 - 1
const ga_int MASK13 = 16777215; //2^24 - 1
const ga_int MASK2 = 65535; //2^16 - 1
const ga_int MULT2 = 21069;
const unsigned int numThreads = blockDim.x * gridDim.x;
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
npy_int32 y1, y2, x11, x12, x13, x21, x22, x23;
ga_int y1, y2, x11, x12, x13, x21, x22, x23;
if (idx < Nstreams_used)
{
......@@ -714,6 +710,15 @@ class GPUA_mrg_uniform(mrg_uniform_base):
""" % locals()
def c_kernel_params(self, node):
    """Return the libgpuarray typecodes for the kernel's arguments.

    The mrg_uniform kernel takes, in order: the output sample buffer,
    the RNG state buffer, then two unsigned-int scalars (number of
    samples requested and number of streams used in this call).
    """
    # Two device buffers followed by two unsigned integer scalars.
    buffer_params = ["GA_BUFFER"] * 2  # sample_data, state_data
    scalar_params = ["GA_UINT"] * 2    # Nsamples, Nstreams_used
    return buffer_params + scalar_params
def c_kernel_name(self):
    """Return the name of the GPU kernel emitted by ``c_kernel_code``.

    Must match the ``KERNEL void mrg_uniform(...)`` declaration in the
    generated kernel source so GpuKernelBase can locate it.
    """
    kernel_name = "mrg_uniform"
    return kernel_name
def c_kernel_flags(self, node):
    """Return compilation flags for the kernel.

    Delegates to ``_get_kernel_flags`` (provided by GpuKernelBase) with
    the dtypes the kernel touches: the output sample dtype and the
    'int32' dtype of the RNG state buffer.
    """
    kernel_dtypes = (self.output_type.dtype, 'int32')
    return self._get_kernel_flags(*kernel_dtypes)
def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
o_rstate, o_sample = out
......@@ -721,18 +726,21 @@ class GPUA_mrg_uniform(mrg_uniform_base):
ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
kname = self.c_kernel_obj(nodename)
if self.output_type.dtype == 'float32':
otype = 'float'
otypecode = 'GA_FLOAT'
else:
otype = 'double'
otypecode = 'GA_DOUBLE'
return """
//////// <code generated by mrg_uniform>
size_t odims[%(ndim)s];
unsigned int n_elements = 1;
unsigned int n_streams, n_streams_used_in_this_call;
unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| !pygpu_GpuArray_Check(py_%(o_sample)s)
|| !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
......@@ -745,7 +753,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
}
if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
{
PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)",
PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
%(ndim)s, PyArray_DIMS(%(size)s)[0]);
%(fail)s
}
......@@ -756,7 +764,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
}
for (int i = 0; i < %(ndim)s; ++i)
{
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
odims[i] = ((npy_int32 *)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample
|| PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
......@@ -764,7 +772,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
if (must_alloc_sample)
{
Py_XDECREF(%(o_sample)s);
%(o_sample)s = pygpu_empty(%(ndim)s, odims, GA_FLOAT, GA_C_ORDER,
%(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER,
pygpu_default_context(), Py_None);
if(!%(o_sample)s)
{
......@@ -785,7 +793,7 @@ class GPUA_mrg_uniform(mrg_uniform_base):
}
else
{
%(o_rstate)s = pygpu_copy(%(rstate)s);
%(o_rstate)s = pygpu_copy(%(rstate)s, GA_ANY_ORDER);
}
if (PyGpuArray_NDIM(%(o_rstate)s) != 1)
......@@ -799,44 +807,28 @@ class GPUA_mrg_uniform(mrg_uniform_base):
%(fail)s;
}
n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0]/6;
n_streams_used_in_this_call = std::min(n_streams, n_elements);
if (n_streams > n_elements)
n_streams = n_elements;
{
unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv((unsigned int)n_streams_used_in_this_call, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
if (threads_per_block * n_blocks < n_streams)
{
if (! %(nodename)s_printed_warning)
fprintf(stderr, "WARNING: unused streams above %%i (Tune GPU_mrg get_n_streams)\\n", threads_per_block * n_blocks );
%(nodename)s_printed_warning = 1;
}
cuda_enter(pygpu_default_context()->ctx);
%(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
cuda_get_ptr(%(o_sample)s),
cuda_get_ptr(%(o_rstate)s),
n_elements, n_streams_used_in_this_call);
/* We need the full sync since we just modified libgpu
objects without informing it */
cudaDeviceSynchronize();
void *args[4];
args[0] = &%(o_sample)s->ga;
args[1] = &%(o_rstate)s->ga;
args[2] = &n_elements;
args[3] = &n_streams;
int err = GpuKernel_call(&%(kname)s, n_elements, 0, 0, args);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_call: %%s\\n",
GpuKernel_error(&%(kname)s, err));
%(fail)s
}
}
cudaError_t err = cudaGetLastError();
cuda_exit(pygpu_default_context()->ctx);
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "mrg_uniform", cudaGetErrorString(err));
%(fail)s;
}
//////// </ code generated by mrg_uniform>
""" % locals()
def c_code_cache_version(self):
return (0,)
return (1, self.GpuKernelBase_version)
def guess_n_streams(size, warn=True):
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment