提交 9483c7c5 authored 作者: gdesjardins's avatar gdesjardins

Bug fix for MRG_RandomStreams on GPU:

* The number of threads on the GPU must be a multiple of NUM_VECTOR_OP_THREADS_PER_BLOCK. * This means we will have more threads on the GPU than the number of streams we actually want. * The fix makes these extra threads do nothing, instead of corrupting the stream of random samples! TODO: review the MRG_RandomStreams test suite.
上级 48dd7516
...@@ -380,7 +380,8 @@ class GPU_mrg_uniform(mrg_uniform_base): ...@@ -380,7 +380,8 @@ class GPU_mrg_uniform(mrg_uniform_base):
static __global__ void %(nodename)s_mrg_uniform( static __global__ void %(nodename)s_mrg_uniform(
%(otype)s*sample_data, %(otype)s*sample_data,
npy_int32*state_data, npy_int32*state_data,
const int Nsamples) const int Nsamples,
const int Nstreams_used)
{ {
const npy_int32 i0 = 0; const npy_int32 i0 = 0;
const npy_int32 i7 = 7; const npy_int32 i7 = 7;
...@@ -401,6 +402,8 @@ class GPU_mrg_uniform(mrg_uniform_base): ...@@ -401,6 +402,8 @@ class GPU_mrg_uniform(mrg_uniform_base):
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
npy_int32 y1, y2, x11, x12, x13, x21, x22, x23; npy_int32 y1, y2, x11, x12, x13, x21, x22, x23;
if (idx < Nstreams_used)
{
x11 = state_data[idx*6+0]; x11 = state_data[idx*6+0];
x12 = state_data[idx*6+1]; x12 = state_data[idx*6+1];
x13 = state_data[idx*6+2]; x13 = state_data[idx*6+2];
...@@ -408,7 +411,7 @@ class GPU_mrg_uniform(mrg_uniform_base): ...@@ -408,7 +411,7 @@ class GPU_mrg_uniform(mrg_uniform_base):
x22 = state_data[idx*6+4]; x22 = state_data[idx*6+4];
x23 = state_data[idx*6+5]; x23 = state_data[idx*6+5];
for (int i = idx; i < Nsamples; i += numThreads) for (int i = idx; i < Nsamples; i += Nstreams_used)
{ {
y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24); y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0; y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
...@@ -446,6 +449,7 @@ class GPU_mrg_uniform(mrg_uniform_base): ...@@ -446,6 +449,7 @@ class GPU_mrg_uniform(mrg_uniform_base):
state_data[idx*6+3]= x21; state_data[idx*6+3]= x21;
state_data[idx*6+4]= x22; state_data[idx*6+4]= x22;
state_data[idx*6+5]= x23; state_data[idx*6+5]= x23;
}
} }
""" %locals() """ %locals()
...@@ -467,7 +471,7 @@ class GPU_mrg_uniform(mrg_uniform_base): ...@@ -467,7 +471,7 @@ class GPU_mrg_uniform(mrg_uniform_base):
int odims[%(ndim)s]; int odims[%(ndim)s];
int n_elements = 1; int n_elements = 1;
unsigned int n_streams; int n_streams, n_streams_used_in_this_call;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| !CudaNdarray_Check(py_%(o_sample)s) || !CudaNdarray_Check(py_%(o_sample)s)
|| (%(o_sample)s->nd != %(ndim)s)); || (%(o_sample)s->nd != %(ndim)s));
...@@ -530,11 +534,12 @@ class GPU_mrg_uniform(mrg_uniform_base): ...@@ -530,11 +534,12 @@ class GPU_mrg_uniform(mrg_uniform_base):
PyErr_Format(PyExc_ValueError, "rstate len must be multiple of 6"); PyErr_Format(PyExc_ValueError, "rstate len must be multiple of 6");
%(fail)s; %(fail)s;
} }
n_streams = std::min(CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6, n_elements); n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6;
n_streams_used_in_this_call = std::min(n_streams, n_elements);
{ {
unsigned int threads_per_block = std::min(n_streams, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv(n_streams, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS); unsigned int n_blocks = std::min(ceil_intdiv((unsigned int)n_streams_used_in_this_call, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
if (threads_per_block * n_blocks < n_streams) if (threads_per_block * n_blocks < n_streams)
{ {
...@@ -543,7 +548,7 @@ class GPU_mrg_uniform(mrg_uniform_base): ...@@ -543,7 +548,7 @@ class GPU_mrg_uniform(mrg_uniform_base):
%(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>( %(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
CudaNdarray_DEV_DATA(%(o_sample)s), CudaNdarray_DEV_DATA(%(o_sample)s),
(npy_int32*)CudaNdarray_DEV_DATA(%(o_rstate)s), (npy_int32*)CudaNdarray_DEV_DATA(%(o_rstate)s),
n_elements); n_elements, n_streams_used_in_this_call);
} }
%(SYNC)s; %(SYNC)s;
...@@ -632,10 +637,6 @@ class MRG_RandomStreams(object): ...@@ -632,10 +637,6 @@ class MRG_RandomStreams(object):
r *= s r *= s
if r > 6: if r > 6:
r = r/6 # chosen as fastest for rbm_benchmark r = r/6 # chosen as fastest for rbm_benchmark
# make sure its a multiple of 256 so that CPU and GPU work the same way
r = numpy.ceil(r/256.) * 256
return r return r
print >> sys.stderr, "MRG_RandomStreams Can't determine #streams from size (%s), guessing 30*256"%str(size) print >> sys.stderr, "MRG_RandomStreams Can't determine #streams from size (%s), guessing 30*256"%str(size)
......
...@@ -350,7 +350,7 @@ def test_uniform(): ...@@ -350,7 +350,7 @@ def test_uniform():
for node in f.maker.env.toposort()]) for node in f.maker.env.toposort()])
theano.printing.debugprint(f) theano.printing.debugprint(f)
cpu_c_out = f(*input) cpu_c_out = f(*input)
pickle.dump(cpu_c_out, open('debug_rng_cpu_c.pkl','w')) #pickle.dump(cpu_c_out, open('debug_rng_cpu_c.pkl','w'))
print 'random?[:10]\n' print 'random?[:10]\n'
print cpu_c_out[0,0:10] print cpu_c_out[0,0:10]
...@@ -371,7 +371,7 @@ def test_uniform(): ...@@ -371,7 +371,7 @@ def test_uniform():
for node in f.maker.env.toposort()]) for node in f.maker.env.toposort()])
theano.printing.debugprint(f) theano.printing.debugprint(f)
gpu_out = numpy.asarray(f(*input)) gpu_out = numpy.asarray(f(*input))
pickle.dump(gpu_out, open('debug_rng_gpu.pkl','w')) #pickle.dump(gpu_out, open('debug_rng_gpu.pkl','w'))
print 'random?[:10]\n' print 'random?[:10]\n'
print gpu_out[0,0:10] print gpu_out[0,0:10]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论