Commit f8414635, authored by James Bergstra

rng_mgr - GPU version

Parent ddf3109c
...@@ -9,7 +9,12 @@ import sys ...@@ -9,7 +9,12 @@ import sys
import numpy import numpy
from theano import Op, Apply, shared, config from theano import Op, Apply, shared, config
from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast, opt
from theano.compile import optdb
from theano.gof import local_optimizer
from theano.sandbox.cuda.opt import register_opt as gpu_register_opt
from theano.sandbox.cuda import cuda_enabled, CudaNdarrayType #, gpu_from_host, host_from_gpu, CudaNdarrayType
def mulmod(a, b, c, m): def mulmod(a, b, c, m):
r = numpy.int32(numpy.int64(a*b + c) % m) r = numpy.int32(numpy.int64(a*b + c) % m)
...@@ -114,8 +119,9 @@ def mrg_next_value(rstate, new_rstate): ...@@ -114,8 +119,9 @@ def mrg_next_value(rstate, new_rstate):
else: else:
return (x11 - x21) * NORM return (x11 - x21) * NORM
class mrg_uniform(Op): class mrg_uniform_base(Op):
def __init__(self, output_type, inplace=False): def __init__(self, output_type, inplace=False):
Op.__init__(self)
self.output_type = output_type self.output_type = output_type
self.inplace=inplace self.inplace=inplace
if inplace: if inplace:
...@@ -129,6 +135,18 @@ class mrg_uniform(Op): ...@@ -129,6 +135,18 @@ class mrg_uniform(Op):
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace)
def make_node(self, rstate, size):
    """Build the Apply node: (rstate, size) -> (new_rstate, sample).

    Error checking is slightly redundant here, since this op should not
    be called directly -- call through MRG_RandomStreams instead.
    """
    node_inputs = [rstate, size]
    node_outputs = [rstate.type(), self.output_type()]
    return Apply(self, node_inputs, node_outputs)
class mrg_uniform(mrg_uniform_base):
#CPU VERSION
@classmethod @classmethod
def new(cls, rstate, ndim, dtype, size): def new(cls, rstate, ndim, dtype, size):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
...@@ -137,12 +155,10 @@ class mrg_uniform(Op): ...@@ -137,12 +155,10 @@ class mrg_uniform(Op):
op = cls(TensorType(dtype, (False,)*ndim)) op = cls(TensorType(dtype, (False,)*ndim))
return op(rstate, cast(v_size, 'int32')) return op(rstate, cast(v_size, 'int32'))
def make_node(self, rstate, size):
return Apply(self,
[rstate, size],
[rstate.type(), self.output_type()])
def perform(self, node, (rstate, size), (o_rstate, o_sample)): def perform(self, node, (rstate, size), (o_rstate, o_sample)):
n_elements = 1 n_elements = 1
rstate = numpy.asarray(rstate) # bring state from GPU if necessary
if not self.inplace: if not self.inplace:
rstate = rstate.copy() rstate = rstate.copy()
...@@ -157,8 +173,8 @@ class mrg_uniform(Op): ...@@ -157,8 +173,8 @@ class mrg_uniform(Op):
sample = mrg_next_value(rstate[i%n_streams], rstate[i%n_streams]) sample = mrg_next_value(rstate[i%n_streams], rstate[i%n_streams])
rval[i] = sample rval[i] = sample
o_rstate[0] = rstate.copy() o_rstate[0] = node.outputs[0].type.filter(rstate) # send to GPU if necessary
o_sample[0] = rval.reshape(size) o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))# send to GPU if necessary
def c_code_cache_version(self): def c_code_cache_version(self):
return () return ()
...@@ -317,10 +333,223 @@ class mrg_uniform(Op): ...@@ -317,10 +333,223 @@ class mrg_uniform(Op):
//////// </ code generated by mrg_uniform> //////// </ code generated by mrg_uniform>
""" %locals() """ %locals()
class GPU_mrg_uniform(mrg_uniform_base):
    """GPU implementation of the MRG uniform sampler.

    Same contract as the CPU ``mrg_uniform``: inputs (rstate, size),
    outputs (updated rstate, sample).  All real work happens in generated
    CUDA code; the recurrence state is stored six int32 words per stream
    inside a CudaNdarray (the int32 values are smuggled through float32
    storage -- see the buffer hack noted in MRG_RandomStreams.uniform).
    """
    #GPU VERSION
    @classmethod
    def new(cls, rstate, ndim, dtype, size):
        """Alternate constructor mirroring ``mrg_uniform.new``.

        NOTE(review): ``dtype`` is accepted for interface parity with the
        CPU version but is not used below -- the output type is always a
        ``CudaNdarrayType``.  Confirm this is intentional.
        """
        v_size = as_tensor_variable(size)
        if ndim is None:
            # infer output rank from the length of the size vector
            ndim = get_vector_length(v_size)
        op = cls(CudaNdarrayType((False,)*ndim))
        return op(rstate, cast(v_size, 'int32'))

    def c_support_code_apply(self, node, nodename):
        """Return the CUDA kernel source for this apply node.

        Each thread owns one 6-word MRG state (state_data[idx*6 .. idx*6+5])
        and writes sample indices idx, idx+numThreads, ... (grid-stride
        loop over Nsamples), scaling by %(NORM)s to land in [0, 1).
        """
        if self.output_type.dtype == 'float32':
            otype = 'float'
            NORM = '4.6566126e-10f' #numpy.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # numpy.float32(number * M1) < 1.0
        else:
            otype = 'double'
            NORM = '4.656612873077392578125e-10'
        # NOTE(review): the kernel mixes `||` with the C++ alternative
        # token `or` (both compile under nvcc); `||` would be more
        # conventional.
        # NOTE(review): state_data[idx*6 + ...] is read and written
        # unconditionally -- confirm the launch configuration built in
        # c_code() can never produce idx >= n_streams, otherwise the
        # state loads/stores run past the end of the state buffer.
        return """
static __global__ void %(nodename)s_mrg_uniform(
%(otype)s*sample_data,
npy_int32*state_data,
const int Nsamples)
{
const npy_int32 i0 = 0;
const npy_int32 i7 = 7;
const npy_int32 i9 = 9;
const npy_int32 i15 = 15;
const npy_int32 i16 = 16;
const npy_int32 i22 = 22;
const npy_int32 i24 = 24;
const npy_int32 M1 = 2147483647; //2^31 - 1
const npy_int32 M2 = 2147462579; //2^31 - 21069
const npy_int32 MASK12 = 511; //2^9 - 1
const npy_int32 MASK13 = 16777215; //2^24 - 1
const npy_int32 MASK2 = 65535; //2^16 - 1
const npy_int32 MULT2 = 21069;
const unsigned int numThreads = blockDim.x * gridDim.x;
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
npy_int32 y1, y2, x11, x12, x13, x21, x22, x23;
x11 = state_data[idx*6+0];
x12 = state_data[idx*6+1];
x13 = state_data[idx*6+2];
x21 = state_data[idx*6+3];
x22 = state_data[idx*6+4];
x23 = state_data[idx*6+5];
for (int i = idx; i < Nsamples; i += numThreads)
{
y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
if ((y1 < 0 || y1 >= M1)) //must also check overflow
y1 -= M1;
y1 += x13;
if ((y1 < 0 or y1 >= M1))
y1 -= M1;
x13 = x12;
x12 = x11;
x11 = y1;
y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
if (y1 < 0 || y1 >= M2)
y1 -= M2;
y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
if (y2 < 0 || y2 >= M2)
y2 -= M2;
y2 += x23;
if (y2 < 0 || y2 >= M2)
y2 -= M2;
y2 += y1;
if (y2 < 0 or y2 >= M2)
y2 -= M2;
x23 = x22;
x22 = x21;
x21 = y2;
if (x11 <= x21) {
sample_data[i] = (x11 - x21 + M1) * %(NORM)s;
}
else
{
sample_data[i] = (x11 - x21) * %(NORM)s;
}
}
state_data[idx*6+0]= x11;
state_data[idx*6+1]= x12;
state_data[idx*6+2]= x13;
state_data[idx*6+3]= x21;
state_data[idx*6+4]= x22;
state_data[idx*6+5]= x23;
}
""" %locals()

    def c_code_cache_version(self):
        # empty tuple: the generated C code is never cached
        return ()

    def c_code(self, node, nodename, (rstate, size), (o_rstate, o_sample), sub):
        """Return the host-side C code for this apply.

        Validates the size vector (rank-1 int32 of length ndim), allocates
        or reuses the output CudaNdarray, copies the state (or aliases it
        when inplace), launches the kernel with one thread per stream,
        then synchronizes and checks for CUDA errors.
        """
        inplace = int(self.inplace)
        ndim = self.output_type.ndim
        # NOTE(review): o_type_num and otype are computed here but do not
        # appear in the template below (harmless -- interpolation is via
        # %locals()).  Confirm they are not meant to be used.
        o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
        fail = sub['fail']
        if self.output_type.dtype == 'float32':
            otype = 'float'
        else:
            otype = 'double'
        SYNC="CNDA_THREAD_SYNC";
        return """
//////// <code generated by mrg_uniform>
int odims[%(ndim)s];
int n_elements = 1;
unsigned int n_streams;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| !CudaNdarray_Check(py_%(o_sample)s)
|| (%(o_sample)s->nd != %(ndim)s));
if (%(size)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "size must be vector");
%(fail)s
}
if (%(size)s->dimensions[0] != %(ndim)s)
{
PyErr_Format(PyExc_ValueError, "size must have length %%i", %(ndim)s);
%(fail)s
}
if (%(size)s->descr->type_num != PyArray_INT32)
{
PyErr_SetString(PyExc_ValueError, "size must be int32");
%(fail)s
}
for (int i = 0; i < %(ndim)s; ++i)
{
odims[i] = ((npy_int32*)(%(size)s->data + %(size)s->strides[0] * i))[0];
n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample
|| CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
}
if (must_alloc_sample)
{
Py_XDECREF(%(o_sample)s);
%(o_sample)s = (CudaNdarray*)CudaNdarray_NewDims(%(ndim)s, odims);
if(!%(o_sample)s)
{
%(fail)s;
}
}
if (!CudaNdarray_Check(py_%(rstate)s))
{
PyErr_Format(PyExc_ValueError, "rstate must be cudandarray");
%(fail)s;
}
Py_XDECREF(%(o_rstate)s);
if (%(inplace)s)
{
Py_INCREF(%(rstate)s);
%(o_rstate)s = %(rstate)s;
}
else
{
%(o_rstate)s = (CudaNdarray*)CudaNdarray_Copy(%(rstate)s);
}
if (%(o_rstate)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "rstate must be vector");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(%(o_rstate)s)[0] %% 6)
{
PyErr_Format(PyExc_ValueError, "rstate len must be multiple of 6");
%(fail)s;
}
n_streams = CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6;
{
unsigned int threads_per_block = std::min(n_streams, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv(n_streams, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
if (threads_per_block * n_blocks < n_streams)
{
fprintf(stderr, "WARNING: unused streams above %%i (Tune GPU_mrg get_n_streams)\\n", threads_per_block * n_blocks );
}
%(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
CudaNdarray_DEV_DATA(%(o_sample)s),
(npy_int32*)CudaNdarray_DEV_DATA(%(o_rstate)s),
n_elements);
}
%(SYNC)s;
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "mrg_uniform", cudaGetErrorString(err));
%(fail)s;
}
}
//////// </ code generated by mrg_uniform>
""" %locals()
class MRG_RandomStreams(object): class MRG_RandomStreams(object):
"""Module component with similar interface to numpy.random (numpy.random.RandomState)""" """Module component with similar interface to numpy.random (numpy.random.RandomState)"""
def __init__(self, seed=None): def __init__(self, seed=None, use_cuda=None):
""" """
:type seed: None or int :type seed: None or int
...@@ -329,6 +558,10 @@ class MRG_RandomStreams(object): ...@@ -329,6 +558,10 @@ class MRG_RandomStreams(object):
""" """
super(MRG_RandomStreams, self).__init__() super(MRG_RandomStreams, self).__init__()
self.rstate = numpy.asarray([12345]*6, dtype='int32') self.rstate = numpy.asarray([12345]*6, dtype='int32')
if use_cuda is None:
self.use_cuda = cuda_enabled
else:
self.use_cuda = use_cuda
def inc_rstate(self): def inc_rstate(self):
"""Update self.rstate to be skipped 2^134 steps forward to the next stream start""" """Update self.rstate to be skipped 2^134 steps forward to the next stream start"""
...@@ -361,7 +594,6 @@ class MRG_RandomStreams(object): ...@@ -361,7 +594,6 @@ class MRG_RandomStreams(object):
node_rstate.default_update = new_rstate node_rstate.default_update = new_rstate
return sample return sample
def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX): def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX):
""" """
Sample a tensor of given size whose element from a uniform Sample a tensor of given size whose element from a uniform
...@@ -371,6 +603,24 @@ class MRG_RandomStreams(object): ...@@ -371,6 +603,24 @@ class MRG_RandomStreams(object):
ndim may be a plain integer to supplement the missing ndim may be a plain integer to supplement the missing
information. information.
""" """
if self.use_cuda and dtype=='float32':
rstates = self.get_substream_rstates(self.n_streams(size))
rstates = rstates.flatten()
# HACK - we use fact that int32 and float32 have same size to
# sneak ints into the CudaNdarray type.
# these *SHOULD NEVER BE USED AS FLOATS*
tmp_float_buf = numpy.frombuffer(rstates.data, dtype='float32')
assert tmp_float_buf.shape == rstates.shape
assert tmp_float_buf.data[:24] == rstates.data[:24]
node_rstate = shared(tmp_float_buf) # transfer to device
assert isinstance(node_rstate.type, CudaNdarrayType)
# we can't use the normal mrg_uniform constructor + later optimization
# because of the tmp_float_buf hack above. There is
# currently no Theano node that will do a frombuffer reinterpretation.
u = self.pretty_return(node_rstate,
*GPU_mrg_uniform.new(node_rstate, ndim, dtype, size))
else:
node_rstate = shared(self.get_substream_rstates(self.n_streams(size))) node_rstate = shared(self.get_substream_rstates(self.n_streams(size)))
u = self.pretty_return(node_rstate, u = self.pretty_return(node_rstate,
*mrg_uniform.new(node_rstate, ndim, dtype, size)) *mrg_uniform.new(node_rstate, ndim, dtype, size))
...@@ -380,6 +630,17 @@ class MRG_RandomStreams(object): ...@@ -380,6 +630,17 @@ class MRG_RandomStreams(object):
raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments') raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments')
return r return r
@local_optimizer([None])
def mrg_random_make_inplace(node):
    """Graph optimization: replace a non-inplace mrg_uniform node with
    an inplace instance of the same op class.

    Returns the replacement node's outputs on success, False otherwise.
    """
    old_op = node.op
    if not isinstance(old_op, mrg_uniform) or old_op.inplace:
        return False
    # op might be gpu version
    inplace_op = old_op.__class__(old_op.output_type, inplace=True)
    return inplace_op.make_node(*node.inputs).outputs


optdb.register('random_make_inplace_mrg',
               opt.in2out(mrg_random_make_inplace, ignore_newtrees=True),
               99, 'fast_run', 'inplace')
# #
# #
# #
...@@ -391,37 +652,61 @@ import theano ...@@ -391,37 +652,61 @@ import theano
def test_rng0(): def test_rng0():
def basictest(f, steps, prefix=""): def basictest(f, steps, prefix=""):
dt = 0.0
for i in xrange(steps):
t0 = time.time() t0 = time.time()
l = [f() for i in xrange(steps)] ival = f()
tt = time.time() dt += time.time() - t0
ival = numpy.asarray(ival)
if i == 0:
mean = numpy.array(ival, copy=True)
else:
alpha = 1.0 / (1+i)
mean = alpha * ival + (1-alpha)*mean
print prefix, 'mean', numpy.mean(mean)
assert abs(numpy.mean(mean) - 0.5) < .01, 'bad mean?'
print prefix, 'time', dt
print prefix, 'elements', steps*sample_size[0]*sample_size[1]
print prefix, 'samples/sec', steps*sample_size[0]*sample_size[1] / dt
if 0:
mean, std, min, max = numpy.mean(l), numpy.std(l), numpy.min(l), numpy.max(l) mean, std, min, max = numpy.mean(l), numpy.std(l), numpy.min(l), numpy.max(l)
print prefix, 'mean', mean print prefix, 'mean', mean
print prefix, 'std', std print prefix, 'std', std
print prefix, 'min', repr(min) print prefix, 'min', repr(min)
print prefix, 'max', repr(max) print prefix, 'max', repr(max)
print prefix, 'samples/sec', steps*sample_size[0]*sample_size[1] / (tt-t0)
assert max < 1.0 assert max < 1.0
assert min >= 0.0 assert min >= 0.0
assert abs(mean - 0.5) < .01, 'bad mean?' assert abs(mean - 0.5) < .01, 'bad mean?'
sample_size = (1000,100)
R = MRG_RandomStreams(234) print ''
print 'ON CPU:'
sample_size = (200,20)
R = MRG_RandomStreams(234, use_cuda=False)
u = R.uniform(size=sample_size) u = R.uniform(size=sample_size)
print "U dtype", u.dtype
f = theano.function([], u) f = theano.function([], u)
theano.printing.debugprint(f)
print 'random?[:10]\n', f()[0,0:10]
basictest(f, 1000, prefix='mrg ')
print 'random?', f()[0] print ''
print 'random?', f()[0] print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
u = R.uniform(size=sample_size)
assert u.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function([], theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u),
borrow=True))
theano.printing.debugprint(f)
print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
basictest(f, 1000, prefix='mrg ') basictest(f, 1000, prefix='mrg ')
print ''
print 'ON CPU w NUMPY:'
RR = theano.tensor.shared_randomstreams.RandomStreams(234) RR = theano.tensor.shared_randomstreams.RandomStreams(234)
uu = RR.uniform(size=sample_size) uu = RR.uniform(size=sample_size)
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed carefully.
Please finish editing this comment first!
Register or sign in to post a comment.