提交 a388d94d authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Remove tentacles in sandbox

上级 9dcf3f4c
...@@ -10,12 +10,6 @@ from theano.tensor import NotScalarConstantError, get_scalar_constant_value ...@@ -10,12 +10,6 @@ from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.scalar import as_scalar from theano.scalar import as_scalar
import copy import copy
from theano.sandbox.cuda import cuda_available, GpuOp, register_opt
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
class MultinomialFromUniform(Op): class MultinomialFromUniform(Op):
# TODO : need description for parameter 'odtype' # TODO : need description for parameter 'odtype'
""" """
...@@ -403,232 +397,6 @@ class ChoiceFromUniform(MultinomialFromUniform): ...@@ -403,232 +397,6 @@ class ChoiceFromUniform(MultinomialFromUniform):
break break
class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
    """
    GPU implementation of MultinomialFromUniform.

    The output is transposed compared to MultinomialFromUniform.
    We must insert a Transpose op after it.

    The optimization that moves it to the gpu does it.
    """

    def make_node(self, pvals, unis):
        """Build the Apply node.

        Both inputs must be float32 CudaNdarrays.  The output type swaps
        the two broadcastable flags of ``pvals`` because the kernel
        writes its result transposed (see class docstring).
        """
        assert pvals.dtype == 'float32'
        assert unis.dtype == 'float32'
        if not isinstance(pvals.type, CudaNdarrayType):
            raise TypeError('pvals must be cudandarray', pvals)
        if not isinstance(unis.type, CudaNdarrayType):
            raise TypeError('unis must be cudandarray', unis)
        # 'auto' means: produce the same dtype as pvals.
        if self.odtype == 'auto':
            odtype = pvals.dtype
        else:
            odtype = self.odtype
        if odtype != pvals.dtype:
            raise NotImplementedError(
                'GpuMultinomialFromUniform works only if '
                'self.odtype == pvals.dtype', odtype, pvals.dtype)
        # Output shape is (pvals.shape[1], pvals.shape[0]) -- transposed.
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = CudaNdarrayType(broadcastable=br)()
        return Apply(self, [pvals, unis], [out])

    def perform(self, node, ins, outs):
        # The parent's perform() does not work with CudaNdarray inputs.
        # We don't need a Python implementation here: this op is only
        # inserted by a graph optimization, and DebugMode tests it
        # against the original CPU node instead.
        return Op.perform(self, node, ins, outs)

    def c_code_cache_version(self):
        # Bump this whenever the generated C/CUDA code below changes.
        return (9,)

    def c_support_code_apply(self, node, nodename):
        """Return the CUDA kernel source for this apply node.

        One GPU thread handles one multinomial draw (one row of pvals):
        it scans the outcomes, accumulating probabilities until the
        uniform sample is exceeded, then writes a one-hot row.  The
        result is written transposed (see the comment in the kernel).
        """
        return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_multi,
            const int nb_outcomes,
            float * global_pvals,
            const int pvals_row_stride,
            const int pvals_col_stride,
            float * global_unis,
            const int unis_stride,
            float * global_outs,
            const int outs_row_stride,
            const int outs_col_stride
        )
        {
            // each thread takes care of one multinomial draw
            int n = blockDim.x*blockIdx.x + threadIdx.x;
            if (n < nb_multi)
            {
                float cummul = 0.;
                bool done = false;
                const float unis_n = global_unis[n*unis_stride];
                for (int m = 0; m < nb_outcomes; ++m)
                {
                    float current_out = 0.;
                    if (!done)
                    {
                        cummul += global_pvals[m * pvals_col_stride + n * pvals_row_stride];
                        if (unis_n < cummul)
                        {
                            current_out = 1.;
                            done = true;
                        }
                    }
                    //write out transposed for speed.
                    global_outs[n * outs_col_stride + m * outs_row_stride] = current_out;
                }
            }
        }
        """ % locals()

    def c_code(self, node, name, ins, outs, sub):
        """Emit the C code that validates inputs, (re)allocates the
        transposed output, computes the launch configuration and runs
        the kernel from c_support_code_apply.

        NOTE(review): in the C below, ``2<<15 - 1`` parses as
        ``2 << 14 == 32768`` because of C operator precedence;
        presumably ``(2<<15) - 1`` was intended -- confirm before
        touching the generated code.
        """
        (pvals, unis) = ins
        (z,) = outs
        fail = sub['fail']
        return """
        if (CudaNdarray_NDIM(%(pvals)s) != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(unis)s) != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
            %(fail)s;
        }
        //N.B. that the output is TRANSPOSED compared with pvals
        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]))
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
            %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }
        { // NESTED SCOPE
        int nb_multi = CudaNdarray_HOST_DIMS(%(pvals)s)[0];
        int nb_outcomes = CudaNdarray_HOST_DIMS(%(pvals)s)[1];
        //TODO : change this for a beautiful constant
        int max_nb_blocks = 2<<15 - 1;
        int nb_blocks = max_nb_blocks + 1;
        int nb_threads=16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads*=2;
            if (nb_multi %% nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);
        //printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi);
            %(fail)s;
        }
        dim3 n_blocks(nb_blocks,1,1);
        dim3 n_threads(nb_threads,1,1);
        int n_shared = 0;
        assert(nb_blocks*nb_threads >= nb_multi);
        k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
            CudaNdarray_HOST_DIMS(%(z)s)[1],
            CudaNdarray_HOST_DIMS(%(z)s)[0],
            CudaNdarray_DEV_DATA(%(pvals)s),
            CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
            CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
            CudaNdarray_DEV_DATA(%(unis)s),
            CudaNdarray_HOST_STRIDES(%(unis)s)[0],
            CudaNdarray_DEV_DATA(%(z)s),
            CudaNdarray_HOST_STRIDES(%(z)s)[0],
            CudaNdarray_HOST_STRIDES(%(z)s)[1]
        );
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
                "k_multi_warp_%(name)s",
                cudaGetErrorString(sts),
                n_blocks.x,
                n_blocks.y,
                n_threads.x,
                n_threads.y,
                n_threads.z,
                n_shared);
            %(fail)s;
        }
        } // END NESTED SCOPE
        """ % locals()
@register_opt()
@local_optimizer([MultinomialFromUniform])
def local_gpu_multinomial(node):
    """Graph optimization: move a MultinomialFromUniform to the GPU.

    Two patterns are handled:

    1. The multinomial node itself, when at least one of its inputs
       already lives on the GPU (comes from a HostFromGpu).
    2. A GpuFromHost whose input is a multinomial, i.e. the caller
       explicitly moves the result to the GPU.

    In both cases the GPU op produces a transposed result, so a ``.T``
    is inserted to restore the layout callers expect.

    Returns None (no replacement) when the node does not match or when
    n_samples != 1 -- the GPU op only supports drawing one sample.
    """
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            # The GPU op only implements the n_samples == 1 case.
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op,
                                        theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [host_from_gpu(gpu_op(*[gpu_from_host(i)
                                           for i in [p, u]])).T]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        # BUG FIX: inspect the multinomial apply's inputs, not the
        # GpuFromHost node's (which always has exactly one input, so the
        # old ``len(node.inputs) == 2`` test could never match and the
        # 3-way unpack of a 1-element list raised ValueError).
        if len(multi.inputs) == 2:
            p, u = multi.inputs
            n_samples = 1
        else:
            p, u, n_samples = multi.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
class MultinomialWOReplacementFromUniform(ChoiceFromUniform): class MultinomialWOReplacementFromUniform(ChoiceFromUniform):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
warnings.warn("MultinomialWOReplacementFromUniform is deprecated, " warnings.warn("MultinomialWOReplacementFromUniform is deprecated, "
......
差异被折叠。
...@@ -10,28 +10,11 @@ import theano ...@@ -10,28 +10,11 @@ import theano
from theano import config, function, tensor from theano import config, function, tensor
from theano.sandbox import multinomial from theano.sandbox import multinomial
from theano.compile.mode import get_default_mode from theano.compile.mode import get_default_mode
import theano.sandbox.cuda as cuda
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.compat import PY3 from theano.compat import PY3
from theano.misc.pkl_utils import CompatUnpickler from theano.misc.pkl_utils import CompatUnpickler
def get_mode(gpu):
    """Return the compilation mode for these tests.

    FAST_COMPILE is upgraded to FAST_RUN so the C/GPU code paths are
    exercised; when ``gpu`` is true, the GPU-lifting optimizations are
    enabled on top of the base mode.
    """
    mode = get_default_mode()
    if theano.config.mode == 'FAST_COMPILE':
        mode = theano.compile.get_mode('FAST_RUN')
    if not gpu:
        return mode
    gpu_opts = ('gpu', 'gpu_local_optimizations',
                'local_cut_gpu_host_gpu', 'local_gpu_multinomial')
    return mode.including(*gpu_opts)
def run_with_c(f, gpu=False):
    """Invoke test body *f* with the mode selected by ``get_mode(gpu)``."""
    f(get_mode(gpu), gpu)
def test_n_samples_1(): def test_n_samples_1():
p = tensor.fmatrix() p = tensor.fmatrix()
u = tensor.fvector() u = tensor.fvector()
...@@ -117,69 +100,52 @@ def test_multinomial_0(): ...@@ -117,69 +100,52 @@ def test_multinomial_0():
m = multinomial.MultinomialFromUniform('auto')(p, u) m = multinomial.MultinomialFromUniform('auto')(p, u)
def body(mode, gpu): # the m*2 allows the multinomial to reuse output
# the m*2 allows the multinomial to reuse output f = function([p, u], m * 2, allow_input_downcast=True)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
if gpu: # test that both first and second samples can be drawn
assert any([type(node.op) is multinomial.GpuMultinomialFromUniform utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
for node in f.maker.fgraph.toposort()]) [[2, 0], [0, 2]])
# test that both first and second samples can be drawn # test that both second labels can be drawn
utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]), r = f([[.2, .8], [.3, .7]], [.31, .31])
[[2, 0], [0, 2]]) utt.assert_allclose(r, [[0, 2], [0, 2]])
# test that both second labels can be drawn # test that both first labels can be drawn
r = f([[.2, .8], [.3, .7]], [.31, .31]) r = f([[.2, .8], [.3, .7]], [.21, .21])
utt.assert_allclose(r, [[0, 2], [0, 2]]) utt.assert_allclose(r, [[0, 2], [2, 0]])
# test that both first labels can be drawn # change the size to make sure output gets reallocated ok
r = f([[.2, .8], [.3, .7]], [.21, .21]) # and also make sure that the GPU version doesn't screw up the
utt.assert_allclose(r, [[0, 2], [2, 0]]) # transposed-ness
r = f([[.2, .8]], [.25])
# change the size to make sure output gets reallocated ok utt.assert_allclose(r, [[0, 2]])
# and also make sure that the GPU version doesn't screw up the
# transposed-ness
r = f([[.2, .8]], [.25])
utt.assert_allclose(r, [[0, 2]])
run_with_c(body)
if cuda.cuda_available:
run_with_c(body, True)
# TODO: check a bigger example (make sure blocking on GPU is handled correctly) # TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large(): def test_multinomial_large():
# DEBUG_MODE will test this on GPU p = tensor.fmatrix()
def body(mode, gpu): u = tensor.fvector()
p = tensor.fmatrix() m = multinomial.MultinomialFromUniform('auto')(p, u)
u = tensor.fvector() f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
m = multinomial.MultinomialFromUniform('auto')(p, u)
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode) pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
if gpu: pval = pval / pval.sum(axis=1)[:, None]
assert any([type(node.op) is multinomial.GpuMultinomialFromUniform uval = np.ones_like(pval[:, 0]) * 0.5
for node in f.maker.fgraph.toposort()]) mval = f(pval, uval)
pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1 assert mval.shape == pval.shape
pval = pval / pval.sum(axis=1)[:, None] if config.cast_policy == 'custom':
uval = np.ones_like(pval[:, 0]) * 0.5 assert mval.dtype == pval.dtype
mval = f(pval, uval) elif config.cast_policy == 'numpy+floatX':
assert mval.dtype == config.floatX
assert mval.shape == pval.shape elif config.cast_policy == 'numpy':
if config.cast_policy == 'custom': assert mval.dtype == 'float64'
assert mval.dtype == pval.dtype else:
elif config.cast_policy == 'numpy+floatX': raise NotImplementedError(config.cast_policy)
assert mval.dtype == config.floatX utt.assert_allclose(mval.sum(axis=1), 2)
elif config.cast_policy == 'numpy': asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
assert mval.dtype == 'float64' utt.assert_allclose(mval, asdf) # broadcast over all rows
else:
raise NotImplementedError(config.cast_policy)
utt.assert_allclose(mval.sum(axis=1), 2)
asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
utt.assert_allclose(mval, asdf) # broadcast over all rows
run_with_c(body)
if cuda.cuda_available:
run_with_c(body, True)
def test_multinomial_dtypes(): def test_multinomial_dtypes():
...@@ -197,40 +163,3 @@ def test_multinomial_dtypes(): ...@@ -197,40 +163,3 @@ def test_multinomial_dtypes():
u = tensor.fvector() u = tensor.fvector()
m = multinomial.MultinomialFromUniform('float64')(p, u) m = multinomial.MultinomialFromUniform('float64')(p, u)
assert m.dtype == 'float64', m.dtype assert m.dtype == 'float64', m.dtype
def test_gpu_opt():
    """Check that the multinomial is lifted to the GPU when its output
    is explicitly moved there with gpu_from_host."""
    if not cuda.cuda_available:
        # cuda_ndarray is an optional dependency; skip gracefully.
        from nose.plugins.skip import SkipTest
        raise SkipTest('Optional package cuda not available')

    uniform = tensor.fvector()

    def compile_and_run(probs_var, n_rows):
        # Build the graph with the output pushed to the GPU so the
        # optimizer must replace the op with its GPU version.
        draws = multinomial.MultinomialFromUniform('auto')(probs_var,
                                                           uniform)
        assert draws.dtype == 'float32', draws.dtype
        fn = function([probs_var, uniform], cuda.gpu_from_host(draws),
                      allow_input_downcast=True, mode=get_mode(True))
        assert any(type(apply_node.op) is
                   multinomial.GpuMultinomialFromUniform
                   for apply_node in fn.maker.fgraph.toposort())
        probs = np.arange(n_rows * 4,
                          dtype='float32').reshape((n_rows, 4)) + 0.1
        probs = probs / probs.sum(axis=1)[:, None]
        fn(probs, np.ones_like(probs[:, 0]) * 0.5)

    compile_and_run(tensor.fmatrix(), 10000)
    # A single-row (frow) input used to fail in the past; keep covering it.
    compile_and_run(tensor.frow(), 1)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论