提交 2a0dc002 authored 作者: Frederic Bastien's avatar Frederic Bastien

Move the multinomial op to the GPU when only its output is transferred to the GPU.

上级 5d551988
...@@ -297,6 +297,16 @@ def use_gpu_multinomial(node): ...@@ -297,6 +297,16 @@ def use_gpu_multinomial(node):
for i in node.inputs])): for i in node.inputs])):
gpu_op = GpuMultinomialFromUniform(node.op.odtype) gpu_op = GpuMultinomialFromUniform(node.op.odtype)
return [host_from_gpu(gpu_op(*[gpu_from_host(i) for i in node.inputs])).T] return [host_from_gpu(gpu_op(*[gpu_from_host(i) for i in node.inputs])).T]
if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
node.inputs[0].owner and type(node.inputs[0].owner.op) is MultinomialFromUniform):
multi = node.inputs[0].owner
p, u = multi.inputs
m, = multi.outputs
if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
ret = gpu_op(*[gpu_from_host(i) for i in multi.inputs]).T
# The dimshuffle is on the cpu, but will be moved to the gpu by an opt.
return [gpu_from_host(ret)]
if cuda_available: if cuda_available:
register_opt()(use_gpu_multinomial) register_opt()(use_gpu_multinomial)
......
import copy
import numpy import numpy
import theano import theano
from theano import tensor, shared, function from theano import tensor, function
import multinomial import multinomial
from theano.compile.mode import get_default_mode, predefined_linkers from theano.compile.mode import get_default_mode, predefined_linkers
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
def run_with_c(f, gpu=False): def get_mode(gpu):
mode = get_default_mode() mode = get_default_mode()
linker_orig = mode.linker mode = copy.copy(mode)
if linker_orig == predefined_linkers['py']:
mode.linker = predefined_linkers['c|py']
if gpu: if gpu:
mode = mode.including('gpu') mode = mode.including('gpu', 'gpu_local_optimizations', 'local_cut_gpu_host_gpu', 'use_gpu_multinomial')
try: if isinstance(mode.linker, theano.gof.PerformLinker):
mode.linker = predefined_linkers['c|py']
return mode
def run_with_c(f, gpu=False):
mode = get_mode(gpu)
f(mode, gpu) f(mode, gpu)
finally:
mode.linker = linker_orig
def test_multinomial_0(): def test_multinomial_0():
...@@ -99,3 +102,23 @@ def test_multinomial_dtypes(): ...@@ -99,3 +102,23 @@ def test_multinomial_dtypes():
u = tensor.fvector() u = tensor.fvector()
m = multinomial.MultinomialFromUniform('float64')(p,u) m = multinomial.MultinomialFromUniform('float64')(p,u)
assert m.dtype == 'float64', m.dtype assert m.dtype == 'float64', m.dtype
def test_gpu_opt():
    """Check that MultinomialFromUniform is lifted to the GPU when only
    its *output* is moved to the GPU (inputs stay as host tensors).

    This exercises the second branch of ``use_gpu_multinomial``: the graph
    is ``gpu_from_host(MultinomialFromUniform(p, u))`` and the optimizer
    must replace it with ``GpuMultinomialFromUniform``.
    """
    if not cuda.cuda_available:
        # Skip test if cuda_ndarray is not available.
        from nose.plugins.skip import SkipTest
        raise SkipTest('Optional package cuda not available')

    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.MultinomialFromUniform('auto')(p, u)
    # 'auto' must resolve to float32 given float32 inputs.
    assert m.dtype == 'float32', m.dtype

    m_gpu = cuda.gpu_from_host(m)
    f = function([p, u], m_gpu, allow_input_downcast=True,
                 mode=get_mode(True))
    # The optimization must have introduced the GPU version of the op.
    assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
                for node in f.maker.env.toposort()])

    pval = numpy.arange(10000 * 4,
                        dtype='float32').reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = numpy.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)

    # The original test discarded mval entirely; validate the draw.
    # One sample per row is drawn, so each row of the result is one-hot:
    # same shape as the probability matrix, rows summing to 1.
    mval = numpy.asarray(mval)
    assert mval.shape == pval.shape, mval.shape
    assert numpy.allclose(mval.sum(axis=1), 1.0)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论