Run the CPU Join tests on the GPU, and transform a CPU Join into a GPU Join when its inputs are constants.

46f56a7d · Frederic · 724df23d · 46f56a7d · 46f56a7d · 46f56a7d
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -881,8 +881,8 @@ def local_gpu_join(node):
        #print "OPT: axis_and_tensors=", axis_and_tensors
-        matches = [not t.owner is None and t.owner.op == host_from_gpu for t in axis_and_tensors[1:]]
+        matches = [(not t.owner is None and t.owner.op == host_from_gpu) or
+                   isinstance(t, gof.Constant) for t in axis_and_tensors[1:]]
        #print "OPT: matches =", matches
        # if all input tensors are host_from_gpu'ified

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -792,6 +792,20 @@ def test_gpualloc_output_to_gpu():
    assert numpy.allclose(f(5),f_gpu(5))
 import theano.tensor.tests.test_basic
+class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
+    def setUp(self):
+        utt.seed_rng()
+        self.mode = mode_with_gpu.excluding('constant_folding')
+        self.join_op = cuda.GpuJoin
+        self.split_op = tensor.Split
+        # There is no MakeVector op on the GPU; GpuJoin is used instead.
+        self.make_vector_op = cuda.GpuJoin
+        self.floatX = "float32"
+        # In FAST_COMPILE mode, we force the FAST_RUN mode for optimization.
+        self.hide_error = not theano.config.mode in ['DebugMode', 'DEBUG_MODE']
+        self.shared = cuda.shared_constructor
 # This is to don't duplicate test.
 class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
    shared=staticmethod(cuda.shared_constructor)

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -2539,11 +2539,11 @@ class T_Join_and_Split(unittest.TestCase):
        self.mode = theano.compile.get_default_mode().excluding('constant_folding')
        self.join_op = Join
        self.split_op = Split
+        self.make_vector_op = opt.MakeVector
        self.floatX = config.floatX
        self.hide_error = not theano.config.mode in ['DebugMode', 'DEBUG_MODE', 'FAST_COMPILE']
        self.shared = shared
    def eval_outputs_and_check_join(self, outputs):
        f = theano.function([], outputs, self.mode)
        topo = f.maker.env.toposort()
@@ -2553,10 +2553,13 @@ class T_Join_and_Split(unittest.TestCase):
            return variables[0]
        return variables
-    def eval_outputs_and_check_vector(self, outputs):
+    def eval_outputs_and_check_vector(self, outputs,
+                                      make_vector_op = None):
+        if make_vector_op is None:
+            make_vector_op = self.make_vector_op
        f = theano.function([], outputs, self.mode)
        topo = f.maker.env.toposort()
-        assert [True for node in topo if isinstance(node.op, opt.MakeVector)]
+        assert [True for node in topo if isinstance(node.op, make_vector_op)]
        variables = f()
        if isinstance(variables,(tuple,list)) and len(variables) == 1:
            return variables[0]
@@ -2572,14 +2575,15 @@ class T_Join_and_Split(unittest.TestCase):
        self.fail()
    def test_stack_mixed_type_constants(self):
+        # Tested only on the CPU, as the GPU supports only float32.
        a = as_tensor_variable(1)
        b = as_tensor_variable(2.0)
-        c = shared(numpy.asarray(3.0))
+        c = shared(numpy.asarray(3.0).astype(self.floatX))
        s = stack(a, b, c)
        want = numpy.array([1, 2, 3])
-        out = self.eval_outputs_and_check_vector([s])
+        out = self.eval_outputs_and_check_vector([s], opt.MakeVector)
        self.assertTrue((out == want).all())
    def test_stack_scalar(self):