Small fix to the gemv-on-GPU optimization, and added tests for these optimizations.

ad6ff6e2 · Frederic Bastien · 009251df · ad6ff6e2 · ad6ff6e2
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -175,18 +175,21 @@ def local_gpu_dot_to_dot22(node):
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op == tensor.basic.dot:
            x, y = host_input.owner.inputs
-            # case one vector X matrix
+            # case one: vector X matrix
            if _is_real_vector(x) and _is_real_matrix(y):
                new_op = GpuDimShuffle((False,), ['x',0])
-                shape_out = y.shape[0],dimshuffle(['x'])
+                shape_out = y.shape[0].dimshuffle(['x'])
                gpu_x = new_op(gpu_from_host(x))
                gpu_y = gpu_from_host(y)
-            # case two matrix X vector
+            # case two: matrix X vector
            elif _is_real_matrix(x) and _is_real_vector(y):
                new_op = GpuDimShuffle((False,), [0,'x'])
                shape_out = x.shape[1].dimshuffle(['x'])
                gpu_x = gpu_from_host(x)
                gpu_y = new_op(gpu_from_host(y))
+            else:
+                return False
            return [GpuReshape(1)(gpu_dot22(gpu_x, gpu_y), shape_out)]
    if node.op == tensor.basic.dot:
        if numpy.any([(i.owner and i.owner.op == host_from_gpu) for i in node.inputs]):
@@ -202,6 +205,9 @@ def local_gpu_dot_to_dot22(node):
                shape_out = x.shape[1].dimshuffle(['x'])
                gpu_x = gpu_from_host(x)
                gpu_y = new_op(gpu_from_host(y))
+            else:
+                return False
            return [host_from_gpu(GpuReshape(1)(gpu_dot22(gpu_x, gpu_y),
                                                shape_out))]
    return False

--- a/theano/sandbox/cuda/tests/test_vector_matrix_dot.py
+++ b/theano/sandbox/cuda/tests/test_vector_matrix_dot.py
@@ -40,11 +40,18 @@ def test_dot_vm():
                                   dtype='float32'))
    no_gpu_f = theano.function([], theano.dot(v,m), mode = mode_without_gpu)
    gpu_f    = theano.function([], theano.dot(v,m), mode = mode_with_gpu)
+    #gpu_f2 is needed to test the case when the input is not on the gpu
+    #but the output is moved to the gpu.
+    gpu_f2   = theano.function([], cuda.gpu_from_host(theano.dot(v,m)), mode = mode_with_gpu)
    # Assert they produce the same output
    assert numpy.allclose(no_gpu_f(), gpu_f(), atol = atol)
+    assert numpy.allclose(no_gpu_f(), gpu_f2(), atol = atol)
    # Assert that the gpu version actually uses gpu
    assert sum([isinstance(node.op, blasop.GpuDot22) for node in
                gpu_f.maker.env.toposort() ]) == 1
+    assert sum([isinstance(node.op, blasop.GpuDot22) for node in
+                gpu_f2.maker.env.toposort() ]) == 1
 def test_dot_mv():
    ''' Test matrix dot vector '''
@@ -53,42 +60,61 @@ def test_dot_mv():
                                   dtype='float32'))
    no_gpu_f = theano.function([], theano.dot(m,v), mode = mode_without_gpu)
    gpu_f    = theano.function([], theano.dot(m,v), mode = mode_with_gpu)
+    #gpu_f2 is needed to test the case when the input is not on the gpu
+    #but the output is moved to the gpu.
+    gpu_f2   = theano.function([], cuda.gpu_from_host(theano.dot(m,v)), mode = mode_with_gpu)
    # Assert they produce the same output
    assert numpy.allclose(no_gpu_f(), gpu_f(), atol = atol)
+    assert numpy.allclose(no_gpu_f(), gpu_f2(), atol = atol)
    # Assert that the gpu version actually uses gpu
    assert sum([isinstance(node.op, blasop.GpuDot22) for node in
                gpu_f.maker.env.toposort() ]) == 1
+    assert sum([isinstance(node.op, blasop.GpuDot22) for node in
+                gpu_f2.maker.env.toposort() ]) == 1
 def test_gemv1():
-    ''' Is this the same test as test_gemv2 ? '''
+    ''' test vector1+dot(matrix,vector2) '''
    v1 = theano.shared( numpy.array(numpy.random.rand(2)  , dtype='float32'))
    v2 = theano.shared( numpy.array(numpy.random.rand(2)  , dtype='float32'))
    m  = theano.shared( numpy.array(numpy.random.rand(2,2), dtype='float32'))
    no_gpu_f = theano.function([], v2+theano.dot(m,v1), mode = mode_without_gpu)
    gpu_f    = theano.function([], v2+theano.dot(m,v1), mode = mode_with_gpu)
+    #gpu_f2 is needed to test the case when the input is not on the gpu
+    #but the output is moved to the gpu.
+    gpu_f2    = theano.function([], cuda.gpu_from_host(v2+theano.dot(m,v1)), mode = mode_with_gpu)
    # Assert they produce the same output
    assert numpy.allclose(no_gpu_f(), gpu_f(), atol = atol)
+    assert numpy.allclose(no_gpu_f(), gpu_f2(), atol = atol)
    # Assert that the gpu version actually uses gpu
    assert sum([isinstance(node.op, blasop.GpuGemm) for node in
                gpu_f.maker.env.toposort() ]) == 1
+    assert sum([isinstance(node.op, blasop.GpuGemm) for node in
+                gpu_f2.maker.env.toposort() ]) == 1
 def test_gemv2():
-    ''' Is this the same test as test_gemv1 ? '''
+    ''' test vector1+dot(vector2,matrix) '''
    v1 = theano.shared( numpy.array(numpy.random.rand(2)  , dtype='float32'))
    v2 = theano.shared( numpy.array(numpy.random.rand(2)  , dtype='float32'))
    m  = theano.shared( numpy.array(numpy.random.rand(2,2), dtype='float32'))
    no_gpu_f = theano.function([], v2+theano.dot(v1,m), mode = mode_without_gpu)
    gpu_f    = theano.function([], v2+theano.dot(v1,m), mode = mode_with_gpu)
+    #gpu_f2 is needed to test the case when the input is not on the gpu
+    #but the output is moved to the gpu.
+    gpu_f2    = theano.function([], cuda.gpu_from_host(v2+theano.dot(v1,m)), mode = mode_with_gpu)
    # Assert they produce the same output
    assert numpy.allclose(no_gpu_f(), gpu_f(), atol = atol)
+    assert numpy.allclose(no_gpu_f(), gpu_f2(), atol = atol)
    # Assert that the gpu version actually uses gpu
    assert sum([isinstance(node.op, blasop.GpuGemm) for node in
                gpu_f.maker.env.toposort() ]) == 1
+    assert sum([isinstance(node.op, blasop.GpuGemm) for node in
+                gpu_f2.maker.env.toposort() ]) == 1
 if __name__=='__main__':
    test_dot_vm()