提交 b457bb54 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1800 from abergeron/gpuarray_ger

Add GpuGer to gpuarray.
from theano import Op, Apply, config
from theano.tensor.blas import Dot22, Gemv, Gemm
from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
try:
......@@ -28,7 +28,7 @@ class GpuGemv(BlasOp, Gemv):
A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
assert A.dtype == x.dtype == y.dtype == alpha.dtype == beta.dtype
assert A.dtype == x.dtype == y.dtype
return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage):
......@@ -45,8 +45,15 @@ class GpuGemv(BlasOp, Gemv):
if self.inplace:
code = """
Py_XDECREF(%(out)s);
%(out)s = %(y)s;
Py_INCREF(%(out)s);
if (%(y)s->ga.strides[0] <= 0) {
%(out)s = pygpu_copy(%(y)s, GA_ANY_ORDER);
if (%(out)s == NULL) {
%(fail)s
}
} else {
%(out)s = %(y)s;
Py_INCREF(%(out)s);
}
""" % vars
else:
code = """
......@@ -72,7 +79,7 @@ class GpuGemv(BlasOp, Gemv):
return code
def c_code_cache_version(self):
return (1,)
return (2,)
gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
......@@ -84,7 +91,7 @@ class GpuGemm(BlasOp, Gemm):
A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C)
assert A.dtype == B.dtype == C.dtype == alpha.dtype == beta.dtype
assert A.dtype == B.dtype == C.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs):
......@@ -101,8 +108,15 @@ class GpuGemm(BlasOp, Gemm):
if self.inplace:
code = """
Py_XDECREF(%(out)s);
%(out)s = %(C)s;
Py_INCREF(%(out)s);
if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
%(out)s = pygpu_copy(%(C)s, GA_ANY_ORDER);
if (%(out)s == NULL) {
%(fail)s
}
} else {
%(out)s = %(C)s;
Py_INCREF(%(out)s);
}
""" % vars
else:
code = """
......@@ -128,13 +142,74 @@ class GpuGemm(BlasOp, Gemm):
return code
def c_code_cache_version(self):
return (1,)
return (2,)
gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True)
class GpuGer(BlasOp, Ger):
    """Rank-1 update (A + alpha * outer(x, y)) running on gpuarray data.

    Mirrors ``theano.tensor.blas.Ger``; ``destructive=True`` allows the
    C implementation to overwrite ``A`` in place when its buffer is one
    contiguous segment.
    """

    def make_node(self, A, alpha, x, y):
        # Run the CPU Ger.make_node purely for its validation side
        # effects (dtype/rank checks); its result node is discarded.
        res = Ger.make_node(self, A, alpha, x, y)
        A = as_gpuarray_variable(A)
        x = as_gpuarray_variable(x)
        y = as_gpuarray_variable(y)
        assert A.dtype == x.dtype == y.dtype
        return Apply(self, [A, alpha, x, y], [A.type()])

    def perform(self, node, inp, out):
        """Python fallback: delegate to BLAS ger.

        NOTE(review): ``blas`` must be imported at module level
        (presumably ``scipy.linalg.blas``) — not visible in this chunk,
        confirm against the full file.
        """
        A, alpha, x, y = inp
        inplace = self.destructive
        # Only overwrite A when it is contiguous (C- or Fortran-ordered).
        if inplace and not A.flags.forc:
            inplace = False
        # BUG FIX: the original assigned to ``outputs[0][0]`` but the
        # output-storage parameter is named ``out``, which raised a
        # NameError whenever the Python implementation ran.
        out[0][0] = blas.ger(alpha, x, y, A,
                             overwrite_a=inplace)

    def c_code(self, node, name, inp, out, sub):
        """Generate C code calling pygpu's BLAS ger kernel."""
        vars = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3],
                    fail=sub['fail'], name=name)
        if self.destructive:
            # Destructive path: reuse A's buffer when it is a single
            # contiguous segment, otherwise fall back to a copy.
            code = """
                   Py_XDECREF(%(out)s);
                   if (!GpuArray_ISONESEGMENT(&%(A)s->ga)) {
                     %(out)s = pygpu_copy(%(A)s, GA_ANY_ORDER);
                     if (%(out)s == NULL) {
                       %(fail)s
                     }
                   } else {
                     %(out)s = %(A)s;
                     Py_INCREF(%(out)s);
                   }
                   """ % vars
        else:
            # Non-destructive path: always work on a fresh copy of A.
            code = """
                   Py_XDECREF(%(out)s);
                   %(out)s = pygpu_copy(%(A)s, GA_ANY_ORDER);
                   if (%(out)s == NULL) {
                     %(fail)s
                   }
                   """ % vars
        # alpha lives on the host as a 0-d ndarray; read its scalar value.
        code += """
                if (pygpu_blas_rger(((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                                    %(x)s, %(y)s, %(out)s, 0) == -1) {
                  %(fail)s
                }
                """ % vars
        if config.gpuarray.sync:
            code += """
                    GpuArray_sync(&%(out)s->ga);
                    """ % vars
        return code

    def c_code_cache_version(self):
        # perform() fix is Python-only; the generated C is unchanged.
        return (1,)
# Module-level singletons; the inplace-substitution optimizer swaps the
# non-destructive op for the destructive one once overwriting A is safe.
gpuger_no_inplace = GpuGer(destructive=False)
gpuger_inplace = GpuGer(destructive=True)
class GpuDot22(BlasOp, Dot22):
def make_node(self, x, y):
res = Dot22.make_node(self, x, y)
......@@ -211,8 +286,13 @@ def local_inplace_gpuagemm(node):
if node.op == gpugemm_no_inplace:
return [gpugemm_inplace(*node.inputs)]
@local_optimizer([gpuger_no_inplace], inplace=True)
def local_inplace_gpuager(node):
    """Swap a non-destructive GpuGer for its inplace counterpart."""
    if node.op != gpuger_no_inplace:
        return
    return [gpuger_inplace(*node.inputs)]
gpuablas_opt_inplace = in2out(LocalOptGroup(
local_inplace_gpuagemv, local_inplace_gpuagemm),
local_inplace_gpuagemv, local_inplace_gpuagemm, local_inplace_gpuager),
name='gpuablas_opt_inplace')
optdb.register('InplaceGpuaBlasOpt',
gpuablas_opt_inplace,
......
......@@ -17,7 +17,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
GpuAlloc,
GpuReshape,
GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from theano.sandbox.gpuarray.conv import GpuConv
from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
......@@ -302,23 +302,23 @@ def local_gpua_careduce(node):
@register_opt()
@op_lifter([tensor.blas.Gemv])
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
def local_gpua_gemv(node):
return GpuGemv(inplace=node.op.inplace)
@register_opt()
@op_lifter([tensor.blas_c.CGemv])
def local_gpua_gemv2(node):
    # Lift the C-accelerated CGemv to the gpuarray GpuGemv, preserving
    # its inplace flag.
    return GpuGemv(inplace=node.op.inplace)
@register_opt()
@op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node):
    # Lift the CPU Gemm to the gpuarray GpuGemm, preserving its
    # inplace flag.
    return GpuGemm(inplace=node.op.inplace)
@register_opt()
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node):
    # Lift any of the CPU Ger variants (reference, C, scipy) to the
    # gpuarray GpuGer, preserving the destructive (inplace) flag.
    return GpuGer(destructive=node.op.destructive)
@register_opt()
@op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node):
......
from unittest import TestCase
from nose.plugins.skip import SkipTest
import theano
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano import tensor
from theano.tests import unittest_tools
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
_dot22)
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand
from theano.sandbox.gpuarray import gpuarray_shared_constructor
from theano.sandbox.gpuarray.tests.test_basic_ops import (makeTester, rand,
mode_with_gpu)
from theano.sandbox.gpuarray.blas import (gpugemv_inplace,
gpugemm_inplace, gpu_dot22)
from theano.sandbox.gpuarray.blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22)
GpuGemvTester = makeTester('GpuGemvTester',
......@@ -21,6 +30,21 @@ GpuGemvTester = makeTester('GpuGemvTester',
)
)
class TestGpuSgemv(TestCase, BaseGemv, unittest_tools.TestOptimizationMixin):
    """Run the generic Gemv test battery against the gpuarray ops."""
    mode = mode_with_gpu
    dtype = 'float32'
    # Ops the BaseGemv battery exercises.
    gemv = gpugemv_no_inplace
    gemv_inplace = gpugemv_inplace

    @staticmethod
    def shared(val):
        # Prefer a GPU-backed shared variable; fall back to the default
        # constructor for values the gpuarray backend cannot hold.
        try:
            return gpuarray_shared_constructor(val)
        except TypeError:
            return theano.shared(val)
GpuGemmTester = makeTester('GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace,
cases=dict(
......@@ -37,9 +61,40 @@ GpuGemmTester = makeTester('GpuGemmTester',
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
)
class TestGpuSger(TestGer):
    """Drive the generic Ger test battery against the gpuarray op."""

    def setUp(self):
        self.mode = mode_with_gpu
        # The lifting optimization is not dtype-dependent, so a single
        # float32 configuration is enough.
        self.dtype = 'float32'
        dt = self.dtype
        self.A = tensor.tensor(dtype=dt, broadcastable=(False, False))
        self.a = tensor.tensor(dtype=dt, broadcastable=())
        self.x = tensor.tensor(dtype=dt, broadcastable=(False,))
        self.y = tensor.tensor(dtype=dt, broadcastable=(False,))
        # Once the data lives on the GPU the op is always applied in
        # place, so both attributes point at the inplace variant.
        self.ger = gpuger_inplace
        self.ger_destructive = gpuger_inplace
        self.gemm = gpugemm_inplace

    def test_f32_0_0(self):
        raise SkipTest('0-sized objects not supported')

    def test_f32_1_0(self):
        raise SkipTest('0-sized objects not supported')

    def test_f32_0_1(self):
        raise SkipTest('0-sized objects not supported')
class TestGpuSgerNoTransfer(TestGpuSger):
    # Same Ger battery, but shared values are constructed directly on
    # the GPU so no host<->device transfer is optimized away.
    shared = staticmethod(gpuarray_shared_constructor)
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
    """Check GpuGer satisfies the generic Op contract (eq/hash/clone)."""

    def setUp(self):
        self.ops = [gpuger_no_inplace, gpuger_inplace]

    def clone(self, op):
        # Rebuild an equivalent op from its only distinguishing flag.
        return GpuGer(destructive=op.destructive)
GpuDot22Tester = makeTester(
'GpuGemmTester',
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论