Add GpuGer to gpuarray.

f8b8c31a · Arnaud Bergeron · 9a44f9bf · f8b8c31a · f8b8c31a · f8b8c31a
--- a/theano/sandbox/gpuarray/blas.py
+++ b/theano/sandbox/gpuarray/blas.py
 from theano import Op, Apply, config

-from theano.tensor.blas import Dot22, Gemv, Gemm
+from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
 from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)

 try:
@@ -135,6 +135,60 @@ gpugemm_no_inplace = GpuGemm(inplace=False)
 gpugemm_inplace = GpuGemm(inplace=True)


+class GpuGer(BlasOp, Ger):
+    """Ger on GPU arrays, run through blas.ger / pygpu_blas_rger."""
+    def make_node(self, A, alpha, x, y):
+        # Node is discarded; presumably run for Ger's checks (TODO confirm).
+        Ger.make_node(self, A, alpha, x, y)
+        A = as_gpuarray_variable(A)
+        x = as_gpuarray_variable(x)
+        y = as_gpuarray_variable(y)
+        assert A.dtype == x.dtype == y.dtype == alpha.dtype
+        return Apply(self, [A, alpha, x, y], [A.type()])
+
+    def perform(self, node, inp, out):
+        """Store blas.ger(alpha, x, y, A) in out[0][0]."""
+        A, alpha, x, y = inp
+        inplace = self.destructive and A.flags.forc  # overwrite A only then
+        out[0][0] = blas.ger(alpha, x, y, A, overwrite_a=inplace)
+
+    def c_code(self, node, name, inp, out, sub):
+        subs = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3],
+                    fail=sub['fail'], name=name)
+        if self.destructive:  # output aliases A (new reference)
+            code = """
+                   Py_XDECREF(%(out)s);
+                   %(out)s = %(A)s;
+                   Py_INCREF(%(out)s);
+                   """ % subs
+        else:  # output is a fresh copy of A
+            code = """
+                   Py_XDECREF(%(out)s);
+                   %(out)s = pygpu_copy(%(A)s, GA_ANY_ORDER);
+                   if (%(out)s == NULL) {
+                       %(fail)s
+                   }
+                   """ % subs
+        code += """
+        if (pygpu_blas_rger(((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
+                            %(x)s, %(y)s, %(out)s, 0) == -1) {
+            %(fail)s
+        }
+        """ % subs
+        if config.gpuarray.sync:  # emit a sync call on the output array
+            code += """
+            GpuArray_sync(&%(out)s->ga);
+            """ % subs
+        return code
+
+    def c_code_cache_version(self):
+        return (0,)
+
+
+gpuger_no_inplace = GpuGer(destructive=False)
+gpuger_inplace = GpuGer(destructive=True)
+
+
 class GpuDot22(BlasOp, Dot22):
    def make_node(self, x, y):
        res = Dot22.make_node(self, x, y)
@@ -211,6 +265,11 @@ def local_inplace_gpuagemm(node):
    if node.op == gpugemm_no_inplace:
        return [gpugemm_inplace(*node.inputs)]

+@local_optimizer([gpuger_no_inplace], inplace=True)
+def local_inplace_gpuager(node):  # gpuger_no_inplace -> gpuger_inplace
+    if node.op == gpuger_no_inplace:
+        return [gpuger_inplace(*node.inputs)]
+
 gpuablas_opt_inplace = in2out(LocalOptGroup(
-        local_inplace_gpuagemv, local_inplace_gpuagemm),
+        local_inplace_gpuagemv, local_inplace_gpuagemm, local_inplace_gpuager),
                              name='gpuablas_opt_inplace')

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -17,7 +17,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
                                               GpuAlloc,
                                               GpuReshape,
                                               GpuEye)
-from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
+from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
 from theano.sandbox.gpuarray.conv import GpuConv
 from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                                          GpuCrossentropySoftmax1HotWithBiasDx,
@@ -302,23 +302,23 @@ def local_gpua_careduce(node):


 @register_opt()
-@op_lifter([tensor.blas.Gemv])
+@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
 def local_gpua_gemv(node):
    return GpuGemv(inplace=node.op.inplace)


-@register_opt()
-@op_lifter([tensor.blas_c.CGemv])
-def local_gpua_gemv2(node):
-    return GpuGemv(inplace=node.op.inplace)
-
-
 @register_opt()
 @op_lifter([tensor.blas.Gemm])
 def local_gpua_gemm(node):
    return GpuGemm(inplace=node.op.inplace)


+@register_opt()
+@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer])
+def local_gpua_ger(node):  # GpuGer with the same `destructive` flag as node.op
+    return GpuGer(destructive=node.op.destructive)
+
+
 @register_opt()
 @op_lifter([tensor.blas.Dot22])
 def local_gpua_dot22(node):

--- a/theano/sandbox/gpuarray/tests/test_blas.py
+++ b/theano/sandbox/gpuarray/tests/test_blas.py
 from unittest import TestCase

 import theano
-from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
+from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
+                                _dot22)

 from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand

 from theano.sandbox.gpuarray.blas import (gpugemv_inplace,
-                                          gpugemm_inplace, gpu_dot22)
+                                          gpugemm_inplace, gpuger_inplace,
+                                          gpu_dot22)


 GpuGemvTester = makeTester('GpuGemvTester',
@@ -37,7 +39,19 @@ GpuGemmTester = makeTester('GpuGemmTester',
 #       test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
 #       test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
 #       test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
-    )
+        )
+)
+
+GpuGerTester = makeTester(
+    'GpuGerTester',
+    op=ger_destructive, gpu_op=gpuger_inplace,
+    cases=dict(  # inputs follow GpuGer.make_node's order: A, alpha, x, y
+        test1=[rand(4, 5), 1.0, rand(4), rand(5)],
+        test2=[rand(4, 5), 0.6, rand(4), rand(5)],
+        test3=[rand(4, 5), -1.0, rand(4), rand(5)],
+        test4=[rand(4, 5), -0.6, rand(4), rand(5)],
+        test5=[rand(4, 5), 0.0, rand(4), rand(5)],  # alpha == 0
+        )
 )