提交 26941db0 authored 作者: Frederic's avatar Frederic

Add GpuDot22

上级 b7b88b1c
from theano import Op, Apply, config from theano import Op, Apply, config
from theano.tensor.blas import Gemv, Gemm from theano.tensor.blas import Dot22, Gemv, Gemm
from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable) from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
try: try:
...@@ -128,12 +128,73 @@ class GpuGemm(BlasOp, Gemm): ...@@ -128,12 +128,73 @@ class GpuGemm(BlasOp, Gemm):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return
return (0,) return (0,)
gpugemm_no_inplace = GpuGemm(inplace=False) gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True) gpugemm_inplace = GpuGemm(inplace=True)
class GpuDot22(BlasOp, Dot22):
    """GPU matrix-matrix product of two 2d tensors (the gpuarray
    counterpart of ``theano.tensor.blas.Dot22``).

    Inputs are converted to GpuArray variables; the output is a new GPU
    matrix of shape ``(x.shape[0], y.shape[1])`` with the inputs' dtype.
    """

    def make_node(self, x, y):
        # Call the parent's make_node purely for its input validation
        # (it raises on non-matrix / mismatched inputs); its Apply node
        # is discarded in favour of one built on GPU variables.
        Dot22.make_node(self, x, y)
        x = as_gpuarray_variable(x)
        y = as_gpuarray_variable(y)
        assert x.dtype == y.dtype
        return Apply(self, [x, y], [x.type()])

    def perform(self, node, inputs, outputs):
        # Python fallback: out = 1.0 * (x . y) + 0.0 * out, computed by
        # the pygpu BLAS gemm wrapper into a freshly allocated GPU array.
        x, y = inputs
        out = pygpu.empty((x.shape[0], y.shape[1]), dtype=x.dtype)
        outputs[0][0] = blas.gemm(1., x, y, 0., out,
                                  overwrite_c=True)

    def c_code(self, node, name, inputs, outputs, sub):
        # Generate C code that allocates the output and calls the
        # generic pygpu rgemm (no transposition, alpha=1, beta=0).
        dtype = node.inputs[0].dtype
        typecode = pygpu.gpuarray.dtype_to_typecode(dtype)
        vars = dict(A=inputs[0], B=inputs[1], dtype=dtype, out=outputs[0],
                    typecode=typecode,
                    fail=sub['fail'], name=name)
        code = """
        double one = 1.;
        double zero = 0.;
        size_t dims[] = {PyGpuArray_DIMS(%(A)s)[0], PyGpuArray_DIMS(%(B)s)[1]};
        %(out)s = pygpu_empty(2, dims,
                              %(typecode)s,
                              GA_C_ORDER,
                              pygpu_default_context(), Py_None);
        if (!%(out)s) {
            %(fail)s
        }
        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             one,
                             %(A)s, %(B)s,
                             zero,
                             %(out)s) == NULL) {
            %(fail)s
        }
        """ % vars
        if config.gpuarray.sync:
            # Optional synchronization point for easier debugging/timing.
            code += """
            GpuArray_sync(&%(out)s->ga);
            """ % vars
        return code

    def c_code_cache_version(self):
        # Was: a bare `return` (None) followed by an unreachable
        # `return (0,)` — a leftover dev hack that silently disabled the
        # C-code cache. Return the real version tuple; bump it whenever
        # the generated C code changes.
        return (0,)

    def c_headers(self):
        ret = super(GpuDot22, self).c_headers()
        return ret + ['<compyte/numpy_compat.h>']
gpu_dot22 = GpuDot22()
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out from theano.tensor.opt import in2out
......
...@@ -18,7 +18,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, ...@@ -18,7 +18,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
...@@ -238,6 +238,12 @@ def local_gpua_gemm(node): ...@@ -238,6 +238,12 @@ def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
@register_opt()
@op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node):
    # Rewrite a host-side Dot22 node into its GPU counterpart.
    # op_lifter handles moving the inputs/outputs between host and GPU;
    # we only have to name the replacement op.
    return gpu_dot22
@register_opt() @register_opt()
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
def local_gpua_eye(node): def local_gpua_eye(node):
......
from unittest import TestCase from unittest import TestCase
from theano.tensor.blas import gemv_inplace, gemm_inplace import theano
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand
from theano.sandbox.gpuarray.blas import (gpugemv_inplace, from theano.sandbox.gpuarray.blas import (gpugemv_inplace,
gpugemm_inplace) gpugemm_inplace, gpu_dot22)
GpuGemvTester = makeTester('GpuGemvTester', GpuGemvTester = makeTester('GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace, op=gemv_inplace, gpu_op=gpugemv_inplace,
...@@ -29,7 +31,28 @@ GpuGemmTester = makeTester('GpuGemmTester', ...@@ -29,7 +31,28 @@ GpuGemmTester = makeTester('GpuGemmTester',
test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6], test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0], test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0], test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.0], test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.0], test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
) # test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
# Compare the host _dot22 op against gpu_dot22 on a few matrix shapes.
# NOTE: the name string was 'GpuGemmTester' (copy-paste bug), which
# collided with the real GpuGemmTester's generated class name.
GpuDot22Tester = makeTester(
    'GpuDot22Tester',
    op=_dot22, gpu_op=gpu_dot22,
    cases=dict(
        test1=[rand(3, 4), rand(4, 5)],
        test2=[rand(1, 4), rand(4, 5)],
        test3=[rand(3, 1), rand(1, 5)],
        test4=[rand(3, 4), rand(4, 1)],
        # Zero-sized cases disabled pending empty-array support:
        # test5=[rand(0, 4), rand(4, 5)],
        # test6=[rand(3, 0), rand(0, 5)],
        # test7=[rand(3, 4), rand(4, 0)],
        # test8=[rand(0, 4), rand(4, 0)],
        # test9=[rand(0, 0), rand(0, 0)],
    )
)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论