Added the GpuDot22Scalar op, with an optimization to insert it, and tests.

e7f39cef · Frederic Bastien · fb83675c · e7f39cef · e7f39cef · e7f39cef
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -71,6 +71,80 @@ class GpuDot22(Op):
        """ % locals()
 gpu_dot22 = GpuDot22()

+class GpuDot22Scalar(Op):
+    def __str__(self):
+        return 'GpuDot22Scalar'
+    def __eq__(self, other):
+        return type(self) == type(other)
+
+    def __hash__(self):
+        return hash(type(self))
+
+    def make_node(self, x, y, a):
+        if x.type.ndim != 2:
+            raise TypeError(x)
+        if y.type.ndim != 2:
+            raise TypeError(y)
+        if not tensor.blas._as_scalar(a):
+            raise TypeError(a)
+        return Apply(self, [x,y,a], [x.type()])
+
+    def c_code_cache_version(self):
+        return (1,0)
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        x, y, a = inputs
+        z, = outputs
+        fail = sub['fail']
+        return """
+        #define REAL float
+        float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT)
+        ? (REAL)(((float*)%(a)s->data)[0])
+        : (REAL)(((double*)%(a)s->data)[0]);
+        #undef REAL
+        if (%(x)s->nd != 2)
+        {
+            PyErr_Format(PyExc_TypeError, "rank(x)==%%i must be 2", %(x)s->nd);
+            %(fail)s;
+        }
+        if (%(y)s->nd != 2)
+        {
+            PyErr_Format(PyExc_TypeError, "rank(y)==%%i must be 2", %(y)s->nd);
+            %(fail)s;
+        }
+
+        if ((NULL == %(z)s)
+            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
+            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(y)s)[1]))
+        {
+            //if (%(z)s) Py_DECREF(%(z)s);
+            Py_XDECREF(%(z)s);
+            npy_intp dims[2];
+            dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
+            dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[1];
+            %(z)s = (CudaNdarray*)CudaNdarray_new_null();
+            if ((NULL == %(z)s) || CudaNdarray_alloc_contiguous(%(z)s, 2, dims))
+            {
+                if (%(z)s)
+                {
+                    Py_DECREF(%(z)s);
+                    %(z)s = NULL;
+                }
+                %(fail)s;
+            }
+        }
+        if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, 0.0f, %(z)s))
+        {
+            if (%(z)s)
+            {
+                Py_DECREF(%(z)s);
+                %(z)s = NULL;
+            }
+            %(fail)s;
+        }
+        """ % locals()
+gpu_dot22scalar = GpuDot22Scalar()
+
 class GpuGemm(Op):
    destroy_map = {0:[0]}
    def __str__(self):

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -6,7 +6,7 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, to

 from theano.sandbox.cuda.basic_ops import *
 from theano.sandbox.cuda.type import CudaNdarrayType
-from theano.sandbox.cuda.blas import gpu_dot22, gpu_gemm, GpuConv
+from theano.sandbox.cuda.blas import gpu_dot22, gpu_dot22scalar, gpu_gemm, GpuConv
 from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
 from theano.sandbox.cuda.nnet import (
        GpuCrossentropySoftmaxArgmax1HotWithBias,
@@ -143,11 +143,11 @@ def local_gpu_dimshuffle_0(node):

 @register_opt()
 @local_optimizer([])
-def local_gpu_dot(node):
+def local_gpu_dot22(node):
    """
-    gpu_from_host(dot) -> gpudot(gpu_from_host)
+    gpu_from_host(dot22) -> gpudot22(gpu_from_host)

-    dot(host_from_gpu) -> host_from_gpu(gpudot)
+    dot22(host_from_gpu) -> host_from_gpu(gpudot22)
    """
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
@@ -160,6 +160,25 @@ def local_gpu_dot(node):
            return [host_from_gpu(gpu_dot22(gpu_from_host(x), gpu_from_host(y)))]
    return False

+@register_opt()
+@local_optimizer([])
+def local_gpu_dot22scalar(node):
+    """
+    gpu_from_host(dot22scalar) -> gpudot22scalar(gpu_from_host)
+
+    dot22scalar(host_from_gpu) -> host_from_gpu(gpudot22scalar)
+    """
+    if node.op == gpu_from_host:
+        host_input = node.inputs[0]
+        if host_input.owner and host_input.owner.op == tensor.blas._dot22scalar:
+            x, y, scalar = host_input.owner.inputs
+            return [gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y), tensor.blas._as_scalar(scalar))]
+    if node.op == tensor.blas._dot22scalar:
+        if numpy.any([(i.owner and i.owner.op == host_from_gpu) for i in node.inputs]):
+            x, y, scalar = node.inputs
+            return [host_from_gpu(gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y),tensor.blas._as_scalar(scalar)))]
+    return False
+
 @register_opt()
 @local_optimizer([])
 def local_gpu_gemm(node):

--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
@@ -44,6 +44,33 @@ def test_dot():

    assert numpy.allclose(numpy.dot(a0, bval), a.value)

+def test_dot22scalar():
+    a = tensor.fmatrix()
+    b = tensor.fmatrix()
+    scalar = tensor.fscalar()
+    av = my_rand(4,4)
+    bv = my_rand(4,4)
+
+    f = theano.function([a,b], tensor.dot(a,b)*numpy.asarray(4, 'float32'), mode=mode_with_gpu)
+    f2 = theano.function([a,b], tensor.dot(a,b)*numpy.asarray(4, 'float32'))
+    t=f.maker.env.toposort()
+    assert len(t)==4
+    assert isinstance(t[0].op,tcn.GpuFromHost)
+    assert isinstance(t[1].op,tcn.GpuFromHost)
+    assert isinstance(t[2].op,tcn.blas.GpuDot22Scalar)
+    assert isinstance(t[3].op,tcn.HostFromGpu)
+    assert numpy.allclose(f(av,bv),f2(av,bv))
+
+    f = theano.function([a,b,scalar], tensor.dot(a,b)*scalar, mode=mode_with_gpu)
+    f2 = theano.function([a,b,scalar], tensor.dot(a,b)*scalar)
+    t=f.maker.env.toposort()
+    assert len(t)==4
+    assert isinstance(t[0].op,tcn.GpuFromHost)
+    assert isinstance(t[1].op,tcn.GpuFromHost)
+    assert isinstance(t[2].op,tcn.blas.GpuDot22Scalar)
+    assert isinstance(t[3].op,tcn.HostFromGpu)
+    assert numpy.allclose(f(av,bv,0.5),f2(av,bv,0.5))
+
 def test_gemm():

    a = tcn.shared_constructor(my_rand(4,4), 'a')