提交 f85856b2 authored 作者: James Bergstra's avatar James Bergstra 提交者: Frederic

added GpuGemv and GpuGer

上级 1b01d9b9
...@@ -251,6 +251,188 @@ class GpuGemm(Op): ...@@ -251,6 +251,188 @@ class GpuGemm(Op):
gpu_gemm_no_inplace = GpuGemm(inplace=False) gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True) gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuGemv(Op):
    """
    implement gemv on the gpu.

    Computes z_out = alpha * dot(A, x) + beta * z_in on the GPU by
    emitting C code that calls CudaNdarray_sgemv.  With inplace=True the
    output reuses the storage of the first input (declared via
    ``destroy_map``); otherwise the input vector is copied first.
    """
    def __init__(self, inplace):
        # All state is set through __setstate__ so that construction and
        # unpickling share one code path.
        self.__setstate__({'inplace':inplace})
    def __str__(self):
        if self.inplace:
            return 'GpuGemv{inplace}'
        else:
            return 'GpuGemv{no_inplace}'
    def __eq__(self, other):
        # Two GpuGemv instances are interchangeable iff they agree on
        # inplace-ness.
        return (type(self) == type(other)\
                and self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.inplace)
    def __setstate__(self, dct):
        # NOTE(review): defaults to inplace=True for pickles that lack the
        # 'inplace' key.  destroy_map tells Theano that output 0 overwrites
        # input 0 (z).
        inplace = dct.get('inplace', True)
        if inplace:
            self.destroy_map = {0: [0]}
        self.inplace = inplace
    def __getstate__(self):
        return dict(inplace=self.inplace)
    def make_node(self, z, a, x, y, b):
        # the more complicated error checking performed by tensor.gemv is
        # assumed to already have been done
        return Apply(self, [z, a, x, y, b], [z.type()])
    def c_code_cache_version(self):
        # Empty tuple: disables caching of the generated C code.
        return ()
    def c_code(self, node, name, inputs, outputs, sub):
        # z_out = alpha * dot(x, y) + beta * z_in
        # inplace version: set z_out = z_in
        # not inplace version: we copy z_in to z_out.
        z_in, a, x, y, b = inputs
        z_out, = outputs
        fail = sub['fail']
        sio = StringIO.StringIO()
        # NOTE: the templates below are expanded with ``% locals()`` at the
        # end, so the %(...)s keys must match the local names above.
        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
        """
        if self.inplace:
            # Alias the output to the input vector z_in.
            print >> sio, """
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
            """
        else:
            # Allocate (or reuse, if the shape already matches) an output
            # buffer and copy z_in into it.
            print >> sio, """
            if (!%(z_out)s
                || (%(z_out)s->nd != 1)
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                )
            {
                Py_XDECREF(%(z_out)s);
                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
                if (!%(z_out)s)
                {
                    %(fail)s;
                }
            }
            else
            {
                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
                {
                    %(fail)s;
                }
            }
            """
        # The actual BLAS call; accumulates into z_out in both variants.
        print >> sio, """
        if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
        {
            %(fail)s;
        }
        """
        return sio.getvalue() % locals()
# Module-level singleton instances of GpuGemv (non-destructive and in-place).
gpu_gemv_no_inplace = GpuGemv(inplace=False)
gpu_gemv_inplace = GpuGemv(inplace=True)
class GpuGer(Op):
    """
    implement ger on the gpu.

    Computes the rank-1 update z_out = z_in + alpha * outer(x, y) on the
    GPU by emitting C code that calls CudaNdarray_sger.  With inplace=True
    the output reuses the storage of the first input (declared via
    ``destroy_map``); otherwise the input matrix is copied first.
    """
    def __init__(self, inplace):
        # All state is set through __setstate__ so that construction and
        # unpickling share one code path.
        self.__setstate__({'inplace':inplace})
    def __str__(self):
        if self.inplace:
            return 'GpuGer{inplace}'
        else:
            return 'GpuGer{no_inplace}'
    def __eq__(self, other):
        # Two GpuGer instances are interchangeable iff they agree on
        # inplace-ness.
        return (type(self) == type(other)\
                and self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.inplace)
    def __setstate__(self, dct):
        # NOTE(review): defaults to inplace=True for pickles that lack the
        # 'inplace' key.  destroy_map tells Theano that output 0 overwrites
        # input 0 (z).
        inplace = dct.get('inplace', True)
        if inplace:
            self.destroy_map = {0: [0]}
        self.inplace = inplace
    def __getstate__(self):
        return dict(inplace=self.inplace)
    def make_node(self, z, a, x, y):
        # the more complicated error checking performed by tensor.ger is
        # assumed to already have been done
        return Apply(self, [z, a, x, y], [z.type()])
    def c_code_cache_version(self):
        # Empty tuple: disables caching of the generated C code.
        return ()
    def c_code(self, node, name, inputs, outputs, sub):
        # z_out = z_in + alpha * outer(x, y)   (ger has no beta scaling)
        # inplace version: set z_out = z_in
        # not inplace version: we copy z_in to z_out.
        z_in, a, x, y = inputs
        z_out, = outputs
        fail = sub['fail']
        sio = StringIO.StringIO()
        # NOTE: the templates below are expanded with ``% locals()`` at the
        # end, so the %(...)s keys must match the local names above.
        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        """
        if self.inplace:
            # Alias the output to the input matrix z_in.
            print >> sio, """
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
            """
        else:
            # Allocate (or reuse, if the shape already matches) an output
            # buffer and copy z_in into it.
            print >> sio, """
            if (!%(z_out)s
                || (%(z_out)s->nd != 2)
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
                )
            {
                Py_XDECREF(%(z_out)s);
                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
                if (!%(z_out)s)
                {
                    %(fail)s;
                }
            }
            else
            {
                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
                {
                    %(fail)s;
                }
            }
            """
        # The actual BLAS call; accumulates into z_out in both variants.
        print >> sio, """
        if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
        {
            %(fail)s;
        }
        """
        return sio.getvalue() % locals()
# Module-level singleton instances of GpuGer (non-destructive and in-place).
gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True)
class GpuOuter(Op): class GpuOuter(Op):
def make_node(self, x, y): def make_node(self, x, y):
# we suppose type checking has been done, but make sure. # we suppose type checking has been done, but make sure.
......
...@@ -2892,6 +2892,86 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -2892,6 +2892,86 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return 0; return 0;
} }
int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
{
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemv"); return -1; }
if (B->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
if (C->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
// We must allow dimensions to be zeros.
if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
|| (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0]))
{
PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemv (%i,%i)x(%i)->(%i)",
CudaNdarray_HOST_DIMS(A)[0],
CudaNdarray_HOST_DIMS(A)[1],
CudaNdarray_HOST_DIMS(B)[0],
CudaNdarray_HOST_DIMS(C)[0]);
return -1;
}
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
// TODO: make a copy instead of returning in error
if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1)) && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemv arg"); return -1; }
// I don't know if cudablas handles negative strides
if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0))
{
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)x(%i)->(%i)",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(C)[0]);
return -1;
}
/* create appropriate strides for malformed matrices that are row or column
* vectors
*/
int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : CudaNdarray_HOST_DIMS(B)[1];
int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : CudaNdarray_HOST_DIMS(C)[1];
if (sa_0 == 1)
{
cublasSgemv('N',
CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
alpha,
CudaNdarray_DEV_DATA(A), sa_1,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else if (sa_1 == 1)
{
cublasSgemv('T',
CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
alpha,
CudaNdarray_DEV_DATA(A), sa_0,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else
{
PyErr_SetString(PyExc_NotImplementedError, "too many strides strides in sgemv");
return -1;
}
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError, "cublassGemv failed (%s)",cudaGetErrorString(err));
return -1;
}
return 0;
}
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) { int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; } if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; } if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
......
...@@ -320,6 +320,7 @@ DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self); ...@@ -320,6 +320,7 @@ DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self); DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C); DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A); DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
......
...@@ -16,6 +16,10 @@ from theano.sandbox.cuda.basic_ops import * ...@@ -16,6 +16,10 @@ from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv) gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad) GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
...@@ -375,47 +379,82 @@ def local_gpu_dot22scalar(node): ...@@ -375,47 +379,82 @@ def local_gpu_dot22scalar(node):
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemv_as_gemm(node): def local_gpu_gemv(node):
""" """
gpu_from_host(gemv) -> gpu_gemv(gpu_from_host) gpu_from_host(gemv) -> gpu_gemv(gpu_from_host)
gemm(host_from_gpu) -> host_from_gpu(gpu_gemv) gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)
This optimization solves the vector-matrix multiplication issue by
transforming the vector into a matrix, apply gpudot22 and reshaping
the output.
A more suitable solution would be to use the right cublas call
""" """
gemvs = {tensor.blas.gemv_inplace: gpu_gemm_inplace, gemvs = {tensor.blas.gemv_inplace: gpu_gemv_inplace,
tensor.blas.gemv_no_inplace: gpu_gemm_no_inplace} tensor.blas.gemv_no_inplace: gpu_gemv_no_inplace,
tensor.blas_c.CGemv(inplace=True): gpu_gemv_inplace,
tensor.blas_c.CGemv(inplace=False): gpu_gemv_no_inplace,
}
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gemvs: if host_input.owner and host_input.owner.op in gemvs:
op = host_input.owner.op op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [ return [gemvs[op](
GpuDimShuffle((False,True),[0])(gemvs[op]( gpu_from_host(z)
GpuDimShuffle((False,),[0,'x'])(gpu_from_host(z))
, a , a
, gpu_from_host(x) , gpu_from_host(x)
, GpuDimShuffle((False,),[0,'x'])(gpu_from_host(y)) , gpu_from_host(y)
, b))] , b)]
if node.op in gemvs: if node.op in gemvs:
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu) x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu) y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu) z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(GpuDimShuffle((False,True),[0])( return [host_from_gpu(
gemvs[node.op]( gemvs[node.op](
GpuDimShuffle((False,),[0,'x'])(gpu_from_host(z)) gpu_from_host(z)
, a , a
, gpu_from_host(x) , gpu_from_host(x)
, GpuDimShuffle((False,),[0,'x'])(gpu_from_host(y)) , gpu_from_host(y)
, b)))] , b))]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_ger(node):
    """
    gpu_from_host(ger) -> gpu_ger(gpu_from_host)

    ger(host_from_gpu) -> host_from_gpu(gpu_ger)
    """
    # Map the CPU CGer ops to their GPU counterparts, preserving
    # destructiveness (in-place vs. not).
    gers = {
        tensor.blas_c.CGer(destructive=True): gpu_ger_inplace,
        tensor.blas_c.CGer(destructive=False): gpu_ger_no_inplace,
        }
    if node.op == gpu_from_host:
        # Case 1: a CPU ger result is being transferred to the GPU --
        # compute it on the GPU instead and transfer only the inputs.
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op in gers:
            op = host_input.owner.op
            z, a, x, y = host_input.owner.inputs
            # a (alpha) is a scalar and stays on the host.
            return [gers[op](
                gpu_from_host(z)
                , a
                , gpu_from_host(x)
                , gpu_from_host(y)
                )]
    if node.op in gers:
        # Case 2: a CPU ger consumes at least one value that already lives
        # on the GPU -- move the whole computation to the GPU.
        z, a, x, y = node.inputs
        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
        if x_on_gpu or y_on_gpu or z_on_gpu:
            return [host_from_gpu(
                gers[node.op](
                    gpu_from_host(z)
                    , a
                    , gpu_from_host(x)
                    , gpu_from_host(y)
                    ))]
    # No rewrite applies.
    return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemm(node): def local_gpu_gemm(node):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论