Commit 2ecf9f1d authored by abergeron, committed by GitHub

Merge pull request #6367 from notoraptor/fallback-gemv

Add fallback implementation for BLAS [sd]gemv_.
......@@ -444,7 +444,7 @@ def gemv_c_code(y, A, x, z, alpha, beta, fail,
dtype_%(x)s* x_data = (dtype_%(x)s*) PyArray_DATA(%(x)s);
dtype_%(z)s* z_data = (dtype_%(z)s*) PyArray_DATA(%(z)s);
// gemv expects pointers to the beginning of memory arrays,
// but numpy provides provides a pointer to the first element,
// but numpy provides a pointer to the first element,
// so when the stride is negative, we need to get the last one.
if (Sx < 0)
x_data += (NA1 - 1) * Sx;
......
......@@ -731,29 +731,26 @@ def cblas_header_text():
def blas_header_text():
"""C header for the fortran blas interface"""
gemm_code = ""
const = "const"
blas_code = ""
if not config.blas.ldflags:
# Include the Numpy version implementation of [sd]gemm_.
current_filedir = dirname(__file__)
gemm_common_filepath = os.path.join(current_filedir, 'c_code', 'alt_gemm_common.c')
gemm_template_filepath = os.path.join(current_filedir, 'c_code', 'alt_gemm_template.c')
blas_common_filepath = os.path.join(current_filedir, 'c_code', 'alt_blas_common.h')
blas_template_filepath = os.path.join(current_filedir, 'c_code', 'alt_blas_template.c')
common_code = ""
sgemm_code = ""
dgemm_code = ""
with open(gemm_common_filepath) as code:
sblas_code = ""
dblas_code = ""
with open(blas_common_filepath) as code:
common_code = code.read()
with open(gemm_template_filepath) as code:
with open(blas_template_filepath) as code:
template_code = code.read()
sgemm_code = template_code % {"float_type": "float", "float_size": 4, "npy_float": "NPY_FLOAT32", "name": "sgemm_"}
dgemm_code = template_code % {"float_type": "double", "float_size": 8, "npy_float": "NPY_FLOAT64", "name": "dgemm_"}
if not common_code or not sgemm_code:
raise IOError("Unable to load NumPy implementation of gemm code from C source files.")
else:
const = ""
gemm_code += common_code
gemm_code += sgemm_code
gemm_code += dgemm_code
sblas_code = template_code % {"float_type": "float", "float_size": 4, "npy_float": "NPY_FLOAT32", "precision": "s"}
dblas_code = template_code % {"float_type": "double", "float_size": 8, "npy_float": "NPY_FLOAT64", "precision": "d"}
if not common_code or not template_code:
raise IOError("Unable to load NumPy implementation of BLAS functions from C source files.")
blas_code += common_code
blas_code += sblas_code
blas_code += dblas_code
header = """
extern "C"
......@@ -916,7 +913,7 @@ def blas_header_text():
/* Single Precision */
void sgemm_(char*, char*, const int*, const int*, const int*, const float *, %(const)s float *, const int*, %(const)s float *, const int*, const float *, float *, const int*);
void sgemm_(char*, char*, const int*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*);
void ssymm_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*);
void ssyrk_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, float *, const int*);
void ssyr2k_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*);
......@@ -925,7 +922,7 @@ def blas_header_text():
/* Double Precision */
void dgemm_(char*, char*, const int*, const int*, const int*, const double *, %(const)s double *, const int*, %(const)s double *, const int*, const double *, double *, const int*);
void dgemm_(char*, char*, const int*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*);
void dsymm_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*);
void dsyrk_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, double *, const int*);
void dsyr2k_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*);
......@@ -984,7 +981,11 @@ def blas_header_text():
}
""")
return (header % {'const': const}) + gemm_code
return header + blas_code
if not config.blas.ldflags:
_logger.warning('Using NumPy C-API based implementation for BLAS functions.')
def mkl_threads_text():
......@@ -1032,7 +1033,7 @@ def openblas_threads_text():
def blas_header_version():
# Version for the base header
version = (2,)
version = (5,)
if detect_macos_sdot_bug():
if detect_macos_sdot_bug.fix_works:
# Version with fix
......
/** C Implementation (with NumPy back-end) of BLAS functions used in Theano.
* Used instead of BLAS when Theano flag ``blas.ldflags`` is empty.
* This file contains some useful header code not templated.
* File alt_blas_template.c currently contains template code for:
* - [sd]gemm_
* - [sd]gemv_
* - [sd]dot_
**/
#define alt_fatal_error(message) { if (PyErr_Occurred()) PyErr_Print(); if(message != NULL) fprintf(stderr, message); exit(-1); }
#define alt_trans_to_bool(trans) (*trans != 'N' && *trans != 'n')
/**Template code for BLAS functions follows in file alt_blas_template.c
* (as Python string to be used with old formatting).
* PARAMETERS:
* float_type: "float" or "double".
* float_size: 4 for float32 (sgemm_), 8 for float64 (dgemm_).
* npy_float: "NPY_FLOAT32" or "NPY_FLOAT64".
* precision: "s" for single, "d" for double.
* See blas_headers.py for current use.**/
/** %(name)s **/
/** Alternative template NumPy-based implementation of BLAS functions used in Theano. **/
/* Scalar*Matrix function.
* Computes: matrix = scalar*matrix. */
/* Scalar * Matrix function.
* Computes: matrix = scalar * matrix. */
void alt_numpy_scale_matrix_inplace_%(float_type)s(const %(float_type)s* scalar, PyArrayObject* matrix) {
NpyIter* iterator = NpyIter_New(matrix,
NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NpyIter* iterator = NpyIter_New(matrix,
NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL);
if(iterator == NULL)
alt_fatal_error("Unable to iterate over a matrix "
......@@ -25,7 +25,8 @@ void alt_numpy_scale_matrix_inplace_%(float_type)s(const %(float_type)s* scalar,
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
}
/* Matrix+Matrix function.
/* Matrix + Matrix function.
* Computes: matrix2 = (scalar1 * matrix1) + (scalar2 * matrix2) */
void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
const %(float_type)s* scalar1, PyArrayObject* matrix1,
......@@ -48,11 +49,12 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
} while(get_next(iterators));
NpyIter_Deallocate(iterators);
}
/* NumPy Wrapping function. Wraps a data into a NumPy's PyArrayObject.
* By default, data is considered as Fortran-style array (column by column).
* If to_transpose, data will be considered as C-style array (row by row)
* with dimensions reversed. */
PyObject* alt_op_%(float_type)s(int to_transpose, %(float_type)s* M, int nrow, int ncol, int LDM) {
PyObject* alt_op_%(float_type)s(int to_transpose, %(float_type)s* M, int nrow, int ncol, int LDM, int numpyFlags) {
npy_intp dims[2];
npy_intp strides[2];
if(to_transpose) {
......@@ -66,9 +68,10 @@ PyObject* alt_op_%(float_type)s(int to_transpose, %(float_type)s* M, int nrow, i
strides[0] = %(float_size)d;
strides[1] = LDM * %(float_size)d;
}
return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, M, 0, 0, NULL);
return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, M, 0, numpyFlags, NULL);
}
/* Special wrapping case used for matrix C in gemm implementation. */
/* Special wrapping case used for matrix C in gemm_ implementation. */
inline PyObject* alt_wrap_fortran_writeable_matrix_%(float_type)s(
%(float_type)s* matrix, const int* nrow, const int* ncol, const int* LD
) {
......@@ -76,15 +79,16 @@ inline PyObject* alt_wrap_fortran_writeable_matrix_%(float_type)s(
npy_intp strides[2] = {%(float_size)d, (*LD) * %(float_size)d};
return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, matrix, 0, NPY_ARRAY_WRITEABLE, NULL);
}
/* %(name)s template code */
void %(name)s(
/* gemm */
void %(precision)sgemm_(
char* TRANSA, char* TRANSB, const int* M, const int* N, const int* K,
const %(float_type)s* ALPHA, %(float_type)s* A, const int* LDA,
%(float_type)s* B, const int* LDB, const %(float_type)s* BETA,
const %(float_type)s* ALPHA, %(float_type)s* A, const int* LDA,
%(float_type)s* B, const int* LDB, const %(float_type)s* BETA,
%(float_type)s* C, const int* LDC
) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
alt_fatal_error("The integer arguments passed to %(name)s must all be at least 0.");
alt_fatal_error("The integer arguments passed to %(precision)sgemm_ must all be at least 0.");
/* If M or N is null, there is nothing to do with C,
* as C should contain M*N == 0 items. */
if(*M == 0 || *N == 0)
......@@ -136,13 +140,15 @@ void %(name)s(
* for A and B will be reversed, so that the buffer will contain
* C-contiguous opB_transposed * opA_transposed (N*M matrix).
* After that, the code that uses the buffer (either the code calling
* this function, or this function if BETA != 0) just has to
* this function, or this function if BETA != 0) just has to
* consider the buffer as a F-contiguous M*N matrix, so that
* it will get the transposed of op_B_transposed * op_A_transposed,
* that is op_A * op_B (M*N matrix) as expected. */
PyObject* opA_transposed = alt_op_%(float_type)s(!to_transpose_A, A, nrowa, ncola, *LDA);
PyObject* opB_transposed = alt_op_%(float_type)s(!to_transpose_B, B, nrowb, ncolb, *LDB);
PyObject* opB_trans_dot_opA_trans = PyArray_New(&PyArray_Type, 2, computation_dims, %(npy_float)s, computation_strides, computation_pointer, 0, computation_flags, NULL);
PyObject* opA_transposed = alt_op_%(float_type)s(!to_transpose_A, A, nrowa, ncola, *LDA, 0);
PyObject* opB_transposed = alt_op_%(float_type)s(!to_transpose_B, B, nrowb, ncolb, *LDB, 0);
PyObject* opB_trans_dot_opA_trans = PyArray_New(&PyArray_Type, 2, computation_dims, %(npy_float)s,
computation_strides, computation_pointer, 0,
computation_flags, NULL);
PyArray_MatrixProduct2(opB_transposed, opA_transposed, (PyArrayObject*)opB_trans_dot_opA_trans);
/* PyArray_MatrixProduct2 adds a reference to the output array,
* which we need to remove to avoid a memory leak. */
......@@ -156,7 +162,7 @@ void %(name)s(
PyObject* matrix_C = alt_wrap_fortran_writeable_matrix_%(float_type)s(C, M, N, LDC);
PyObject* alpha_opA_dot_opB = PyArray_Transpose((PyArrayObject*)opB_trans_dot_opA_trans, NULL);
if(0 != PyArray_CopyInto((PyArrayObject*)matrix_C, (PyArrayObject*)alpha_opA_dot_opB))
alt_fatal_error("NumPy %(name)s implementation: unable to copy ALPHA*op(A)*op(B) into C when BETA == 0.");
alt_fatal_error("NumPy %(precision)sgemm_ implementation: unable to copy ALPHA*op(A)*op(B) into C when BETA == 0.");
Py_XDECREF(alpha_opA_dot_opB);
Py_XDECREF(matrix_C);
}
......@@ -164,7 +170,8 @@ void %(name)s(
/* C is read, so we must consider it as Fortran-style matrix. */
PyObject* matrix_C = alt_wrap_fortran_writeable_matrix_%(float_type)s(C, M, N, LDC);
PyObject* opA_dot_opB = PyArray_Transpose((PyArrayObject*)opB_trans_dot_opA_trans, NULL);
alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opA_dot_opB, BETA, (PyArrayObject*)matrix_C);
alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opA_dot_opB,
BETA, (PyArrayObject*)matrix_C);
Py_XDECREF(opA_dot_opB);
Py_XDECREF(matrix_C);
}
......@@ -172,3 +179,137 @@ void %(name)s(
Py_XDECREF(opB_transposed);
Py_XDECREF(opA_transposed);
}
/* gemv: y = ALPHA * op(A) * x + BETA * y (NumPy-based fallback for BLAS [sd]gemv_). */
void %(precision)sgemv_(
    char* TRANS,
    const int* M,
    const int* N,
    const %(float_type)s* ALPHA,
    %(float_type)s* A,
    const int* LDA,
    %(float_type)s* x,
    const int* incx,
    const %(float_type)s* BETA,
    %(float_type)s* y,
    const int* incy
) {
    /**
    If TRANS is 'n' or 'N', computes:
        y = ALPHA * A * x + BETA * y
    Else, computes:
        y = ALPHA * A.T * x + BETA * y
    A is a M*N matrix (column-major, leading dimension LDA), A.T is A transposed,
    x, y are strided vectors (strides incx, incy),
    ALPHA, BETA are scalars.
    **/
    // If alpha == 0 and beta == 1, we have nothing to do, as alpha*A*x + beta*y == y.
    if (*ALPHA == 0 && *BETA == 1)
        return;
    if (*M < 0 || *N < 0 || *LDA < 0)
        alt_fatal_error("NumPy %(precision)sgemv_ implementation: M, N and LDA must be at least 0.");
    if (*incx == 0 || *incy == 0)
        alt_fatal_error("NumPy %(precision)sgemv_ implementation: incx and incy must not be 0.");
    int transpose = alt_trans_to_bool(TRANS);
    // op(A) has size_y rows and size_x columns.
    int size_x = 0, size_y = 0;
    if (transpose) {
        size_x = *M;
        size_y = *N;
    } else {
        size_x = *N;
        size_y = *M;
    }
    if (*M == 0 || *N == 0) {
        /* A contains M * N == 0 values. y should be empty too, and we have nothing to do. */
        if (size_y != 0)
            alt_fatal_error("NumPy %(precision)sgemv_ implementation: the output vector should be empty.");
        return;
    }
    /* Vector pointers point to the beginning of memory (see function `theano.tensor.blas_c.gemv_c_code`),
     * while NumPy expects a pointer to the first element of the array,
     * so for a negative stride we must advance to the last element. */
    if (*incx < 0)
        x += (size_x - 1) * (-*incx);
    if (*incy < 0)
        y += (size_y - 1) * (-*incy);
    PyObject* matrixA = alt_op_%(float_type)s(transpose, A, *M, *N, *LDA, 0);
    PyObject* matrixX = alt_op_%(float_type)s(1, x, 1, size_x, *incx, 0);
    PyObject* matrixY = alt_op_%(float_type)s(1, y, 1, size_y, *incy, NPY_ARRAY_WRITEABLE);
    if (matrixA == NULL || matrixX == NULL || matrixY == NULL)
        /* BUGFIX: this statement was missing its terminating semicolon; it only
         * compiled because alt_fatal_error expands to a braced block. */
        alt_fatal_error("NumPy %(precision)sgemv_ implementation: unable to wrap A, x or y arrays.");
    if (*ALPHA == 0) {
        // Just BETA * y.
        alt_numpy_scale_matrix_inplace_%(float_type)s(BETA, (PyArrayObject*)matrixY);
    } else if (*BETA == 0) {
        // We can directly compute alpha * A * x into y if y is C-contiguous.
        if (PyArray_IS_C_CONTIGUOUS((PyArrayObject*)matrixY)) {
            PyArray_MatrixProduct2(matrixA, matrixX, (PyArrayObject*)matrixY);
            // PyArray_MatrixProduct2 adds an extra reference to the output array;
            // drop it now so matrixY keeps a single owned reference.
            Py_XDECREF(matrixY);
            alt_numpy_scale_matrix_inplace_%(float_type)s(ALPHA, (PyArrayObject*)matrixY);
        } else {
            // If y is not contiguous, we need a temporary workspace.
            PyObject* tempAX = PyArray_MatrixProduct(matrixA, matrixX);
            if (tempAX == NULL)
                alt_fatal_error("NumPy %(precision)sgemv_ implementation: Unable to get matrix product.");
            alt_numpy_scale_matrix_inplace_%(float_type)s(ALPHA, (PyArrayObject*)tempAX);
            if(0 != PyArray_CopyInto((PyArrayObject*)matrixY, (PyArrayObject*)tempAX)) {
                alt_fatal_error("NumPy %(precision)sgemv_ implementation: unable to update output.");
            }
            Py_XDECREF(tempAX);
        }
    } else {
        // We must perform the full computation.
        PyObject* tempAX = PyArray_MatrixProduct(matrixA, matrixX);
        if (tempAX == NULL)
            alt_fatal_error("NumPy %(precision)sgemv_ implementation: unable to get matrix product.");
        // ALPHA * (A * x) + BETA * y.
        alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, (PyArrayObject*)tempAX,
                                                             BETA, (PyArrayObject*)matrixY);
        Py_XDECREF(tempAX);
    }
    Py_XDECREF(matrixY);
    Py_XDECREF(matrixX);
    Py_XDECREF(matrixA);
}
/* dot: returns the scalar product of two strided vectors SX and SY of length N.
 * NumPy-based fallback for BLAS [sd]dot_; implemented as a (1, N) x (N, 1)
 * matrix product whose (1, 1) output wraps the local `result` variable. */
%(float_type)s %(precision)sdot_(
    const int* N,
    %(float_type)s *SX,
    const int *INCX,
    %(float_type)s *SY,
    const int *INCY
) {
    if (*N < 0)
        alt_fatal_error("NumPy %(precision)sdot_ implementation: N must be at least 0.");
    if (*INCX == 0 || *INCY == 0)
        alt_fatal_error("NumPy %(precision)sdot_ implementation: INCX and INCY must not be 0.");
    %(float_type)s result = 0;
    int one = 1;
    /* Vector pointers point to the beginning of memory (see function `theano.tensor.blas_c.gemv_c_code`),
     * while NumPy expects a pointer to the first element of the array,
     * so for a negative stride we must advance to the last element. */
    if (*INCX < 0)
        SX += (*N - 1) * (-*INCX);
    if (*INCY < 0)
        SY += (*N - 1) * (-*INCY);
    // Create vector_x with shape (1, N)
    PyObject* vector_x = alt_op_%(float_type)s(0, SX, 1, *N, *INCX, 0);
    // Create vector_y with shape (N, 1)
    PyObject* vector_y = alt_op_%(float_type)s(1, SY, 1, *N, *INCY, 0);
    // Create output scalar z with shape (1, 1) to wrap `result`.
    PyArrayObject* dot_product = (PyArrayObject*)alt_wrap_fortran_writeable_matrix_%(float_type)s(&result, &one, &one, &one);
    if (vector_x == NULL || vector_y == NULL || dot_product == NULL)
        alt_fatal_error("NumPy %(precision)sdot_ implementation: unable to wrap x, y or output arrays.");
    // Compute matrix product: (1, N) * (N, 1) => (1, 1), written directly into `result`.
    PyArray_MatrixProduct2(vector_x, vector_y, dot_product);
    if (PyErr_Occurred())
        alt_fatal_error("NumPy %(precision)sdot_ implementation: unable to compute dot.");
    // Release the wrappers (dot_product also drops the extra reference
    // added by PyArray_MatrixProduct2) and return the computed scalar.
    Py_XDECREF(dot_product);
    Py_XDECREF(vector_y);
    Py_XDECREF(vector_x);
    return result;
}
/** C Implementation of [sd]gemm_ based on NumPy
* Used instead of blas when Theano config flag blas.ldflags is empty.
* This file contains the common code for [sd]gemm_.
* File alt_gemm_template.c contains template code for [sd]gemm_. **/
#define alt_fatal_error(message) { if(message != NULL) fprintf(stderr, message); exit(-1); }
#define alt_trans_to_bool(trans) (*trans != 'N' && *trans != 'n')
/**Template code for [sd]gemm_ follows in file alt_gemm_template.c
* (as Python string to be used with old formatting).
* PARAMETERS:
* float_type: "float" for sgemm_, "double" for dgemm_.
* float_size: 4 for float32 (sgemm_), 8 for float64 (dgemm_).
* npy_float: "NPY_FLOAT32" for sgemm_, "NPY_FLOAT64" for dgemm_.
* name: "sgemm_" for sgemm_, "dgemm_" for dgemm_.
* See blas_headers.py for current use.**/
......@@ -316,5 +316,92 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
skip_if_blas_ldflags_empty()
class TestCGemvNoFlags(object):
    """Exercise CGemv with ``blas.ldflags`` empty, i.e. through the
    NumPy-based fallback implementation of [sd]gemv_."""

    mode = mode_blas_opt
    gemv = CGemv(inplace=False)
    M = 4
    N = 5
    slice_step = 3

    def setUp(self):
        unittest_tools.seed_rng()

    def get_function(self, dtype, transpose_A=False, slice_tensors=False):
        """Compile and return a function computing gemv(y, alpha, op(A), x, beta)."""
        alpha = theano.tensor.scalar(dtype=dtype)
        beta = theano.tensor.scalar(dtype=dtype)
        A = theano.tensor.matrix(dtype=dtype)
        x = theano.tensor.vector(dtype=dtype)
        y = theano.tensor.vector(dtype=dtype)
        mat = A.T if transpose_A else A
        if slice_tensors:
            # Negative-step slices produce non-contiguous, reversed views.
            mat = mat[::-self.slice_step]
            vec_x = x[::-self.slice_step]
            vec_y = y[::-self.slice_step]
        else:
            vec_x = x
            vec_y = y
        output = self.gemv(vec_y, alpha, mat, vec_x, beta)
        return theano.function([alpha, A, x, beta, y], output)

    def get_data(self, dtype, alpha, beta, transpose_A=False, slice_tensors=False):
        """Return random inputs (alpha, A, x, beta, y) shaped so that, after
        the optional slicing, op(A) is (M, N), x is (N,) and y is (M,)."""
        factor = self.slice_step if slice_tensors else 1
        if transpose_A:
            A_shape = (self.N, self.M * factor)
        else:
            A_shape = (self.M * factor, self.N)
        x_shape = (self.N * factor,)
        y_shape = (self.M * factor,)
        A = np.random.random(A_shape).astype(dtype)
        x = np.random.random(x_shape).astype(dtype)
        y = np.random.random(y_shape).astype(dtype)
        return (alpha, A, x, beta, y)

    def compute_ref(self, alpha, A, x, beta, y, transpose_A, slice_tensors):
        """Reference result computed with NumPy only."""
        if transpose_A:
            A = A.T
        if slice_tensors:
            A = A[::-self.slice_step]
            x = x[::-self.slice_step]
            y = y[::-self.slice_step]
        ref_val = alpha * np.dot(A, x)
        return ref_val + beta * y if beta != 0 else ref_val

    @theano.change_flags({'blas.ldflags': ''})
    def run_cgemv(self, dtype, ALPHA, BETA, transpose_A, slice_tensors):
        f = self.get_function(dtype,
                              transpose_A=transpose_A,
                              slice_tensors=slice_tensors)
        inputs = self.get_data(dtype, ALPHA, BETA,
                               transpose_A=transpose_A,
                               slice_tensors=slice_tensors)
        # The optimized graph must actually contain a CGemv node.
        assert any(isinstance(node.op, CGemv) for node in f.maker.fgraph.apply_nodes)
        z_val = f(*inputs)
        assert z_val.dtype == dtype
        assert z_val.ndim == 1
        assert z_val.shape[0] == self.M
        expected = self.compute_ref(*(inputs + (transpose_A, slice_tensors)))
        unittest_tools.assert_allclose(expected, z_val)

    def test_cgemv(self):
        # Nose-style generator: one sub-test per parameter combination.
        for dtype in ('float32', 'float64'):
            for alpha in (0, 1, -2):
                for beta in (0, 1, -2):
                    for transpose_A in (False, True):
                        for slice_tensors in (False, True):
                            yield (self.run_cgemv, dtype, alpha, beta,
                                   transpose_A, slice_tensors)
class TestSdotNoFlags(TestCGemvNoFlags):
    # With M == 1 the gemv output is a length-1 vector, which exercises
    # the dot-product-shaped case of the fallback implementation.
    M = 1
class TestBlasStridesC(TestBlasStrides):
    # Re-run the generic stride tests under the BLAS-optimized compilation mode.
    mode = mode_blas_opt
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论