Commit f2743791 authored by James Bergstra

Merge pull request #375 from nouiz/c_blas

additions and improvements to blas on both CPU and GPU (including GEMV, GER in blas_c)
...@@ -251,6 +251,188 @@ class GpuGemm(Op): ...@@ -251,6 +251,188 @@ class GpuGemm(Op):
gpu_gemm_no_inplace = GpuGemm(inplace=False) gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True) gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuGemv(Op):
    """
    implement gemv on the gpu.

    Computes z_out = alpha * dot(x, y) + beta * z_in on CudaNdarrays by
    emitting C code that calls CudaNdarray_sgemv; the Op inputs are
    (z, a, x, y, b) where x is the matrix and y, z are vectors,
    mirroring tensor.blas.gemv.
    """
    def __init__(self, inplace):
        # Delegate to __setstate__ so construction and unpickling share
        # the same destroy_map setup.
        self.__setstate__({'inplace':inplace})

    def __str__(self):
        if self.inplace:
            return 'GpuGemv{inplace}'
        else:
            return 'GpuGemv{no_inplace}'

    def __eq__(self, other):
        # Equal iff same class and same inplace flag.
        return (type(self) == type(other)\
                and self.inplace == other.inplace)

    def __hash__(self):
        # Must stay consistent with __eq__.
        return hash(type(self)) ^ hash(self.inplace)

    def __setstate__(self, dct):
        # Old pickles may lack the flag; default to inplace=True.
        inplace = dct.get('inplace', True)
        if inplace:
            # Output 0 reuses (destroys) input 0, i.e. z.
            self.destroy_map = {0: [0]}
        self.inplace = inplace

    def __getstate__(self):
        return dict(inplace=self.inplace)

    def make_node(self, z, a, x, y, b):
        # the more complicated error checking performed by tensor.gemv is
        # assumed to already have been done
        return Apply(self, [z, a, x, y, b], [z.type()])

    def c_code_cache_version(self):
        return (1,)

    def c_code(self, node, name, inputs, outputs, sub):
        # z_out = alpha * dot(x,y) + beta * z_in
        # inplace version: set z_out = z_in
        # not inplace version: we copy z_in to z_out.
        z_in, a, x, y, b = inputs
        z_out, = outputs
        fail = sub['fail']
        sio = StringIO.StringIO()
        # Read the scalar coefficients from the host-side buffers.
        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
        """
        if self.inplace:
            # Alias the output to the input (refcount bookkeeping only).
            print >> sio, """
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
            """
        else:
            # Reuse a previously-allocated output of the right shape when
            # possible, otherwise allocate a fresh copy of z_in.
            print >> sio, """
            if (!%(z_out)s
                || (%(z_out)s->nd != 1)
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                )
            {
                Py_XDECREF(%(z_out)s);
                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
                if (!%(z_out)s)
                {
                    %(fail)s;
                }
            }
            else
            {
                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
                {
                    %(fail)s;
                }
            }
            """
        print >> sio, """
        if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
        {
            %(fail)s;
        }
        """
        return sio.getvalue() % locals()


# Singleton instances used by the optimizations below.
gpu_gemv_no_inplace = GpuGemv(inplace=False)
gpu_gemv_inplace = GpuGemv(inplace=True)
class GpuGer(Op):
    """
    implement ger on the gpu.

    Computes the rank-1 update z_out = alpha * outer(x, y) + z_in on
    CudaNdarrays by emitting C code that calls CudaNdarray_sger; the Op
    inputs are (z, a, x, y) where z is a matrix and x, y are vectors.
    """
    def __init__(self, inplace):
        # Delegate to __setstate__ so construction and unpickling share
        # the same destroy_map setup.
        self.__setstate__({'inplace':inplace})

    def __str__(self):
        if self.inplace:
            return 'GpuGer{inplace}'
        else:
            return 'GpuGer{no_inplace}'

    def __eq__(self, other):
        # Equal iff same class and same inplace flag.
        return (type(self) == type(other)\
                and self.inplace == other.inplace)

    def __hash__(self):
        # Must stay consistent with __eq__.
        return hash(type(self)) ^ hash(self.inplace)

    def __setstate__(self, dct):
        # Old pickles may lack the flag; default to inplace=True.
        inplace = dct.get('inplace', True)
        if inplace:
            # Output 0 reuses (destroys) input 0, i.e. z.
            self.destroy_map = {0: [0]}
        self.inplace = inplace

    def __getstate__(self):
        return dict(inplace=self.inplace)

    def make_node(self, z, a, x, y):
        # the more complicated error checking performed by tensor.ger is
        # assumed to already have been done
        return Apply(self, [z, a, x, y], [z.type()])

    def c_code_cache_version(self):
        return (1,)

    def c_code(self, node, name, inputs, outputs, sub):
        # z_out = alpha * outer(x,y) + z_in  (no beta term for ger)
        # inplace version: set z_out = z_in
        # not inplace version: we copy z_in to z_out.
        z_in, a, x, y = inputs
        z_out, = outputs
        fail = sub['fail']
        sio = StringIO.StringIO()
        # Read the scalar coefficient from the host-side buffer.
        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        """
        if self.inplace:
            # Alias the output to the input (refcount bookkeeping only).
            print >> sio, """
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
            """
        else:
            # Reuse a previously-allocated output of the right shape when
            # possible, otherwise allocate a fresh copy of z_in.
            print >> sio, """
            if (!%(z_out)s
                || (%(z_out)s->nd != 2)
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
                )
            {
                Py_XDECREF(%(z_out)s);
                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
                if (!%(z_out)s)
                {
                    %(fail)s;
                }
            }
            else
            {
                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
                {
                    %(fail)s;
                }
            }
            """
        print >> sio, """
        if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
        {
            %(fail)s;
        }
        """
        return sio.getvalue() % locals()


# Singleton instances used by the optimizations below.
gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True)
class GpuOuter(Op): class GpuOuter(Op):
def make_node(self, x, y): def make_node(self, x, y):
# we suppose type checking has been done, but make sure. # we suppose type checking has been done, but make sure.
......
...@@ -3012,6 +3012,92 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -3012,6 +3012,92 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return 0; return 0;
} }
int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
{
    /**
     * C <- alpha A B + beta C
     * A : matrix
     * B, C: vector
     * alpha, beta: scalars
     *
     * Returns 0 on success, -1 (with a Python exception set) on failure.
     */
    if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemv"); return -1; }
    if (B->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
    if (C->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }

    // We must allow dimensions to be zeros.
    if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
        || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0]))
    {
        PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemv (%i,%i)x(%i)->(%i)",
                     CudaNdarray_HOST_DIMS(A)[0],
                     CudaNdarray_HOST_DIMS(A)[1],
                     CudaNdarray_HOST_DIMS(B)[0],
                     CudaNdarray_HOST_DIMS(C)[0]);
        return -1;
    }

    // If the matrix has non-unit size and non-unit stride in both directions,
    // neither axis is contiguous and cublas cannot consume it directly.
    // TODO: make a copy instead of returning in error
    if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1))
        && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1)))
    {
        PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemv arg");
        return -1;
    }

    // I don't know if cudablas handles negative strides
    if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
        || (CudaNdarray_HOST_STRIDES(A)[1] < 0)
        || (CudaNdarray_HOST_STRIDES(B)[0] < 0)
        || (CudaNdarray_HOST_STRIDES(C)[0] < 0))
    {
        PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)x(%i)->(%i)",
                     CudaNdarray_HOST_STRIDES(A)[0],
                     CudaNdarray_HOST_STRIDES(A)[1],
                     CudaNdarray_HOST_STRIDES(B)[0],
                     CudaNdarray_HOST_STRIDES(C)[0]);
        return -1;
    }

    /* create appropriate strides for malformed matrices that are row or column
     * vectors.
     * NOTE(review): for a degenerate axis (dim <= 1) the stride never affects
     * the result; the substituted values look chosen to satisfy cublas'
     * lda >= max(1, rows) constraint -- confirm against cublas docs.
     */
    int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
    int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
    int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
    int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;

    if (sa_0 == 1)
    {
        // Rows are contiguous along axis 0: A is laid out column-major,
        // so call the non-transposed kernel with lda = column stride.
        cublasSgemv('N',
                    CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
                    alpha,
                    CudaNdarray_DEV_DATA(A), sa_1,
                    CudaNdarray_DEV_DATA(B), sb_0,
                    beta,
                    CudaNdarray_DEV_DATA(C), sc_0);
    }
    else if (sa_1 == 1)
    {
        // Rows are contiguous along axis 1 (C/row-major layout): present A
        // to cublas as its column-major transpose and ask for 'T'.
        cublasSgemv('T',
                    CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
                    alpha,
                    CudaNdarray_DEV_DATA(A), sa_0,
                    CudaNdarray_DEV_DATA(B), sb_0,
                    beta,
                    CudaNdarray_DEV_DATA(C), sc_0);
    }
    else
    {
        // Fixed message typo ("strides strides").
        PyErr_SetString(PyExc_NotImplementedError, "too many strides in sgemv");
        return -1;
    }
    CNDA_THREAD_SYNC;
    cudaError_t err = cudaGetLastError();
    // BUGFIX: err is a cudaError_t, so compare against cudaSuccess rather
    // than CUBLAS_STATUS_SUCCESS (a value from the unrelated cublasStatus
    // enum -- both happen to be 0, but the mixed comparison is incorrect).
    if (cudaSuccess != err)
    {
        // Fixed message typo ("cublassGemv").
        PyErr_Format(PyExc_RuntimeError, "cublasSgemv failed (%s)", cudaGetErrorString(err));
        return -1;
    }
    return 0;
}
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) { int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; } if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; } if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
...@@ -3033,17 +3119,50 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray ...@@ -3033,17 +3119,50 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger"); PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger");
return -1; return -1;
} }
// Since Sger expects A in col-major, we invert x and y to fake this.
int x_strides = CudaNdarray_HOST_STRIDES(x)[0];
CudaNdarray * x_ = x;
if(x_strides == 0){
if(CudaNdarray_HOST_DIMS(x)[0] != 1){
PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_sger: Invalid input x(should not happen)."
" We received an CudaNdarray vector with a stride of 0"
" that have more then 1 elements!");
return -1;
}
x_strides = 4;
} else if(x_strides < 0){
x_ = (CudaNdarray*)CudaNdarray_Copy(x);
x_strides = CudaNdarray_HOST_STRIDES(x_)[0];
}
// Same for this, be safe int y_strides = CudaNdarray_HOST_STRIDES(y)[0];
assert (CudaNdarray_HOST_STRIDES(x)[0] >= 0); CudaNdarray * y_ = y;
assert (CudaNdarray_HOST_STRIDES(y)[0] >= 0); if(y_strides == 0){
if(CudaNdarray_HOST_DIMS(y)[0] != 1){
PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_sger: Invalid input y(should not happen)."
" We received an CudaNdarray vector with a stride of 0"
" that have more then 1 elements!");
return -1;
}
y_strides = 4;
} else if(y_strides < 0){
y_ = (CudaNdarray*)CudaNdarray_Copy(y);
y_strides = CudaNdarray_HOST_STRIDES(y_)[0];
}
// Since Sger expects A in col-major, we invert x and y to fake this. if(CudaNdarray_SIZE(A)){
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha, cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
CudaNdarray_DEV_DATA(y), CudaNdarray_HOST_STRIDES(y)[0], CudaNdarray_DEV_DATA(y_), y_strides,
CudaNdarray_DEV_DATA(x), CudaNdarray_HOST_STRIDES(x)[0], CudaNdarray_DEV_DATA(x_), x_strides,
CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]); CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]);
}
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
if(x_ != x)
Py_DECREF(x_);
if(y_ != y)
Py_DECREF(y_);
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (CUBLAS_STATUS_SUCCESS != err) if (CUBLAS_STATUS_SUCCESS != err)
......
...@@ -320,6 +320,7 @@ DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self); ...@@ -320,6 +320,7 @@ DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self); DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C); DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A); DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
......
...@@ -16,6 +16,10 @@ from theano.sandbox.cuda.basic_ops import * ...@@ -16,6 +16,10 @@ from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv) gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad) GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
...@@ -375,47 +379,85 @@ def local_gpu_dot22scalar(node): ...@@ -375,47 +379,85 @@ def local_gpu_dot22scalar(node):
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemv_as_gemm(node): def local_gpu_gemv(node):
""" """
gpu_from_host(gemv) -> gpu_gemv(gpu_from_host) gpu_from_host(gemv) -> gpu_gemv(gpu_from_host)
gemm(host_from_gpu) -> host_from_gpu(gpu_gemv) gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)
This optimization solves the vector-matrix multiplication issue by
transforming the vector into a matrix, apply gpudot22 and reshaping
the output.
A more suitable solution would be to use the right cublas call
""" """
gemvs = {tensor.blas.gemv_inplace: gpu_gemm_inplace, gemvs = {
tensor.blas.gemv_no_inplace: gpu_gemm_no_inplace} tensor.blas.gemv_inplace: gpu_gemv_no_inplace,
tensor.blas.gemv_no_inplace: gpu_gemv_no_inplace,
tensor.blas_c.CGemv(inplace=True): gpu_gemv_no_inplace,
tensor.blas_c.CGemv(inplace=False): gpu_gemv_no_inplace,
}
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gemvs: if host_input.owner and host_input.owner.op in gemvs:
op = host_input.owner.op op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [ return [gemvs[op](
GpuDimShuffle((False,True),[0])(gemvs[op]( gpu_from_host(z)
GpuDimShuffle((False,),[0,'x'])(gpu_from_host(z))
, a , a
, gpu_from_host(x) , gpu_from_host(x)
, GpuDimShuffle((False,),[0,'x'])(gpu_from_host(y)) , gpu_from_host(y)
, b))] , b)]
if node.op in gemvs: if node.op in gemvs:
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu) x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu) y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu) z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(GpuDimShuffle((False,True),[0])( return [host_from_gpu(
gemvs[node.op]( gemvs[node.op](
GpuDimShuffle((False,),[0,'x'])(gpu_from_host(z)) gpu_from_host(z)
, a , a
, gpu_from_host(x) , gpu_from_host(x)
, GpuDimShuffle((False,),[0,'x'])(gpu_from_host(y)) , gpu_from_host(y)
, b)))] , b))]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_ger(node):
"""
gpu_from_host(ger) -> gpu_ger(gpu_from_host)
ger(host_from_gpu) -> host_from_gpu(gpu_ger)
"""
gers = {
tensor.blas_c.CGer(destructive=True): gpu_ger_no_inplace,
tensor.blas_c.CGer(destructive=False): gpu_ger_no_inplace,
tensor.blas.Ger(destructive=True): gpu_ger_no_inplace,
tensor.blas.Ger(destructive=False): gpu_ger_no_inplace,
}
if node.op == gpu_from_host:
host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gers:
op = host_input.owner.op
z, a, x, y = host_input.owner.inputs
return [gers[op](
gpu_from_host(z)
, a
, gpu_from_host(x)
, gpu_from_host(y)
)]
if node.op in gers:
z, a, x, y = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(
gers[node.op](
gpu_from_host(z)
, a
, gpu_from_host(x)
, gpu_from_host(y)
))]
return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemm(node): def local_gpu_gemm(node):
...@@ -424,7 +466,8 @@ def local_gpu_gemm(node): ...@@ -424,7 +466,8 @@ def local_gpu_gemm(node):
gemm(host_from_gpu) -> host_from_gpu(gpu_gemm) gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
""" """
gemms = {tensor.blas.gemm_inplace: gpu_gemm_inplace, gemms = {
#tensor.blas.gemm_inplace: gpu_gemm_inplace,
tensor.blas.gemm_no_inplace: gpu_gemm_no_inplace} tensor.blas.gemm_no_inplace: gpu_gemm_no_inplace}
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -924,15 +967,32 @@ def local_inplace_gemm(node): ...@@ -924,15 +967,32 @@ def local_inplace_gemm(node):
if node.op == gpu_gemm_no_inplace: if node.op == gpu_gemm_no_inplace:
return [gpu_gemm_inplace(*node.inputs)] return [gpu_gemm_inplace(*node.inputs)]
@local_optimizer([gpu_gemv_no_inplace])
def local_inplace_gemv(node):
if node.op == gpu_gemv_no_inplace:
return [gpu_gemv_inplace(*node.inputs)]
@local_optimizer([gpu_gemm_no_inplace])
def local_inplace_ger(node):
if node.op == gpu_ger_no_inplace:
return [gpu_ger_inplace(*node.inputs)]
# After destroyhandler is in but before we try to make elemwise things inplace # After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gpu gemm inplace # Try to make gpu gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of # Also, need to make the gemm optimisation(step 70) happen before the fusion of
# elemwise(step 71) # elemwise(step 71)
optdb.register('InplaceGpuBlasOpt', optdb.register('InplaceGpuBlasOpt',
EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace, EquilibriumOptimizer([local_inplace_gemm,
local_inplace_gemv,
local_inplace_ger,
],
failure_callback=EquilibriumOptimizer.warn_inplace,
max_use_ratio=5), max_use_ratio=5),
70.0, 'fast_run', 'inplace', 'gpu') 70.0, 'fast_run', 'inplace', 'gpu')
def get_device_type_sizes(): def get_device_type_sizes():
""" """
:return:(gpu ptr size, cpu ptr size, int sizes(gpu and cpu)) :return:(gpu ptr size, cpu ptr size, int sizes(gpu and cpu))
......
from unittest import TestCase
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import tensor from theano import tensor
from theano.tests import unittest_tools from theano.tests import unittest_tools
...@@ -15,6 +17,9 @@ import theano.sandbox.cuda as tcn ...@@ -15,6 +17,9 @@ import theano.sandbox.cuda as tcn
from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad
import theano.compile.mode import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
if theano.config.mode=='FAST_COMPILE': if theano.config.mode=='FAST_COMPILE':
...@@ -243,3 +248,39 @@ def test_downsample(): ...@@ -243,3 +248,39 @@ def test_downsample():
#We already check that the gpu version return the same value as the gpu version #We already check that the gpu version return the same value as the gpu version
#for GpuDownsampleFactorMaxGrad. So no need to call verify_grad here. #for GpuDownsampleFactorMaxGrad. So no need to call verify_grad here.
class TestGpuGemv(TestCase, BaseGemv,
                  unittest_tools.TestOptimizationMixin):
    """Run the shared BaseGemv test battery against the GPU gemv Op."""
    mode = mode_with_gpu
    dtype = 'float32'
    # As all inputs are transferred to the gpu, this allows making all
    # the gemv calls inplace.
    gemv = gpu_gemv_inplace
    gemv_inplace = gpu_gemv_inplace
class TestGpuGer(TestGer):
    """Run the shared TestGer battery against the GPU ger Op."""
    def setUp(self):
        self.mode = mode_with_gpu
        dtype = self.dtype = 'float32'  # optimization isn't dtype-dependent
        # Symbolic inputs: matrix A, scalar a, vectors x and y.
        self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
        self.a = tensor.tensor(dtype=dtype, broadcastable=())
        self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
        self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
        self.ger = gpu_ger_no_inplace
        self.ger_destructive = gpu_ger_inplace
        self.gemm = tcn.blas.gpu_gemm_no_inplace
        # data on the gpu make the op always inplace.
        # NOTE: these two assignments deliberately override the
        # no_inplace values set just above, because on the GPU the
        # expected graphs contain the inplace variants.
        self.ger = gpu_ger_inplace
        self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
    """Check both GpuGer variants against the generic Op contract."""
    def setUp(self):
        # Ops whose __eq__/__hash__/__str__ behavior is verified.
        self.ops = [gpu_ger_no_inplace, gpu_ger_inplace]

    def clone(self, op):
        # Rebuild an equivalent Op from its inplace flag.
        return tcn.blas.GpuGer(op.inplace)
...@@ -10,6 +10,7 @@ import opt ...@@ -10,6 +10,7 @@ import opt
import opt_uncanonicalize import opt_uncanonicalize
import blas import blas
import blas_scipy import blas_scipy
import blas_c
import xlogx import xlogx
import raw_random import raw_random
......
...@@ -2540,6 +2540,41 @@ class Alloc(gof.Op): ...@@ -2540,6 +2540,41 @@ class Alloc(gof.Op):
#reuse the allocated memory. #reuse the allocated memory.
out[0][...] = v # broadcast v to fill us up out[0][...] = v # broadcast v to fill us up
def c_code(self, node, name, inp, out, sub):
    """Generate C code for Alloc.

    Fast path only for the common case of filling a 1-d output from a
    0-d scalar; every other case falls back to the parent class's
    generic implementation.
    """
    # TODO: use the elemwise code generator here
    if node.inputs[0].ndim == 0:
        # filling with a scalar is a common use of alloc
        # that we can implement relatively easily
        vv = inp[0]
        zz, = out
        fail = sub['fail']
        if node.outputs[0].ndim == 1:
            N0 = inp[1]
            # NOTE(review): the C-level assert below assumes any reused
            # output buffer has unit element stride; confirm callers
            # guarantee contiguity, since the check vanishes under
            # NDEBUG builds.
            return """
            npy_intp N0 = ((dtype_%(N0)s*)%(N0)s->data)[0];
            dtype_%(vv)s vv;
            dtype_%(zz)s* zz;
            if ((NULL == %(zz)s) || (%(zz)s->dimensions[0] != N0))
            {
                if (%(zz)s) Py_XDECREF(%(zz)s);
                %(zz)s = (PyArrayObject*)PyArray_SimpleNew(1,
                    &N0, type_num_%(vv)s);
                if(!%(zz)s) {
                    PyErr_SetString(PyExc_MemoryError, "alloc failed");
                    %(fail)s
                }
            }
            vv = ((dtype_%(vv)s*)%(vv)s->data)[0];
            zz = ((dtype_%(zz)s*)%(zz)s->data);
            assert (%(zz)s->strides[0] == sizeof(dtype_%(zz)s));
            for (int i = 0; i < N0; ++i)
            {
                zz[i] = vv;
            }
            """ % locals()
    # else pretend this never happened
    return super(Alloc, self).c_code(node, name, inp, out, sub)
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
return [node.inputs[1:]] return [node.inputs[1:]]
......
...@@ -6,18 +6,26 @@ Learn more about BLAS here: ...@@ -6,18 +6,26 @@ Learn more about BLAS here:
The standard BLAS libraries implement what is called "legacy BLAS" in that The standard BLAS libraries implement what is called "legacy BLAS" in that
document. document.
This documentation section describes Theano's BLAS optimization This documentation describes Theano's BLAS optimization pipeline.
pipeline.
Where there is a discrepancy between how things do work and how they *should* Where there is a discrepancy between how things do work and how they *should*
work, both aspects should be documented. It helps keep a broader agenda in view work, both aspects should be documented.
even while fixing little bugs etc. from day to day.
There are four kinds of BLAS Ops in Theano:
- Python implementations (this file)
- SciPy-based (blas_scipy)
- C-based (blas_c)
- CUDA-based (theano.sandbox.cuda.blas)
:note: Unfortunately (because it's confusing) this file currently contains Ops
that contain both Python and C versions. I think it would be better to
move the C implementations to blas_c so that this file is pure Python.
-JB
Ops Ops
=== ===
There are two BLAS calls wrapped in this module: GEMM and GEMV.
GEMM: Dot22, Dot22Scalar, GemmRelated, Gemm GEMM: Dot22, Dot22Scalar, GemmRelated, Gemm
------------------------------------------- -------------------------------------------
...@@ -43,18 +51,19 @@ GEMV: Gemv ...@@ -43,18 +51,19 @@ GEMV: Gemv
---------- ----------
The BLAS GEMV operation implements Z <- a X Y + b Z, The BLAS GEMV operation implements Z <- a X Y + b Z,
where Z is a matrix, Y, and Z are vectors, and a and b are scalars. where X is a matrix, Y, and Z are vectors, and a and b are scalars.
Gemv implements the GEMV call in all its generality. GER: Ger
--------
The BLAS GER operation implements Z <- a X' Y + Z,
where X and Y are vectors, and matrix Z gets a rank-1 update.
Other Notable BLAS-related Ops Other Notable BLAS-related Ops
------------------------------ ------------------------------
GpuOuter is currently a wrapper around GER. GER is a useful special case of
GEMM, and in the future it would be good to have a GER Op. With a GER Op here,
the GpuOuter could be turned into a GpuGER.
SYRK is another useful special case of GEMM. Particularly SYRK preserves SYRK is another useful special case of GEMM. Particularly SYRK preserves
symmetry in the matrix that it updates. See how the linear-algebra module uses symmetry in the matrix that it updates. See how the linear-algebra module uses
symmetry hints before implementing this Op, so that this Op is compatible with symmetry hints before implementing this Op, so that this Op is compatible with
...@@ -64,14 +73,19 @@ that system. ...@@ -64,14 +73,19 @@ that system.
Optimizations Optimizations
============= =============
The current optimization pipeline is not exactly clear to me. Instead I will The optimization pipeline works something like this:
describe how it should work.
The high level pipeline is:
1. identify dot22 from dot 1. identify dot22 from dot
2. identify gemm from dot22 2. identify gemm from dot22
3. identify dot22scalar from dot22 that are not gemm 3. identify dot22scalar from dot22 that are not gemm
4. specialize gemm to gemv where applicable 4. specialize gemm to gemv where applicable
5. specialize gemm to ger where applicable
6. specialize dot22 -> gemv or ger where applicable
:note: GEMM is the most canonical BLAS signature that we deal with so far, it
would be good to turn most things into GEMM (dot, inner, outer, dot22,
dot22scalar), and then to specialize from gemm to the various other L2 and
L3 operations.
Identify Dot22 Identify Dot22
-------------- --------------
...@@ -161,9 +175,9 @@ class Gemv(Op): ...@@ -161,9 +175,9 @@ class Gemv(Op):
def __str__(self): def __str__(self):
if self.inplace: if self.inplace:
return 'Gemv{inplace}' return '%s{inplace}' % self.__class__.__name__
else: else:
return 'Gemv{no_inplace}' return '%s{no_inplace}' % self.__class__.__name__
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
...@@ -268,31 +282,19 @@ class Ger(Op): ...@@ -268,31 +282,19 @@ class Ger(Op):
raise TypeError('only float and complex types supported', x.dtype) raise TypeError('only float and complex types supported', x.dtype)
return Apply(self, [A, alpha, x, y], [A.type()]) return Apply(self, [A, alpha, x, y], [A.type()])
def make_thunk(self, node, storage_map, compute_map, no_recycling): def perform(self, node, inp, out):
node_input_storage = [storage_map[r] for r in node.inputs] cA, calpha, cx, cy = inp
node_output_storage = [storage_map[r] for r in node.outputs] cZ, = out
if self.destructive:
# get vars for containers A = cA
cA, calpha, cx, cy = node_input_storage else:
cZ, = node_output_storage A = cA.copy()
if calpha != 1:
A += calpha * numpy.outer(cx, cy)
else:
A += numpy.outer(cx, cy)
cZ[0] = A
def rval():
if self.destructive:
A = cA[0]
else:
A = cA[0].copy()
if calpha[0] != 1:
A += calpha[0] * numpy.outer(cx[0], cy[0])
else:
A += numpy.outer(cx[0], cy[0])
cZ[0] = A
#TODO: If this is currently an unofficial part of the thunk API,
# then maybe it should be documented and made official?
rval.inputs = node_input_storage
rval.outputs = node_output_storage
rval.lazy = False
return rval
ger = Ger(destructive=False) ger = Ger(destructive=False)
ger_destructive = Ger(destructive=True) ger_destructive = Ger(destructive=True)
...@@ -1148,7 +1150,7 @@ def _gemm_from_factored_list(lst): ...@@ -1148,7 +1150,7 @@ def _gemm_from_factored_list(lst):
# Try every pair in the sM_list, trying to turn it into a gemm operation # Try every pair in the sM_list, trying to turn it into a gemm operation
for i in xrange(len(lst) - 1): for i in xrange(len(lst) - 1):
s_i,M_i = lst[i] s_i, M_i = lst[i]
for j in xrange(i+1, len(lst)): for j in xrange(i+1, len(lst)):
s_j, M_j = lst[j] s_j, M_j = lst[j]
...@@ -1400,9 +1402,7 @@ def local_gemm_to_ger(node): ...@@ -1400,9 +1402,7 @@ def local_gemm_to_ger(node):
rval = ger(z, a, xv, yv) rval = ger(z, a, xv, yv)
return [rval] return [rval]
elif bval == 0: # GER on zeros_like should be faster than GEMM elif bval == 0: # GER on zeros_like should be faster than GEMM
zeros = T.alloc( zeros = T.zeros([x.shape[0], y.shape[1]], x.dtype)
numpy.asarray(0, dtype=x.dtype),
x.shape[0], y.shape[1])
rval = ger(zeros, a, xv, yv) rval = ger(zeros, a, xv, yv)
return [rval] return [rval]
else: else:
...@@ -1414,20 +1414,43 @@ def local_gemm_to_ger(node): ...@@ -1414,20 +1414,43 @@ def local_gemm_to_ger(node):
#TODO: delete this optimization when we have the proper dot->gemm->ger pipeline #TODO: delete this optimization when we have the proper dot->gemm->ger pipeline
# working # working
@local_optimizer([_dot22]) @local_optimizer([_dot22])
def local_dot22_to_ger(node): def local_dot22_to_ger_or_gemv(node):
"""GEMM computing an outer-product -> GER """dot22 computing an outer-product -> GER
""" """
if node.op == _dot22: if node.op == _dot22:
x, y = node.inputs x, y = node.inputs
if x.broadcastable[1] and y.broadcastable[0]: xb = x.broadcastable
yb = y.broadcastable
one = T.as_tensor_variable(numpy.asarray(1, dtype=x.dtype))
zero = T.as_tensor_variable(numpy.asarray(0, dtype=x.dtype))
if xb[1] and yb[0]:
# x and y are both vectors so this might qualifies for a GER # x and y are both vectors so this might qualifies for a GER
xv = x.dimshuffle(0) xv = x.dimshuffle(0)
yv = y.dimshuffle(1) yv = y.dimshuffle(1)
one = T.as_tensor_variable(numpy.asarray(1, dtype=x.dtype)) zeros = T.zeros([x.shape[0], y.shape[1]], dtype=x.dtype)
zeros = T.alloc(numpy.asarray(0, dtype=x.dtype), x.shape[0], y.shape[1])
rval = ger(zeros, one, xv, yv) rval = ger(zeros, one, xv, yv)
return [rval] return [rval]
if xb[0] and yb[1]:
# x and y are both vectors so this qualifies for a sdot / ddot
# TODO: Theano doesn't have a sdot, but gemv is better than _dot22
xv = x.dimshuffle(1)
zeros = T.zeros([1], x.dtype)
rval = gemv_no_inplace(zeros, one, y.T, xv, one)
return [rval.dimshuffle('x', 0)]
if xb[0] and not yb[0] and not yb[1]:
# x is vector, y is matrix so try gemv
xv = x.dimshuffle(1)
zeros = T.zeros([y.shape[1]], x.dtype)
rval = gemv_no_inplace(zeros, one, y.T, xv, one)
return [rval.dimshuffle('x', 0)]
if not xb[0] and not xb[1] and yb[1]:
# x is matrix, y is vector, try gemv
yv = y.dimshuffle(0)
zeros = T.zeros([x.shape[0]], dtype=x.dtype)
rval = gemv_no_inplace(zeros, one, x, yv, one)
return [rval.dimshuffle(0, 'x')]
################################# #################################
# #
...@@ -1445,14 +1468,14 @@ optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run') ...@@ -1445,14 +1468,14 @@ optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run')
blas_optdb.register('local_dot_to_dot22', blas_optdb.register('local_dot_to_dot22',
EquilibriumOptimizer([local_dot_to_dot22], max_use_ratio=5), EquilibriumOptimizer([local_dot_to_dot22], max_use_ratio=5),
0, 'fast_run') 0, 'fast_run')
blas_optdb.register('local_dot_to_gemm', blas_optdb.register('gemm_optimizer',
GemmOptimizer(), GemmOptimizer(),
10, 'fast_run') 10, 'fast_run')
blas_optdb.register('local_gemm_to_gemv', blas_optdb.register('local_gemm_to_gemv',
EquilibriumOptimizer([ EquilibriumOptimizer([
local_gemm_to_gemv, local_gemm_to_gemv,
local_gemm_to_ger, local_gemm_to_ger,
local_dot22_to_ger, local_dot22_to_ger_or_gemv,
local_dimshuffle_lift], local_dimshuffle_lift],
max_use_ratio=5), max_use_ratio=5),
15, 'fast_run') 15, 'fast_run')
......
Diff collapsed.
...@@ -23,10 +23,9 @@ try: ...@@ -23,10 +23,9 @@ try:
numpy.dtype('complex64'):scipy.linalg.blas.fblas.cgeru, numpy.dtype('complex64'):scipy.linalg.blas.fblas.cgeru,
numpy.dtype('complex128'):scipy.linalg.blas.fblas.zgeru, numpy.dtype('complex128'):scipy.linalg.blas.fblas.zgeru,
} }
optimizations_enabled = True
except ImportError, e: except ImportError, e:
have_fblas = False have_fblas = False
optimizations_enabled = False
class ScipyGer(Ger): class ScipyGer(Ger):
...@@ -62,13 +61,11 @@ class ScipyGer(Ger): ...@@ -62,13 +61,11 @@ class ScipyGer(Ger):
@local_optimizer([ger, ger_destructive]) @local_optimizer([ger, ger_destructive])
def use_scipy_ger(node): def use_scipy_ger(node):
if not optimizations_enabled: return
if node.op == ger: if node.op == ger:
return [ScipyGer(False)(*node.inputs)] return [ScipyGer(False)(*node.inputs)]
@local_optimizer([ScipyGer(False)]) @local_optimizer([ScipyGer(False)])
def make_ger_destructive(node): def make_ger_destructive(node):
if not optimizations_enabled: return
if node.op == ScipyGer(False): if node.op == ScipyGer(False):
return [ScipyGer(True)(*node.inputs)] return [ScipyGer(True)(*node.inputs)]
......
import sys
import numpy
import theano
import theano.tensor as tensor
from theano.tensor.blas_c import CGer
from theano.tensor.blas_scipy import ScipyGer
from theano.tensor.blas import Ger
from theano.tensor.blas_c import CGemv
from theano.tensor.blas_scipy import ScipyGer
from theano.tensor.blas import Gemv
from theano.tests import unittest_tools
from theano.tests.unittest_tools import TestOptimizationMixin
from test_blas import TestCase
from test_blas import BaseGemv
# Compilation mode with every relevant BLAS rewrite enabled: the generic
# BlasOpt pipeline, specialization, the in-place BLAS substitutions, and the
# C-implemented ops (c_blas) that these tests are meant to exercise.
mode_blas_opt = theano.compile.get_default_mode().including(
    'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas')
class TestCGer(TestCase, TestOptimizationMixin):
    """Tests for the C-implementation of Ger (rank-1 update).

    Covers the __eq__/__hash__ contract of CGer and that the optimizer
    rewrites outer-product graphs into CGer nodes.
    """

    def setUp(self, dtype='float64'):
        self.dtype = dtype
        self.mode = theano.compile.get_default_mode().including('fast_run')
        # Symbolic inputs: matrix A, scalar a, vectors x and y.
        self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
        self.a = tensor.tensor(dtype=dtype, broadcastable=())
        self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
        self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
        # Numeric values whose shapes match the symbolic variables above.
        self.Aval = numpy.ones((2, 3), dtype=dtype)
        self.xval = numpy.asarray([1, 2], dtype=dtype)
        self.yval = numpy.asarray([1.5, 2.7, 3.9], dtype=dtype)

    def function(self, inputs, outputs):
        # Compile with this test's mode so the BLAS rewrites run.
        return theano.function(inputs, outputs, mode=self.mode)

    def run_f(self, f):
        return f(self.Aval, self.xval, self.yval)

    def b(self, bval):
        # Wrap a python scalar as a constant of this test's dtype.
        return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype))

    def test_eq(self):
        # Equality depends on the destructive flag and on the exact class.
        for destructive in (True, False):
            self.assert_(CGer(destructive) == CGer(destructive))
            self.assert_(CGer(destructive) != ScipyGer(destructive))
            self.assert_(CGer(destructive) != Ger(destructive))
            # __eq__ must also behave for non-CGer operands
            self.assert_(CGer(destructive) != None)
        self.assert_(CGer(False) != CGer(True))

    def test_hash(self):
        # Hash agrees with equality: same flag hashes alike, flags differ.
        for destructive in (True, False):
            self.assert_(hash(CGer(destructive)) == hash(CGer(destructive)))
        self.assert_(hash(CGer(False)) != hash(CGer(True)))

    def test_optimization_pipeline(self):
        outer = tensor.outer(self.x, self.y)
        f = self.function([self.x, self.y], outer)
        self.assertFunctionContains(f, CGer(destructive=True))
        f(self.xval, self.yval)  # DebugMode tests correctness

    def test_optimization_pipeline_float(self):
        # Same pipeline check, but with float32 inputs.
        self.setUp('float32')
        outer = tensor.outer(self.x, self.y)
        f = self.function([self.x, self.y], outer)
        self.assertFunctionContains(f, CGer(destructive=True))
        f(self.xval, self.yval)  # DebugMode tests correctness

    def test_int_fails(self):
        # CGer only supports float dtypes; int graphs must not be rewritten.
        self.setUp('int32')
        f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
        self.assertFunctionContains0(f, CGer(destructive=True))
        self.assertFunctionContains0(f, CGer(destructive=False))

    def test_A_plus_outer(self):
        expr = self.A + tensor.outer(self.x, self.y)
        f = self.function([self.A, self.x, self.y], expr)
        # A is an input here, so the rewrite must not destroy it.
        self.assertFunctionContains(f, CGer(destructive=False))
        self.run_f(f)  # DebugMode tests correctness

    def test_A_plus_scaled_outer(self):
        expr = self.A + 0.1 * tensor.outer(self.x, self.y)
        f = self.function([self.A, self.x, self.y], expr)
        self.assertFunctionContains(f, CGer(destructive=False))
        self.run_f(f)  # DebugMode tests correctness
class TestCGemv(TestCase, TestOptimizationMixin):
    """Tests of CGemv specifically.

    Generic tests of Gemv-compatibility, covering both dtypes, are done
    below in TestCGemvFloat32 and TestCGemvFloat64.
    """

    def setUp(self, dtype='float64'):
        self.dtype = dtype
        self.mode = theano.compile.get_default_mode().including('fast_run')
        # matrix and its value
        self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
        self.Aval = numpy.ones((2, 3), dtype=dtype)
        # vectors and their values
        self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
        self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
        self.xval = numpy.asarray([1, 2], dtype=dtype)
        self.yval = numpy.asarray([1.5, 2.7, 3.9], dtype=dtype)
        # scalar
        self.a = tensor.tensor(dtype=dtype, broadcastable=())

    def test_optimizations_vm(self):
        """Test vector dot matrix."""
        f = theano.function([self.x, self.A],
                            theano.dot(self.x, self.A),
                            mode=self.mode)
        # The dot must have been rewritten into exactly one in-place CGemv.
        self.assertFunctionContains0(f, tensor.dot)
        self.assertFunctionContains1(f, CGemv(True))
        # The rewritten graph must still compute the same values as numpy.
        got = f(self.xval, self.Aval)
        assert numpy.allclose(got, numpy.dot(self.xval, self.Aval))

    def test_optimizations_mv(self):
        """Test matrix dot vector."""
        f = theano.function([self.A, self.y],
                            theano.dot(self.A, self.y),
                            mode=self.mode)
        # The dot must have been rewritten into exactly one in-place CGemv.
        self.assertFunctionContains0(f, tensor.dot)
        self.assertFunctionContains1(f, CGemv(True))
        # The rewritten graph must still compute the same values as numpy.
        got = f(self.Aval, self.yval)
        assert numpy.allclose(got, numpy.dot(self.Aval, self.yval))

    def t_gemv1(self, m_shp):
        """Check vector2 + dot(matrix, vector1), plain and in-place."""
        rng = numpy.random.RandomState(unittest_tools.fetch_seed())
        # NOTE: keep the rng.uniform call order (v1, v2, m) stable so the
        # generated values are reproducible across refactorings.
        v1 = theano.shared(numpy.array(rng.uniform(size=(m_shp[1],)),
                                       dtype='float32'))
        v2_orig = numpy.array(rng.uniform(size=(m_shp[0],)), dtype='float32')
        v2 = theano.shared(v2_orig)
        m = theano.shared(numpy.array(rng.uniform(size=m_shp),
                                      dtype='float32'))

        expected = numpy.dot(m.get_value(), v1.get_value()) + v2_orig

        # Non-inplace form: a fresh output, computed by CGemv(inplace=False).
        f = theano.function([], v2 + tensor.dot(m, v1), mode=self.mode)
        assert numpy.allclose(f(), expected)
        topo = [n.op for n in f.maker.env.toposort()]
        assert topo == [CGemv(inplace=False)], topo

        # In-place form: updating v2 must use the destructive CGemv.
        g = theano.function([], [],
                            updates={v2: v2 + theano.dot(m, v1)},
                            mode=self.mode)
        g()
        assert numpy.allclose(v2.get_value(), expected)
        topo = [n.op for n in g.maker.env.toposort()]
        assert topo == [CGemv(inplace=True)]

    def test_gemv1(self):
        # Include empty-dimension corner cases.
        for shp in [(3, 2), (0, 2), (3, 0), (0, 0)]:
            self.t_gemv1(shp)

    def test_gemv_dimensions(self, dtype='float32'):
        """Mismatched vector lengths must raise ValueError at call time."""
        alpha = theano.shared(theano._asarray(1.0, dtype=dtype),
                              name='alpha')
        beta = theano.shared(theano._asarray(1.0, dtype=dtype),
                             name='beta')
        z = beta * self.y + alpha * tensor.dot(self.A, self.x)
        f = theano.function([self.A, self.x, self.y], z, mode=self.mode)

        A_val = numpy.ones((5, 3), dtype=dtype)
        ones = dict((n, numpy.ones(n, dtype=dtype)) for n in (3, 4, 5, 6))
        # Consistent lengths: x of 3 (A columns), y of 5 (A rows) — works.
        f(A_val, ones[3], ones[5])
        # Any length mismatch must be rejected.
        self.assertRaises(ValueError, f, A_val, ones[4], ones[5])
        self.assertRaises(ValueError, f, A_val, ones[3], ones[6])
        self.assertRaises(ValueError, f, A_val, ones[4], ones[6])
class TestCGemvFloat32(TestCase, BaseGemv, TestOptimizationMixin):
    """Run the generic BaseGemv test battery against CGemv in float32."""
    mode = mode_blas_opt
    dtype = 'float32'
    gemv = CGemv(inplace=False)
    gemv_inplace = CGemv(inplace=True)
class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
    """Run the generic BaseGemv test battery against CGemv in float64."""
    mode = mode_blas_opt
    dtype = 'float64'
    gemv = CGemv(inplace=False)
    gemv_inplace = CGemv(inplace=True)
...@@ -4,12 +4,15 @@ import theano ...@@ -4,12 +4,15 @@ import theano
import theano.tensor as tensor import theano.tensor as tensor
from theano.tensor.blas_scipy import ScipyGer from theano.tensor.blas_scipy import ScipyGer
from test_blas import TestCase, TestOptimizationMixin, gemm_no_inplace from test_blas import TestCase, gemm_no_inplace
from theano.tests.unittest_tools import TestOptimizationMixin
class TestScipyGer(TestCase, TestOptimizationMixin): class TestScipyGer(TestCase, TestOptimizationMixin):
def setUp(self): def setUp(self):
self.mode = theano.compile.get_default_mode().including('fast_run') self.mode = theano.compile.get_default_mode()
self.mode = self.mode.including('fast_run')
self.mode = self.mode.excluding('c_blas') # c_blas trumps scipy Ops
dtype = self.dtype = 'float64' # optimization isn't dtype-dependent dtype = self.dtype = 'float64' # optimization isn't dtype-dependent
self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False)) self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
self.a = tensor.tensor(dtype=dtype, broadcastable=()) self.a = tensor.tensor(dtype=dtype, broadcastable=())
...@@ -18,9 +21,10 @@ class TestScipyGer(TestCase, TestOptimizationMixin): ...@@ -18,9 +21,10 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
self.Aval = numpy.ones((2,3), dtype=dtype) self.Aval = numpy.ones((2,3), dtype=dtype)
self.xval = numpy.asarray([1,2], dtype=dtype) self.xval = numpy.asarray([1,2], dtype=dtype)
self.yval = numpy.asarray([1.5,2.7,3.9], dtype=dtype) self.yval = numpy.asarray([1.5,2.7,3.9], dtype=dtype)
if not theano.tensor.blas_scipy.optimizations_enabled: if not theano.tensor.blas_scipy.have_fblas:
self.SkipTest() self.SkipTest()
def function(self, inputs, outputs): def function(self, inputs, outputs):
return theano.function(inputs, outputs, self.mode) return theano.function(inputs, outputs, self.mode)
......
from copy import copy, deepcopy
import sys import sys
import numpy import numpy
import theano.tensor as T import theano.tensor as T
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam
try:
from nose.plugins.skip import SkipTest
except ImportError:
class SkipTest(Exception):
"""
Skip this test
"""
AddConfigVar('unittests.rseed', AddConfigVar('unittests.rseed',
"Seed to use for randomized unit tests. Special value 'random' means using a seed of None.", "Seed to use for randomized unit tests. Special value 'random' means using a seed of None.",
StrParam(666), StrParam(666),
in_c_key=False) in_c_key=False)
def fetch_seed(pseed=None): def fetch_seed(pseed=None):
""" """
Returns the seed to use for running the unit tests. Returns the seed to use for running the unit tests.
...@@ -38,6 +48,7 @@ def fetch_seed(pseed=None): ...@@ -38,6 +48,7 @@ def fetch_seed(pseed=None):
return seed return seed
def seed_rng(pseed=None): def seed_rng(pseed=None):
""" """
Seeds numpy's random number generator with the value returned by fetch_seed. Seeds numpy's random number generator with the value returned by fetch_seed.
...@@ -51,6 +62,7 @@ def seed_rng(pseed=None): ...@@ -51,6 +62,7 @@ def seed_rng(pseed=None):
numpy.random.seed(seed) numpy.random.seed(seed)
return seed return seed
def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs): def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs):
""" """
Wrapper for tensor/basic.py:verify_grad Wrapper for tensor/basic.py:verify_grad
...@@ -72,3 +84,67 @@ def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs): ...@@ -72,3 +84,67 @@ def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs):
# raise # raise
# #
verify_grad.E_grad = T.verify_grad.E_grad verify_grad.E_grad = T.verify_grad.E_grad
class TestOptimizationMixin(object):
    """Mixin with assertions about which Ops appear in a compiled function.

    `f` is expected to be a compiled theano function exposing
    `f.maker.env.toposort()`.
    """

    # sys.maxint was Python-2-only (removed in Python 3); sys.maxsize exists
    # in 2.6+ and 3.x and serves the same "effectively unbounded" purpose.
    def assertFunctionContains(self, f, op, min=1, max=sys.maxsize):
        """Assert that `op` occurs between `min` and `max` times in `f`.

        `min`/`max` intentionally shadow the builtins so callers get a
        natural keyword interface (min=..., max=...).
        """
        toposort = f.maker.env.toposort()
        matches = [node for node in toposort if node.op == op]
        assert (min <= len(matches) <= max), (toposort, matches,
                                              str(op), min, max)

    def assertFunctionContains0(self, f, op):
        """Assert `op` does not occur in `f` at all."""
        return self.assertFunctionContains(f, op, min=0, max=0)

    def assertFunctionContains1(self, f, op):
        """Assert `op` occurs exactly once in `f`."""
        return self.assertFunctionContains(f, op, min=1, max=1)

    def assertFunctionContainsN(self, f, op, N):
        """Assert `op` occurs exactly `N` times in `f`."""
        return self.assertFunctionContains(f, op, min=N, max=N)

    def SkipTest(self, msg='Skip this test'):
        # Delegates to the module-level SkipTest exception (nose's when
        # available, the local fallback class otherwise).
        raise SkipTest(msg)
# This object name should not start with Test.
# Otherwise nosetests will execute it!
class T_OpContractMixin(object):
    """Mixin spelling out the Op contract: __eq__, __hash__ and __str__.

    Subclasses must set `self.ops` to a list of instantiations of an Op
    class to test, and override `clone`. `other_op` must be an op which is
    different from every op in `self.ops`.
    """
    other_op = T.add

    def copy(self, x):
        return copy(x)

    def deepcopy(self, x):
        return deepcopy(x)

    def clone(self, op):
        raise NotImplementedError('return new instance like `op`')

    def test_eq(self):
        # Each op must equal itself and its copies, and differ from every
        # other op in the list and from other_op.
        for i, op_i in enumerate(self.ops):
            assert op_i == op_i
            assert op_i == self.copy(op_i)
            assert op_i == self.deepcopy(op_i)
            assert op_i == self.clone(op_i)
            assert op_i != self.other_op
            for j, op_j in enumerate(self.ops):
                if i == j:
                    continue
                assert op_i != op_j

    def test_hash(self):
        # Hash must be consistent with equality: stable across copies,
        # different for distinct ops.
        for i, op_i in enumerate(self.ops):
            h_i = hash(op_i)
            assert h_i == hash(op_i)
            assert h_i == hash(self.copy(op_i))
            assert h_i == hash(self.deepcopy(op_i))
            assert h_i == hash(self.clone(op_i))
            assert h_i != hash(self.other_op)
            for j, op_j in enumerate(self.ops):
                if i == j:
                    continue
                # BUG FIX: this previously asserted `op_i != hash(op_j)`,
                # comparing an Op instance to an int — vacuously true.
                # The intent (see the h_i checks above) is that distinct
                # ops hash differently.
                assert h_i != hash(op_j)

    def test_name(self):
        for op in self.ops:
            s = str(op)  # show that str works
            assert s  # names should not be empty
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论