Make dot, dot22, dot22scalar, gemm on GPU work with dimensions of 0.

e98da52c · Frederic Bastien · c6fce90d · e98da52c · e98da52c
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -2720,10 +2720,10 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    if (B->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
    if (C->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
+    // Zero-sized dimensions are allowed; only mismatched shapes are an error.
    if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
            || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])
-            || (CudaNdarray_HOST_DIMS(B)[1] != CudaNdarray_HOST_DIMS(C)[1])
+            || (CudaNdarray_HOST_DIMS(B)[1] != CudaNdarray_HOST_DIMS(C)[1]))
-            || (CudaNdarray_HOST_DIMS(A)[1] == 0))
    {
        PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
                CudaNdarray_HOST_DIMS(A)[0],
@@ -2814,6 +2814,9 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    //TODO: recognize the negative stride and make a copy of the offending argument,
    //rather than aborting
 #define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
+    if (sx == 0){sx = 1;}\
+    if (sy == 0){sy = 1;}\
+    if (sz == 0){sz = 1;}\
    if ((sx > 0) && (sy > 0) && (sz > 0)) { \
        cublasSgemm(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \
    } else { \

--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
@@ -28,94 +28,111 @@ def my_rand(*shape):
    return theano._asarray(numpy.random.rand(*shape),dtype='float32')
 def test_dot22():
+    def cmp(a_shp, b_shp):
+        a = tcn.shared_constructor(my_rand(*a_shp), 'a')
-    a = tcn.shared_constructor(my_rand(4,4), 'a')
+        b = tensor.fmatrix()
-    b = tensor.fmatrix()
+        f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu)
-    f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu)
+        a0 = a.get_value() * 1.0
+        bval = my_rand(*b_shp)
+        f(bval)
-    a0 = a.get_value() * 1.0
+        assert numpy.allclose(numpy.dot(a0, bval), a.get_value())
-    print a0
-    for i, node in enumerate(f.maker.env.toposort()):
-        print i, node
-    bval = my_rand(4,4)
-    f(bval)
-    print a.get_value()
-    assert numpy.allclose(numpy.dot(a0, bval), a.get_value())
+    cmp((3,4),(4,5))
+    cmp((0,4),(4,5))
+    cmp((3,4),(4,0))
+    cmp((3,0),(0,5))
+    cmp((0,4),(4,0))
+    cmp((0,0),(0,0))
 def test_dot22scalar():
-    a = tensor.fmatrix()
+    def cmp(a_shp, b_shp):
-    b = tensor.fmatrix()
+        a = tensor.fmatrix()
-    scalar = tensor.fscalar()
+        b = tensor.fmatrix()
-    av = my_rand(4,4)
+        scalar = tensor.fscalar()
-    bv = my_rand(4,4)
+        av = my_rand(*a_shp)
+        bv = my_rand(*b_shp)
-    f = theano.function([a,b], tensor.dot(a,b)*numpy.asarray(4, 'float32'), mode=mode_with_gpu)
-    f2 = theano.function([a,b], tensor.dot(a,b)*numpy.asarray(4, 'float32'))
+        f = theano.function([a,b], tensor.dot(a,b)*numpy.asarray(4, 'float32'), mode=mode_with_gpu)
-    t=f.maker.env.toposort()
+        f2 = theano.function([a,b], tensor.dot(a,b)*numpy.asarray(4, 'float32'))
-    assert len(t)==4
+        t=f.maker.env.toposort()
-    assert isinstance(t[0].op,tcn.GpuFromHost)
+        assert len(t)==4
-    assert isinstance(t[1].op,tcn.GpuFromHost)
+        assert isinstance(t[0].op,tcn.GpuFromHost)
-    assert isinstance(t[2].op,tcn.blas.GpuDot22Scalar)
+        assert isinstance(t[1].op,tcn.GpuFromHost)
-    assert isinstance(t[3].op,tcn.HostFromGpu)
+        assert isinstance(t[2].op,tcn.blas.GpuDot22Scalar)
-    assert numpy.allclose(f(av,bv),f2(av,bv))
+        assert isinstance(t[3].op,tcn.HostFromGpu)
+        assert numpy.allclose(f(av,bv),f2(av,bv))
-    f = theano.function([a,b,scalar], tensor.dot(a,b)*scalar, mode=mode_with_gpu)
-    f2 = theano.function([a,b,scalar], tensor.dot(a,b)*scalar)
+        f = theano.function([a,b,scalar], tensor.dot(a,b)*scalar, mode=mode_with_gpu)
-    t=f.maker.env.toposort()
+        f2 = theano.function([a,b,scalar], tensor.dot(a,b)*scalar)
-    assert len(t)==4
+        t=f.maker.env.toposort()
-    assert isinstance(t[0].op,tcn.GpuFromHost)
+        assert len(t)==4
-    assert isinstance(t[1].op,tcn.GpuFromHost)
+        assert isinstance(t[0].op,tcn.GpuFromHost)
-    assert isinstance(t[2].op,tcn.blas.GpuDot22Scalar)
+        assert isinstance(t[1].op,tcn.GpuFromHost)
-    assert isinstance(t[3].op,tcn.HostFromGpu)
+        assert isinstance(t[2].op,tcn.blas.GpuDot22Scalar)
-    assert numpy.allclose(f(av,bv,0.5),f2(av,bv,0.5))
+        assert isinstance(t[3].op,tcn.HostFromGpu)
+        assert numpy.allclose(f(av,bv,0.5),f2(av,bv,0.5))
+    cmp((3,4),(4,5))
+    cmp((0,4),(4,5))
+    cmp((3,4),(4,0))
+    cmp((3,0),(0,5))
+    cmp((0,4),(4,0))
+    cmp((0,0),(0,0))
 def test_gemm():
+    def cmp(a_shp, b_shp):
+        a = tcn.shared_constructor(my_rand(*a_shp), 'a')
-    a = tcn.shared_constructor(my_rand(4,4), 'a')
+        b = tensor.fmatrix('b')
+        c = tensor.fmatrix('c')
-    b = tensor.fmatrix('b')
+        f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu)
-    c = tensor.fmatrix('c')
+        assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()])
-    f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu)
+        a0 = a.get_value() * 1.0
-    assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()])
+        bval = my_rand(*b_shp)
+        cval = my_rand(a_shp[0],b_shp[1])
+        f(bval,cval)
-    a0 = a.get_value() * 1.0
+        assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.get_value())
-    print a0
+    cmp((3,4),(4,5))
-    for i, node in enumerate(f.maker.env.toposort()):
+    cmp((0,4),(4,5))
-        print i, node
+    cmp((3,4),(4,0))
-    bval = my_rand(4,4)
+    cmp((3,0),(0,5))
-    cval = my_rand(4,4)
+    cmp((0,4),(4,0))
-    f(bval,cval)
+    cmp((0,0),(0,0))
-    print a.get_value()
-    assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.get_value())
 def test_gemm_no_inplace():
-    a = tcn.shared_constructor(my_rand(4,4), 'a')
+    def cmp(a_shp, b_shp):
-    cval = my_rand(4,4)
+        a = tcn.shared_constructor(my_rand(*a_shp), 'a')
-    c = tcn.shared_constructor(cval.copy(), 'c')
+        cval = my_rand(a_shp[0], b_shp[1])
+        c = tcn.shared_constructor(cval.copy(), 'c')
+        b = tcn.fmatrix('b')
+        b2 = tcn.fmatrix('b2')
-    b = tcn.fmatrix('b')
+        f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu)
-    b2 = tcn.fmatrix('b2')
-    f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu)
+        a0 = a.get_value() * 1.0
+        assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
+        bval = my_rand(*b_shp)
+        bval2 = my_rand(*b_shp)
+        rval = f(bval,bval2)
-    a0 = a.get_value() * 1.0
+        assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
-    #print a0
+        assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
-    for i, node in enumerate(f.maker.env.toposort()):
-        print i, node
-    assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
-    bval = my_rand(4,4)
-    bval2 = my_rand(4,4)
-    rval = f(bval,bval2)
-    #print a.get_value()
-    assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
+    cmp((3,4),(4,5))
-    assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
+    cmp((0,4),(4,5))
+    cmp((3,4),(4,0))
+    cmp((3,0),(0,5))
+    cmp((0,4),(4,0))
+    cmp((0,0),(0,0))
 def test_outer():
    x = tcn.shared_constructor(my_rand(8,), 'x')