Commit 0f60bf1a authored by James Bergstra

gemm and dot added

Parent commit: 31afc498
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar
import StringIO
class GpuDot22(Op):
    """Matrix-matrix product of two 2-d CudaNdarrays.

    The C implementation delegates to ``CudaNdarray_gemm(1.0, x, y, 0.0, z)``,
    allocating (or re-allocating) the output ``z`` whenever it is missing or
    has the wrong dimensions.
    """
    def __eq__(self, other):
        # All instances are interchangeable: equality is by type only.
        return type(self) == type(other)

    def __hash__(self):
        # Consistent with __eq__: hash by type only.
        return hash(type(self))

    def make_node(self, x, y):
        """Return an Apply for x @ y; both inputs must be rank-2.

        :raises TypeError: if either input is not 2-dimensional.
        """
        if x.type.ndim != 2:
            raise TypeError(x)
        if y.type.ndim != 2:
            raise TypeError(y)
        # Output has the same type as x (shape checking happens at runtime
        # in the C code below).
        return Apply(self, [x, y], [x.type()])

    def c_code_cache_version(self):
        # Empty tuple: no versioning yet, code is recompiled as needed.
        return ()

    def c_code(self, node, nodename, inputs, outputs, sub):
        x, y = inputs
        z, = outputs
        fail = sub['fail']
        return """
        if (cnda_%(x)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "rank(x)==%%i must be 2", cnda_%(x)s->nd);
            %(fail)s;
        }
        if (cnda_%(y)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "rank(y)==%%i must be 2", cnda_%(y)s->nd);
            %(fail)s;
        }
        if ((NULL == cnda_%(z)s)
            || (cnda_%(z)s->dim[0] != cnda_%(x)s->dim[0])
            || (cnda_%(z)s->dim[1] != cnda_%(y)s->dim[1]))
        {
            /* Output missing or mis-shaped: (re)allocate a contiguous
             * (x.rows, y.cols) CudaNdarray. */
            if (cnda_%(z)s) Py_DECREF(cnda_%(z)s);
            npy_intp dims[2];
            dims[0] = cnda_%(x)s->dim[0];
            dims[1] = cnda_%(y)s->dim[1];
            cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null();
            if ((NULL == cnda_%(z)s) || CudaNdarray_alloc_contiguous(cnda_%(z)s, 2, dims))
            {
                if (cnda_%(z)s)
                {
                    Py_DECREF(cnda_%(z)s);
                    cnda_%(z)s = NULL;
                }
                %(fail)s;
            }
        }
        /* z = 1.0 * x . y + 0.0 * z */
        if (CudaNdarray_gemm(1.0f, cnda_%(x)s, cnda_%(y)s, 0.0f, cnda_%(z)s))
        {
            if (cnda_%(z)s)
            {
                Py_DECREF(cnda_%(z)s);
                cnda_%(z)s = NULL;
            }
            %(fail)s;
        }
        """ % locals()

gpu_dot22 = GpuDot22()
class GpuGemm(Op):
    """In-place GEMM on CudaNdarrays: z <- gemm(z, a, x, y, b).

    The first input is destroyed (updated in place) and returned as the
    output, hence ``destroy_map``.  The scalars ``a`` and ``b`` stay on the
    host; they may be float32 or float64 ndarrays (see the C code).
    """
    # Output 0 overwrites input 0 (z).
    destroy_map = {0: [0]}

    def __eq__(self, other):
        # All instances are interchangeable: equality is by type only.
        return type(self) == type(other)

    def __hash__(self):
        # Consistent with __eq__: hash by type only.
        return hash(type(self))

    def make_node(self, z, a, x, y, b):
        # the more complicated error checking performed by tensor.gemm
        # is assumed to already have been done
        return Apply(self, [z, a, x, y, b], [z.type()])

    def c_code_cache_version(self):
        # Empty tuple: no versioning yet, code is recompiled as needed.
        return ()

    def c_code(self, node, name, inputs, outputs, sub):
        z_in, a, x, y, b = inputs
        z_out, = outputs
        fail = sub['fail']
        return """
        #define REAL float
        float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT)
            ? (REAL)(((float*)%(a)s->data)[0])
            : (REAL)(((double*)%(a)s->data)[0]);
        float %(name)s_b = (%(b)s->descr->type_num == PyArray_FLOAT) ?
            (REAL)(((float*)%(b)s->data)[0])
            : (REAL)(((double*)%(b)s->data)[0]);
        #undef REAL
        if (CudaNdarray_gemm(%(name)s_a, cnda_%(x)s, cnda_%(y)s, %(name)s_b, cnda_%(z_in)s))
        {
            %(fail)s;
        }
        /* The op computes in place: the output IS the (updated) input. */
        cnda_%(z_out)s = cnda_%(z_in)s;
        Py_INCREF(cnda_%(z_out)s);
        """ % locals()

gpu_gemm = GpuGemm()
from theano import tensor, scalar, compile
from theano.gof import local_optimizer, EquilibriumDB
from theano.compile import optdb

from .basic_ops import *
from .blas import gpu_dot22, gpu_gemm

#optdb.print_summary()  # this shows what is currently registered (in a so-far crude way...)

# Database of local optimizers that move graph nodes onto the GPU.
gpu_optimizer = EquilibriumDB()

# Run the GPU database just after the inplace optimizations
# (priority inplace_opt + 5, defaulting to 80 when unknown).
optdb.register('gpu',
        gpu_optimizer,
        optdb.__priority__.get('inplace_opt', 75) + 5,
        'fast_run',
        'inplace')
def register_opt(*tags, **kwargs):
    """Decorator factory: register a local optimizer in ``gpu_optimizer``.

    The optimizer is registered under ``kwargs['name']`` when given,
    otherwise under the decorated function's ``__name__``, and always with
    the 'fast_run' and 'inplace' tags plus any extra ``tags``.
    """
    def f(local_opt):
        # Use pop(..., None) so passing unrelated kwargs (or none at all)
        # cannot raise KeyError, unlike `kwargs and kwargs.pop('name')`.
        name = kwargs.pop('name', None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'inplace', *tags)
        return local_opt
    return f
@register_opt()
@local_optimizer([GpuFromHost(), None])
def local_gpu_host_gpu(node):
    """Collapse the redundant round-trip gpu_from_host(host_from_gpu(x)) -> x."""
    if not tensor.opt.opt.check_chain(node, GpuFromHost(), HostFromGpu()):
        return False
    # Return the variable that was originally on the GPU.
    return [node.inputs[0].owner.inputs[0]]
@register_opt()
@local_optimizer([HostFromGpu(), None])
def local_host_gpu_host(node):
    """Collapse the redundant round-trip host_from_gpu(gpu_from_host(x)) -> x."""
    if not tensor.opt.opt.check_chain(node, HostFromGpu(), GpuFromHost()):
        return False
    # Return the variable that was originally on the host.
    return [node.inputs[0].owner.inputs[0]]
# NOTE(review): the span below is a side-by-side diff scrape (old and new
# revision text fused on each line) and the '@@ -25,8 +44,9 @@' hunk marker
# means part of local_gpu_elemwise_0's body is omitted from this view.
# The function cannot be safely reconstructed from here -- recover the full
# file from version control before editing.
@gof.local_optimizer([]) @register_opt()
@local_optimizer([])
def local_gpu_elemwise_0(node): def local_gpu_elemwise_0(node):
if isinstance(node.op, tensor.Elemwise): if isinstance(node.op, tensor.Elemwise):
if any(hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs): if any(hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs):
...@@ -25,8 +44,9 @@ def local_gpu_elemwise_0(node): ...@@ -25,8 +44,9 @@ def local_gpu_elemwise_0(node):
new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern) new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
return [host_from_gpu(new_op(*(gpu_from_host(i) for i in node.inputs)))] return [host_from_gpu(new_op(*(gpu_from_host(i) for i in node.inputs)))]
return False return False
tensor.opt.register_specialize(local_gpu_elemwise_0, 'gpu')
# NOTE(review): fused side-by-side diff scrape; the '@@ -38,9 +58,9 @@' hunk
# marker means part of local_gpu_elemwise_1's body is omitted from this view.
# Do not edit from this text -- recover the full file from version control.
@gof.local_optimizer([]) @register_opt()
@local_optimizer([])
def local_gpu_elemwise_1(node): def local_gpu_elemwise_1(node):
""" """
gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...)) gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...))
...@@ -38,9 +58,9 @@ def local_gpu_elemwise_1(node): ...@@ -38,9 +58,9 @@ def local_gpu_elemwise_1(node):
new_op = GpuElemwise(elemwise_node.op.scalar_op, elemwise_node.op.inplace_pattern) new_op = GpuElemwise(elemwise_node.op.scalar_op, elemwise_node.op.inplace_pattern)
return [new_op(*(gpu_from_host(i) for i in elemwise_node.inputs))] return [new_op(*(gpu_from_host(i) for i in elemwise_node.inputs))]
return False return False
tensor.opt.register_specialize(local_gpu_elemwise_1, 'gpu')
# NOTE(review): fused side-by-side diff scrape; the '@@ -56,9 +76,9 @@' hunk
# marker means part of local_gpu_dimshuffle_0's body is omitted from this
# view.  Do not edit from this text -- recover the full file from version
# control.
@gof.local_optimizer([]) @register_opt()
@local_optimizer([])
def local_gpu_dimshuffle_0(node): def local_gpu_dimshuffle_0(node):
""" """
dimshuffle(host_from_gpu()) -> host_from_gpu(gpu_dimshuffle) dimshuffle(host_from_gpu()) -> host_from_gpu(gpu_dimshuffle)
...@@ -56,9 +76,9 @@ def local_gpu_dimshuffle_0(node): ...@@ -56,9 +76,9 @@ def local_gpu_dimshuffle_0(node):
else: else:
return [host_from_gpu(new_op(gpu_from_host(tensor.tensor_copy(input))))] return [host_from_gpu(new_op(gpu_from_host(tensor.tensor_copy(input))))]
return False return False
tensor.opt.register_specialize(local_gpu_dimshuffle_0, 'gpu')
# NOTE(review): fused side-by-side diff scrape; the '@@ -71,5 +91,44 @@' hunk
# marker means part of local_gpu_dimshuffle_1's body is omitted from this
# view.  Do not edit from this text -- recover the full file from version
# control.
@gof.local_optimizer([]) @register_opt()
@local_optimizer([])
def local_gpu_dimshuffle_1(node): def local_gpu_dimshuffle_1(node):
""" """
gpu_from_host(dimshuffle) -> gpu_dimshuffle(gpu_from_host) gpu_from_host(dimshuffle) -> gpu_dimshuffle(gpu_from_host)
...@@ -71,5 +91,44 @@ def local_gpu_dimshuffle_1(node): ...@@ -71,5 +91,44 @@ def local_gpu_dimshuffle_1(node):
dimshuffle_node.op.new_order) dimshuffle_node.op.new_order)
return [new_op(gpu_from_host(dimshuffle_node.inputs[0]))] return [new_op(gpu_from_host(dimshuffle_node.inputs[0]))]
return False return False
tensor.opt.register_specialize(local_gpu_dimshuffle_1, 'gpu')
@register_opt()
@local_optimizer([])
def local_gpu_dot(node):
    """
    gpu_from_host(dot) -> gpudot(gpu_from_host)
    dot(host_from_gpu) -> host_from_gpu(gpudot)
    """
    if node.op == gpu_from_host:
        # Case 1: the result of a host _dot22 is being moved to the GPU;
        # compute the product on the GPU instead.
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op == tensor.blas._dot22:
            x, y = host_input.owner.inputs
            return [gpu_dot22(gpu_from_host(x), gpu_from_host(y))]
    if node.op == tensor.blas._dot22:
        # Case 2: at least one operand already lives on the GPU; any
        # redundant gpu_from_host(host_from_gpu(...)) pairs introduced here
        # are collapsed by local_gpu_host_gpu.
        if any((i.owner and i.owner.op == host_from_gpu) for i in node.inputs):
            x, y = node.inputs
            return [host_from_gpu(gpu_dot22(gpu_from_host(x), gpu_from_host(y)))]
    return False
@register_opt()
@local_optimizer([])
def local_gpu_gemm(node):
    """
    gpu_from_host(gemm) -> gpu_gemm(gpu_from_host)
    gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
    """
    if node.op == gpu_from_host:
        # Case 1: a host gemm result is being moved to the GPU; run the
        # gemm there instead.  The scalars a and b stay on the host
        # (see GpuGemm.make_node).
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op == tensor.blas.gemm:
            z, a, x, y, b = host_input.owner.inputs
            return [gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)]
    if node.op == tensor.blas.gemm:
        # Case 2: any of the matrix operands already lives on the GPU.
        z, a, x, y, b = node.inputs
        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
        if x_on_gpu or y_on_gpu or z_on_gpu:
            return [host_from_gpu(gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
    return False
import sys, time
from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc
from theano import tensor
import numpy
import theano_cuda_ndarray as tcn
def test_dot():
a = tcn.shared_constructor(numpy.random.rand(4,4), 'a')
b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, tensor.dot(a,b))])
a0 = a.value * 1.0
print a0
for i, node in enumerate(f.maker.env.toposort()):
print i, node
bval = numpy.random.rand(4,4)
f(bval)
print a.value
assert numpy.allclose(numpy.dot(a0, bval), a.value)
def test_gemm():
a = tcn.shared_constructor(numpy.random.rand(4,4), 'a')
b = tensor.fmatrix('b')
c = tensor.fmatrix('c')
f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))])
a0 = a.value * 1.0
print a0
for i, node in enumerate(f.maker.env.toposort()):
print i, node
bval = numpy.random.rand(4,4)
cval = numpy.random.rand(4,4)
f(bval,cval)
print a.value
assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.value)
# NOTE(review): trailer of a third diffed file.  The '@@ -56,4 +56,7 @@'
# hunk marker means cmp_sigmoids_T's definition is omitted from this view;
# only the fused __main__ guard and some trailing comments survive.  Recover
# the full file from version control before editing.
...@@ -56,4 +56,7 @@ def cmp_sigmoids_T(shape): ...@@ -56,4 +56,7 @@ def cmp_sigmoids_T(shape):
if __name__ == '__main__': if __name__ == '__main__':
eval(sys.argv[1]) eval(sys.argv[1])
#cmp_sigmoids((640, 64*64)) # looks great in profiler
#cmp_sigmoids((173, 74*49))
#cmp_sigmoids_T((173, 74*49))
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with care.
Please finish editing this comment first!
Register or sign in to post a comment