提交 7f7749d2 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Add nervana gemm (for float16).

上级 804f9114
...@@ -29,7 +29,7 @@ AddConfigVar('gpuarray.sync', ...@@ -29,7 +29,7 @@ AddConfigVar('gpuarray.sync',
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor) GpuArraySharedVariable, gpuarray_shared_constructor)
from . import opt from . import opt, nerv
def init_dev(dev): def init_dev(dev):
......
import numpy
import theano
from theano import Op, Apply, Variable, tensor
from theano.compile import optdb
from theano.gof import local_optimizer
from theano.scalar import as_scalar, constant
from . import opt
from .basic_ops import (as_gpuarray_variable, gpu_alloc, gpu_from_host,
host_from_gpu)
from .opt_util import alpha_merge, output_merge
from .pycuda_helper import ensure_pycuda_context
# nervanagpu is an optional dependency providing the float16 GEMM
# kernels.  When it is missing, BOTH names are set to None so that
# make_node() can raise a clear RuntimeError instead of later code
# failing with a NameError on `nerv`.
try:
    from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
    nerv = NervanaGPU()
except ImportError:
    GPUTensor = None
    nerv = None
def to_gputensor(a):
    """Wrap the contiguous pygpu array *a* in a nervanagpu GPUTensor.

    The returned tensor aliases *a*'s device memory (no copy is made);
    F-contiguous inputs are flagged as transposed.
    """
    assert a.flags.c_contiguous or a.flags.f_contiguous
    transposed = a.flags.f_contiguous
    return GPUTensor(a.shape, dtype=a.dtype, base=a,
                     gpudata=a.gpudata + a.offset,
                     strides=a.strides, is_trans=transposed)
def ensure_float(val, name):
    """Return *val* as a float32 theano scalar Variable.

    Plain python values are wrapped in a constant and 0-d variables are
    lowered to scalars first.  Raises TypeError (mentioning *name*) when
    the result is not a float32 scalar.
    """
    if not isinstance(val, Variable):
        val = constant(val)
    if hasattr(val, 'ndim') and val.ndim == 0:
        val = as_scalar(val)
    if not isinstance(val.type, theano.scalar.Scalar):
        raise TypeError("%s: expected a scalar value" % (name,))
    if val.type.dtype != 'float32':
        raise TypeError("%s: type is not float32" % (name,))
    return val
class Gemm16(Op):
    """float16 matrix multiply backed by the nervanagpu kernels.

    Computes ``C <- alpha * dot(A, B) + beta * C``, optionally writing
    into C in place.  ``relu=True`` is not supported yet.
    """
    __props__ = ('relu', 'inplace')
    _f16_ok = True

    def __init__(self, relu=False, inplace=False):
        self.relu = relu
        # relu = True will require more work in optimizations.
        assert self.relu == False
        self.inplace = inplace
        if self.inplace:
            # Declare that output 0 destroys input 0 (C).
            self.destroy_map = {0: [0]}

    def make_node(self, C, alpha, A, B, beta):
        """Build the Apply node; all matrix inputs must be float16."""
        if GPUTensor is None:
            raise RuntimeError("Can't use Gemm16: nervanagpu not found")
        A = as_gpuarray_variable(A)
        B = as_gpuarray_variable(B)
        C = as_gpuarray_variable(C)
        alpha = ensure_float(alpha, 'alpha')
        beta = ensure_float(beta, 'beta')
        assert C.dtype == A.dtype == B.dtype == 'float16'
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def perform(self, node, inputs, outputs):
        # Needed for its side effect: make sure a pycuda context is live.
        ensure_pycuda_context()
        C, alpha, A, B, beta = inputs
        # The nervana code does not support the case where both inputs
        # are trans (F-contiguous), so copy one of them if that is the
        # case.  We copy the smaller one.
        if A.flags.f_contiguous and B.flags.f_contiguous:
            if A.size < B.size:
                A = A.copy()
            else:
                B = B.copy()
        # Only write into C directly when the op is inplace AND C is
        # contiguous (either order); otherwise operate on a copy.
        if not (self.inplace and C.flags.forc):
            C = C.copy()
        nerv.dot(to_gputensor(A), to_gputensor(B), to_gputensor(C),
                 alpha=alpha, beta=beta, relu=False)
        outputs[0][0] = C
@opt.register_opt()
@local_optimizer([tensor.Dot])
def local_dot_to_gemm16(node):
    """Replace a float16 matrix-matrix tensor.Dot with Gemm16 on GPU."""
    # Exact type check: subclasses of Dot are handled elsewhere.
    if type(node.op) != tensor.Dot:
        return
    x, y = node.inputs
    if x.dtype != 'float16' or y.dtype != 'float16':
        return
    if x.ndim != 2 or y.ndim != 2:
        return
    A = gpu_from_host(x)
    B = gpu_from_host(y)
    # Fresh zero output; with beta=0 Gemm16 overwrites it entirely.
    C = gpu_alloc(numpy.asarray(0, dtype='float16'),
                  A.shape[0], B.shape[1])
    return [host_from_gpu(Gemm16()(C, 1.0, A, B, 0.0))]
@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2)
def local_gemm16_alpha_merge(node, *inputs):
    """Fold a scalar multiply of the output into Gemm16's alpha/beta."""
    new_op = Gemm16(relu=node.op.relu)
    return [new_op(*inputs)]
@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2)
def local_gemm16_output_merge(node, *inputs):
    """Fold an addition on the output into Gemm16's C input."""
    new_op = Gemm16(relu=node.op.relu)
    return [new_op(*inputs)]
@local_optimizer([Gemm16], inplace=True)
def local_gemm16_inplace(node):
    """Swap a non-inplace Gemm16 for its destructive counterpart."""
    op = node.op
    if type(op) != Gemm16 or op.inplace:
        return
    return [Gemm16(relu=op.relu, inplace=True)(*node.inputs)]
# Register the inplace substitution as its own in2out pass at position
# 70.0 (presumably late enough to run after the merge optimizations
# above -- confirm against optdb ordering), enabled under fast_run.
optdb.register('local_gemm16_inplace',
               tensor.opt.in2out(local_gemm16_inplace,
                                 name='local_gemm16_inplace'),
               70.0, 'fast_run', 'inplace', 'gpuarray')
from functools import wraps
import numpy
import theano
from theano import scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, host_from_gpu
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
def grab_cpu_scalar(v, nd):
    """Return the CPU scalar broadcast into the *nd*-dim variable *v*.

    Recognizes a scalar dimshuffled up to *nd* dimensions (CPU or GPU),
    possibly behind a host->GPU transfer, or an all-broadcastable
    Constant.  Returns None when *v* does not match any of these.
    """
    bcast_order = ('x',) * nd
    node = v.owner
    if node is None:
        # Leaf variable: only a fully-broadcastable constant qualifies.
        if isinstance(v, Constant) and v.broadcastable == (True,) * nd:
            return v.dimshuffle(())
        return None
    op = node.op
    # Check GpuDimShuffle before DimShuffle on purpose: the GPU variant
    # needs a transfer back to the host.
    if isinstance(op, GpuDimShuffle) and op.new_order == bcast_order:
        return host_from_gpu(node.inputs[0])
    if isinstance(op, DimShuffle) and op.new_order == bcast_order:
        return node.inputs[0]
    if isinstance(op, GpuFromHost):
        # Look through the transfer and retry on the host-side input.
        return grab_cpu_scalar(node.inputs[0], nd=nd)
    return None
def find_node(v, cls):
    """Return the Apply node of op class *cls* that computes *v*.

    Digs through redundant GPU->host->GPU transfer pairs on the way.
    Returns None when no such node is found.
    """
    node = v.owner
    if node is None:
        return None
    if isinstance(node.op, cls):
        return node
    if isinstance(node.op, GpuFromHost):
        inner = node.inputs[0].owner
        if inner is not None and isinstance(inner.op, HostFromGpu):
            # Redundant round trip: keep searching below it.
            return find_node(inner.inputs[0], cls)
    return None
def is_equal(var, val):
    """Return True iff *var* is constant and always equal to *val*.

    A non-constant *var* yields False (never raises).
    """
    try:
        return get_scalar_constant_value(var) == val
    except NotScalarConstantError:
        return False
def alpha_merge(cls, alpha_in, beta_in, nd):
    """Decorator factory folding ``scalar * <cls output>`` into the op.

    The produced decorator wraps *maker* in a GpuElemwise local
    optimizer: when a broadcasted scalar multiplies the output of a
    *cls* node, the scalar is multiplied into the op's inputs at index
    *alpha_in* and *beta_in*.  This is valid because
    ``lr * (alpha*X + beta*Y) == (lr*alpha)*X + (lr*beta)*Y``.
    *nd* is the rank of the elemwise operands.
    """
    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(node):
            # Only a two-input elemwise multiplication can match.
            if (isinstance(node.op, GpuElemwise) and
                    node.op.scalar_op == scal.mul and
                    node.nin == 2):
                # The cls node and the scalar may be on either side.
                targ = find_node(node.inputs[0], cls)
                if targ is None:
                    targ = find_node(node.inputs[1], cls)
                    lr = grab_cpu_scalar(node.inputs[0], nd=nd)
                else:
                    lr = grab_cpu_scalar(node.inputs[1], nd=nd)
                if lr is None or targ is None:
                    return None
                inputs = list(targ.inputs)
                # Scale both alpha and beta by the merged scalar.
                inputs[alpha_in] = lr * targ.inputs[alpha_in]
                inputs[beta_in] = lr * targ.inputs[beta_in]
                return maker(targ, *inputs)
        return opt
    return wrapper
def output_merge(cls, alpha_in, beta_in, out_in, nd):
    """Decorator factory folding ``W + <cls output>`` into the op.

    The produced decorator wraps *maker* in a GpuElemwise local
    optimizer: for an addition of W to the output of a *cls* node whose
    beta input (index *beta_in*) is currently 0, W becomes the op's
    output input (index *out_in*) and beta is replaced by 1.
    *nd* is the rank of the elemwise operands.
    """
    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(node):
            # Only a two-input elemwise addition can match.
            if (isinstance(node.op, GpuElemwise) and
                    node.op.scalar_op == scal.add and
                    node.nin == 2):
                # The cls node and W may be on either side of the add.
                targ = find_node(node.inputs[0], cls)
                W = node.inputs[1]
                if targ is None:
                    targ = find_node(node.inputs[1], cls)
                    W = node.inputs[0]
                if targ is None:
                    return None
                if not is_equal(targ.inputs[beta_in], 0.0):
                    # other cases are too complex for now
                    return None
                if W.broadcastable != targ.inputs[out_in].broadcastable:
                    # Would need to explicitly tile the output to fill
                    # the full shape here. Disable for now.
                    return None
                inputs = list(targ.inputs)
                inputs[out_in] = W
                inputs[beta_in] = _one.clone()
                return maker(targ, *inputs)
        return opt
    return wrapper
# PyCUDA is optional, and we additionally require a version recent
# enough to provide Context.attach(); older versions are treated the
# same as a missing install.
try:
    from pycuda.driver import Context
    if not hasattr(Context, 'attach'):
        raise ImportError('too old')
except ImportError:
    Context = None

# Module-level state for the lazily-attached pycuda context
# (see ensure_pycuda_context below).
pycuda_initialized = False
pycuda_context = None
def ensure_pycuda_context():
    """Attach pycuda to the current CUDA context (once) and return it.

    Raises RuntimeError when pycuda is missing or lacks Context.attach.
    Subsequent calls return the cached context.
    """
    global pycuda_context, pycuda_initialized
    if pycuda_initialized:
        return pycuda_context
    if Context is None:
        raise RuntimeError("PyCUDA not found or too old.")
    pycuda_context = Context.attach()
    # Detach at interpreter shutdown to balance the attach above.
    import atexit
    atexit.register(pycuda_context.detach)
    pycuda_initialized = True
    return pycuda_context
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论