Merge pull request #2927 from abergeron/nerv_ops

Nerv ops

Merge pull request #2927 from abergeron/nerv_ops
6a93ccc7 · Frédéric Bastien · b40ba487 · 5d34eefe · 6a93ccc7 · 6a93ccc7
--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -29,7 +29,7 @@ AddConfigVar('gpuarray.sync',
 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                  GpuArraySharedVariable, gpuarray_shared_constructor)
-from . import opt
+from . import opt, nerv
 def init_dev(dev):

--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -586,11 +586,13 @@ class GpuAlloc(HideC, Alloc):
        return s
    def make_node(self, value, *shape):
-        res = Alloc.make_node(self, value, *shape)
        value = as_gpuarray_variable(value)
-        otype = GpuArrayType(dtype=res.outputs[0].dtype,
+        sh, bcast = self.validate_shape(shape)
-                             broadcastable=res.outputs[0].broadcastable)
+        # The value is broadcast into the requested shape, so it cannot
+        # have more dimensions than that shape.
+        if value.ndim > len(sh):
-        return Apply(self, [value] + res.inputs[1:], [otype()])
+            raise TypeError("The GpuAlloc value to use has more dimensions "
+                            "than the specified shape", value.ndim, len(sh))
+        otype = value.type.clone(broadcastable=bcast)
+        return Apply(self, [value] + sh, [otype()])
    def c_headers(self):
        return ['<numpy_compat.h>']
@@ -600,7 +602,7 @@ class GpuAlloc(HideC, Alloc):
        v = inputs[0]
        sh = tuple(map(int, inputs[1:]))
        if out[0] is None or out[0].shape != sh:
-            if v.size == 1 and numpy.asarray(v).flatten().item() == 0:
+            if self.memset_0:
                out[0] = gpuarray.zeros(sh, dtype=v.dtype)
            else:
                out[0] = gpuarray.empty(sh, dtype=v.dtype)
@@ -712,9 +714,74 @@ class GpuAlloc(HideC, Alloc):
                return False
        return True
 gpu_alloc = GpuAlloc()
+class GpuAllocEmpty(HideC, Alloc):
+    """
+    Allocate a GPU array of a fixed dtype without initializing it.
+    The inputs of the node are the dimensions of the output, one per
+    dimension.  Nothing is written to the output buffer, so its content
+    is unspecified.
+    """
+    __props__ = ('dtype',)
+    _f16_ok = True
+    def __init__(self, dtype):
+        self.dtype = dtype
+    def make_node(self, *shape):
+        """Build the Apply node for an output with the dimensions `shape`."""
+        # validate_shape converts the dimensions to variables and works
+        # out the broadcastable pattern of the output.
+        sh, bcast = self.validate_shape(shape)
+        otype = GpuArrayType(dtype=self.dtype, broadcastable=bcast)
+        return Apply(self, sh, [otype()])
+    def perform(self, node, inputs, out_):
+        """Python implementation: reuse or allocate the output buffer."""
+        out = out_[0]
+        # Build a tuple (as GpuAlloc.perform does): a list never compares
+        # equal to the shape tuple of the previous output, which would
+        # force a new allocation on every call.
+        sh = tuple(int(i) for i in inputs)
+        if out[0] is None or out[0].shape != sh:
+            out[0] = pygpu.empty(sh, dtype=self.dtype)
+        # if out[0] is the right shape, we just return it
+    def c_headers(self):
+        return ['<gpuarray_helper.h>']
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
+    def c_code(self, node, name, inp, out, sub):
+        """Generate C code that fills `shape` and calls theano_prep_output."""
+        ndim = len(inp)
+        zz = out[0]
+        fail = sub['fail']
+        code = ["""
+int i;
+size_t shape[%(ndim)s];
+""" % dict(ndim=ndim)]
+        # One assignment per dimension: read the scalar value of each
+        # shape input.
+        for i, shp_i in enumerate(inp):
+            code.append("""
+shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
+""" % dict(i=i, shp_i=shp_i))
+        code.append("""
+if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
+                       pygpu_default_context())) {
+  %(fail)s
+}
+""" % dict(zz=zz, ndim=ndim, type=gpuarray.dtype_to_typecode(self.dtype),
+           fail=fail))
+        return ''.join(code)
+    def c_code_cache_version(self):
+        return (0,)
+    def do_constant_folding(self, node):
+        # The output has no defined value, so there is nothing to fold.
+        return False
+    def infer_shape(self, node, input_shapes):
+        # The inputs of the node are exactly the output dimensions.
+        return [node.inputs]
+    def grad(self, *args):
+        # Don't reuse the grad implementation from Alloc
+        raise NotImplementedError("grad disabled")
 class GpuContiguous(Op):
    """
    Always return a c contiguous output. Copy the input only if it is

--- a/theano/sandbox/gpuarray/gemm16.c
+++ b/theano/sandbox/gpuarray/gemm16.c
+#section init_code_struct
+/*
+ * rand_buf is handed to every kernel launch as its first argument (see
+ * gemm16() below).  It is allocated here with 2048 * 32 GA_UINT elements
+ * and its content is never initialized by this file.
+ */
+/* Why do we need this? */
+size_t dim = 2048 * 32;
+rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, pygpu_default_context(),
+                       Py_None);
+if (rand_buf == NULL) {
+  FAIL;
+}
+#section support_code_struct
+PyGpuArrayObject *rand_buf;
+/*
+ * Launch the nervana GEMM kernel matching the memory layout of A and B
+ * and store the result in *out.
+ *
+ * When GEMM16_INPLACE is set and C is C-contiguous, *out becomes a new
+ * reference to C; otherwise C is copied into *out (reusing the previous
+ * *out when theano_try_copy() can).  alpha and beta are forwarded
+ * unchanged to the kernel.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+int gemm16(PyGpuArrayObject *C, float alpha,
+           PyGpuArrayObject *A, PyGpuArrayObject *B,
+           float beta, PyGpuArrayObject **out) {
+  PyGpuArrayObject *_A = NULL;
+  PyGpuArrayObject *_B = NULL;
+  /* Stays NULL if no kernel matches, which is checked before launch. */
+  GpuKernel *gk = NULL;
+  char *prand, *pA, *pB, *pout;
+  void *params[13];
+  size_t grid[2];
+  size_t threads[2];
+  int res = 0;
+  int flags = 0;
+  int lda, ldb, ldc, n, m, k;
+  int n128, n64;
+  int size = 0;
+  int vec = 0;
+  /* Cached across calls; 0 means "not queried yet". */
+  static unsigned int nprocs = 0;
+  char opA, opB;
+  if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY) &&
+      GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
+    /*
+     * The nervana kernels do not cover the case where both inputs are
+     * trans so we need to copy one of them.  We choose the smallest
+     * one.
+     */
+    if (PyGpuArray_DIM(A, 0) * PyGpuArray_DIM(A, 1) <
+        PyGpuArray_DIM(B, 0) * PyGpuArray_DIM(B, 1)) {
+      _A = pygpu_copy(A, GA_C_ORDER);
+      if (_A == NULL) {
+        res = 1;
+        goto cleanup;
+      }
+      /*
+       * This is not an extra reference on _A so don't add an INCREF.
+       * Also, we don't lose the ref on A since our caller will deal
+       * with it.
+       */
+      A = _A;
+    } else {
+      _B = pygpu_copy(B, GA_C_ORDER);
+      if (_B == NULL) {
+        res = 1;
+        goto cleanup;
+      }
+      /*
+       * This is not an extra reference on _B so don't add an INCREF
+       * Also, we don't lose the ref on B since our caller will deal
+       * with it.
+       */
+      B = _B;
+    }
+  }
+  if (GEMM16_INPLACE && GpuArray_CHKFLAGS(&C->ga, GA_CARRAY)) {
+    /*
+     * Take the new reference before dropping the old one: *out may
+     * already be C.
+     */
+    Py_INCREF(C);
+    Py_XDECREF(*out);
+    *out = C;
+  } else {
+    *out = theano_try_copy(*out, C);
+    if (*out == NULL) {
+      res = 1;
+      goto cleanup;
+    }
+  }
+  if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY)) {
+    opA = 't';
+    lda = PyGpuArray_STRIDE(A, 1);
+  } else {
+    opA = 'n';
+    lda = PyGpuArray_STRIDE(A, 0);
+  }
+  if (GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
+    opB = 't';
+    ldb = PyGpuArray_STRIDE(B, 1);
+  } else {
+    opB = 'n';
+    ldb = PyGpuArray_STRIDE(B, 0);
+  }
+  ldc = PyGpuArray_STRIDE(*out, 0);
+  /* lda and friends are in number of elements, not bytes */
+  lda /= 2;
+  ldb /= 2;
+  ldc /= 2;
+  m = PyGpuArray_DIM(*out, 0);
+  n = PyGpuArray_DIM(*out, 1);
+  k = PyGpuArray_DIM(B, 0);
+  /* Tuning code adapted from the python version */
+  grid[0] = (m + 127) / 128;
+  if (opA == 'n' && opB == 't')
+    size = 128;
+  else {
+    if (n < 384-16) {
+      n128 = n % 128;
+      if (n128 < 112) {
+        if (48 < n128 && n128 <= 64) {
+          n64 = n / 64;
+          if (nprocs == 0)
+            if (A->ga.ops->property(A->context->ctx, NULL, NULL,
+                                    GA_CTX_PROP_NUMPROCS, &nprocs)) {
+              nprocs = 0;
+              /* Never report a failure without a Python exception set. */
+              PyErr_SetString(PyExc_RuntimeError,
+                              "error querying GA_CTX_PROP_NUMPROCS");
+              res = 1;
+              goto cleanup;
+            }
+          n64 *= (grid[0] / nprocs);
+          if (n64 > 1 || (opA == 't' && opB == 'n'))
+            size = 64;
+          else
+            size = 32;
+        } else {
+          size = 32;
+        }
+      } else {
+        size = 128;
+      }
+    } else {
+      size = 128;
+    }
+  }
+  grid[1] = (n + (size-1)) / size;
+  if (size == 128)
+    threads[0] = 256;
+  else
+    threads[0] = 128;
+  threads[1] = 1;
+  if ((opA == 't' && opB == 'n' && m % 8 == 0 && n % 8 == 0) ||
+      (opA == 'n' && opB == 'n' && k % 16 == 0 && n % 8 == 0) ||
+      (opA == 'n' && opB == 't' && k % 16 == 0))
+    vec = 1;
+  switch (size) {
+  case 128:
+    if (opA == 'n' && opB == 'n') {
+      if (vec)
+        gk = &k_nn_vec_128x128;
+      else
+        gk = &k_nn_128x128;
+    } else if (opA == 'n' && opB == 't') {
+      if (vec)
+        gk = &k_nt_vec_128x128;
+      else
+        gk = &k_nt_128x128;
+    } else if (opA == 't' && opB == 'n') {
+      if (vec)
+        gk = &k_tn_vec_128x128;
+      else
+        gk = &k_tn_128x128;
+    }
+    break;
+  case 64:
+    if (opA == 'n' && opB == 'n') {
+      if (vec)
+        gk = &k_nn_vec_128x64;
+      else
+        gk = &k_nn_128x64;
+    } else if (opA == 't' && opB == 'n') {
+      if (vec)
+        gk = &k_tn_vec_128x64;
+      else
+        gk = &k_tn_128x64;
+    }
+    break;
+  case 32:
+    if (opA == 'n' && opB == 'n') {
+      if (vec)
+        gk = &k_nn_vec_128x32;
+      else
+        gk = &k_nn_128x32;
+    } else if (opA == 't' && opB == 'n') {
+      if (vec)
+        gk = &k_tn_vec_128x32;
+      else
+        gk = &k_tn_128x32;
+    }
+    break;
+  default:
+    PyErr_SetString(PyExc_RuntimeError, "error selecting kernel");
+    res = 1;
+    goto cleanup;
+  }
+  if (gk == NULL) {
+    /* The size was valid but no kernel covers this pair of layouts. */
+    PyErr_SetString(PyExc_RuntimeError, "error selecting kernel");
+    res = 1;
+    goto cleanup;
+  }
+  /* Raw device pointers, adjusted by the offset of each array. */
+  prand = *((char **)rand_buf->ga.data);
+  prand += rand_buf->ga.offset;
+  pA = *((char **)A->ga.data);
+  pA += A->ga.offset;
+  pB = *((char **)B->ga.data);
+  pB += B->ga.offset;
+  pout = *((char **)(*out)->ga.data);
+  pout += (*out)->ga.offset;
+  params[0] = &prand;
+  params[1] = &pA;
+  params[2] = &pB;
+  params[3] = &pout;
+  params[4] = &lda;
+  params[5] = &ldb;
+  params[6] = &ldc;
+  params[7] = &m;
+  params[8] = &n;
+  params[9] = &k;
+  params[10] = &alpha;
+  params[11] = &beta;
+  params[12] = &flags;
+  if (GpuKernel_call(gk, 2, threads, grid, 0, params) != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call");
+    res = 1;
+  }
+cleanup:
+  /* Release the temporary copies, if any were made. */
+  Py_XDECREF(_A);
+  Py_XDECREF(_B);
+  return res;
+}
--- a/theano/sandbox/gpuarray/gpuarray_helper.h
+++ b/theano/sandbox/gpuarray/gpuarray_helper.h
@@ -24,4 +24,22 @@ static int theano_prep_output(PyGpuArrayObject **out, unsigned int nd,
  return (*out == NULL) ? 1 : 0;
 }
+/*
+ * Return an array holding the content of V.
+ *
+ * `out` is reused (V is moved into it) when it is non-NULL, passes the
+ * GA_CARRAY flag check and passes theano_size_check() against V.
+ * Otherwise the reference to `out` is dropped and a new GA_C_ORDER copy
+ * of V is returned.  On failure the reference to `out` is dropped and
+ * NULL is returned.
+ */
+static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out,
+                                         PyGpuArrayObject *V) {
+  int reusable = out &&
+                 GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) &&
+                 theano_size_check(out, PyGpuArray_NDIM(V),
+                                   PyGpuArray_DIMS(V),
+                                   V->ga.typecode);
+  if (!reusable) {
+    Py_XDECREF(out);
+    return pygpu_copy(V, GA_C_ORDER);
+  }
+  if (pygpu_move(out, V)) {
+    Py_XDECREF(out);
+    return NULL;
+  }
+  return out;
+}
 #endif
--- a/theano/sandbox/gpuarray/nerv.py
+++ b/theano/sandbox/gpuarray/nerv.py
+import os.path
+import theano
+from theano import Apply, Variable, tensor
+from theano.compile import optdb
+from theano.compile.ops import shape_i
+from theano.gof import local_optimizer, COp
+from theano.scalar import as_scalar, constant
+from . import opt
+from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty)
+from .opt_util import alpha_merge, output_merge
+from .pycuda_helper import ensure_pycuda_context
+try:
+    from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
+    nerv = NervanaGPU()
+except ImportError:
+    GPUTensor = None
+    nerv = None
+def to_gputensor(a):
+    """
+    Build a nervanagpu GPUTensor from the array `a`, passing along its
+    shape, dtype, strides and device pointer (adjusted by its offset).
+    `a` must be C- or F-contiguous; F-contiguous arrays are flagged as
+    transposed.
+    """
+    flags = a.flags
+    assert flags.c_contiguous or flags.f_contiguous
+    device_ptr = a.gpudata + a.offset
+    return GPUTensor(a.shape, dtype=a.dtype, base=a, gpudata=device_ptr,
+                     strides=a.strides, is_trans=flags.f_contiguous)
+def ensure_float(val, name):
+    """
+    Return `val` as a float32 theano scalar variable.
+    Non-variables are wrapped in a scalar constant and 0-d variables are
+    converted with as_scalar.  `name` is only used in error messages.
+    Raises TypeError if the result is not a scalar or not float32.
+    """
+    if not isinstance(val, Variable):
+        val = constant(val)
+    is_zero_dim = hasattr(val, 'ndim') and val.ndim == 0
+    if is_zero_dim:
+        val = as_scalar(val)
+    val_type = val.type
+    if not isinstance(val_type, theano.scalar.Scalar):
+        raise TypeError("%s: expected a scalar value" % (name,))
+    if val_type.dtype != 'float32':
+        raise TypeError("%s: type is not float32" % (name,))
+    return val
+class Gemm16(COp):
+    """
+    Matrix product of float16 GPU matrices using the nervanagpu kernels.
+    The inputs are ``(C, alpha, A, B, beta)``: alpha and beta must be
+    float32 scalars and the output has the type of C.  With
+    ``inplace=True`` the op declares (through destroy_map) that it may
+    overwrite C.
+    """
+    __props__ = ('relu', 'inplace')
+    _f16_ok = True
+    # One GpuKernel is declared and initialized for each of these names;
+    # each one is loaded from the matching hgemm_<name>.cubin file.
+    KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
+                  'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
+                  'tn_128x128', 'tn_128x64', 'tn_128x32',
+                  'tn_vec_128x128', 'tn_vec_128x64', 'tn_vec_128x32',
+                  'tn_vec_128x16', 'nt_128x128', 'nt_vec_128x128')
+    def __init__(self, relu=False, inplace=False):
+        COp.__init__(self, ["gemm16.c"], "gemm16")
+        self.relu = relu
+        # relu = True will require more work in optimizations.
+        assert self.relu is False
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}
+    def make_node(self, C, alpha, A, B, beta):
+        """
+        Build the Apply node.  Raises RuntimeError when nervanagpu could
+        not be imported and TypeError (from ensure_float) when alpha or
+        beta is not a float32 scalar.
+        """
+        if GPUTensor is None:
+            raise RuntimeError("Can't use Gemm16: nervanagpu not found")
+        A = as_gpuarray_variable(A)
+        B = as_gpuarray_variable(B)
+        C = as_gpuarray_variable(C)
+        alpha = ensure_float(alpha, 'alpha')
+        beta = ensure_float(beta, 'beta')
+        assert C.dtype == A.dtype == B.dtype == 'float16'
+        return Apply(self, [C, alpha, A, B, beta], [C.type()])
+    def perform(self, node, inputs, outputs):
+        """Python implementation, going through nervanagpu and PyCUDA."""
+        ensure_pycuda_context()
+        C, alpha, A, B, beta = inputs
+        # The nervana code does not support the case where both inputs
+        # are trans, so we need to copy one of them if that is the
+        # case. We copy the smaller one.
+        if A.flags.f_contiguous and B.flags.f_contiguous:
+            if A.size < B.size:
+                A = A.copy()
+            else:
+                B = B.copy()
+        # Only work in place on a C-contiguous C; otherwise work on a copy.
+        inplace = self.inplace
+        if inplace and not C.flags.c_contiguous:
+            inplace = False
+        if not inplace:
+            C = C.copy()
+        At = to_gputensor(A)
+        Bt = to_gputensor(B)
+        Ct = to_gputensor(C)
+        nerv.dot(At, Bt, Ct, alpha=alpha, beta=beta, relu=False)
+        outputs[0][0] = C
+    def c_headers(self):
+        return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
+                'string.h']
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
+    def get_op_params(self):
+        # GEMM16_INPLACE is tested by the C implementation in gemm16.c.
+        return [('GEMM16_INPLACE', '1' if self.inplace else '0')]
+    @staticmethod
+    def cubin_to_code(name):
+        """
+        Return C source defining ``bin_<name>``, a char array holding the
+        bytes of the ``hgemm_<name>.cubin`` file found in nerv.cubin_path.
+        """
+        fname = 'hgemm_{0}.cubin'.format(name)
+        # cubin files are binary: read them in binary mode so that no
+        # newline translation or text decoding can alter the bytes.
+        with open(os.path.join(nerv.cubin_path, fname), 'rb') as f:
+            cubin = f.read()
+        # Iterating a bytearray yields integers on Python 2 and Python 3.
+        bcode = ','.join(hex(c) for c in bytearray(cubin))
+        return "static const char bin_%s[] = { %s };" % (name, bcode)
+    @staticmethod
+    def init_gpukernel(name, fail):
+        """
+        Return C code that initializes ``k_<name>`` from ``bin_<name>``
+        and runs `fail` if the initialization does not succeed.
+        """
+        return """
+bcode = bin_%(name)s;
+sz = sizeof(bin_%(name)s);
+if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
+                   "hgemm_%(name)s", 13, types, GA_USE_BINARY, NULL)
+    != GA_NO_ERROR) {
+  PyErr_SetString(PyExc_RuntimeError, "Could not initialize kernel %(name)s");
+  %(fail)s;
+}
+""" % dict(name=name, fail=fail)
+    def c_support_code(self):
+        # The binary of every kernel is embedded in the generated source.
+        codel = []
+        for name in self.KERN_NAMES:
+            codel.append(Gemm16.cubin_to_code(name))
+        return '\n'.join(codel)
+    def c_support_code_struct(self, node, nodename):
+        # The kernel objects must be declared before the code coming
+        # from gemm16.c, which refers to them.
+        codel = []
+        for name in self.KERN_NAMES:
+            codel.append("GpuKernel k_{0};".format(name))
+        codel.append(super(Gemm16, self).c_support_code_struct(node, nodename))
+        return '\n'.join(codel)
+    def c_init_code_struct(self, node, nodename, sub):
+        codel = [super(Gemm16, self).c_init_code_struct(node, nodename, sub)]
+        # Zero every kernel first so that all of them are in a known
+        # state even if the initialization of one of them fails.
+        for name in self.KERN_NAMES:
+            codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
+        codel.append("const char *bcode;")
+        codel.append("size_t sz;")
+        codel.append("PyGpuContextObject *c = pygpu_default_context();")
+        # Argument types of the kernels, in the order used by gemm16().
+        codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
+                     "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
+                     "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
+        for name in self.KERN_NAMES:
+            codel.append(self.init_gpukernel(name, sub['fail']))
+        return '\n'.join(codel)
+    def c_cleanup_code_struct(self, node, nodename):
+        codel = []
+        for name in self.KERN_NAMES:
+            codel.append("GpuKernel_clear(&k_{0});".format(name))
+        return '\n'.join(codel)
+@opt.register_opt()
+@opt.op_lifter([tensor.Dot])
+def local_dot_to_gemm16(node):
+    """
+    Replace a Dot of two float16 matrices by Gemm16 with alpha=1.0 and
+    beta=0.0, writing into a freshly built GpuAllocEmpty output.
+    Returns None for every other Dot.
+    """
+    A, B = node.inputs[0], node.inputs[1]
+    both_matrices = A.ndim == 2 and B.ndim == 2
+    if not (both_matrices and
+            A.dtype == 'float16' and B.dtype == 'float16'):
+        return None
+    fgraph = node.inputs[0].fgraph
+    rows = shape_i(A, 0, fgraph)
+    cols = shape_i(B, 1, fgraph)
+    C = GpuAllocEmpty(dtype='float16')(rows, cols)
+    return Gemm16()(C, 1.0, A, B, 0.0)
+@opt.register_opt()
+@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2)
+def local_gemm16_alpha_merge(node, *inputs):
+    """
+    Maker for alpha_merge: rebuild a (non-inplace) Gemm16 with the same
+    relu setting as `node.op`, applied to the merged `inputs`.
+    """
+    return [Gemm16(relu=node.op.relu)(*inputs)]
+@opt.register_opt()
+@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2)
+def local_gemm16_output_merge(node, *inputs):
+    """
+    Maker for output_merge: rebuild a (non-inplace) Gemm16 with the same
+    relu setting as `node.op`, applied to the merged `inputs`.
+    """
+    return [Gemm16(relu=node.op.relu)(*inputs)]
+@local_optimizer([Gemm16], inplace=True)
+def local_gemm16_inplace(node):
+    """
+    Replace a non-inplace Gemm16 by its inplace version.
+    When the C input comes from a GpuAllocEmpty that has more than one
+    client, a separate allocation is built for this node first.
+    """
+    op = node.op
+    if type(op) != Gemm16 or op.inplace:
+        return
+    new_inputs = list(node.inputs)
+    C = new_inputs[0]
+    alloc_node = C.owner
+    shared_alloc = (alloc_node and
+                    isinstance(alloc_node.op, GpuAllocEmpty) and
+                    len(C.clients) > 1)
+    if shared_alloc:
+        new_inputs[0] = alloc_node.op(*alloc_node.inputs)
+    inplace_op = Gemm16(relu=op.relu, inplace=True)
+    return [inplace_op(*new_inputs)]
+# Register the in-place rewrite in optdb at position 70.0, wrapped with
+# in2out, under the 'fast_run', 'inplace' and 'gpuarray' tags.
+optdb.register('local_gemm16_inplace',
+               tensor.opt.in2out(local_gemm16_inplace,
+                                 name='local_gemm16_inplace'),
+               70.0, 'fast_run', 'inplace', 'gpuarray')
--- a/theano/sandbox/gpuarray/opt_util.py
+++ b/theano/sandbox/gpuarray/opt_util.py
+from functools import wraps
+import numpy
+from theano import scalar as scal, Constant
+from theano.gof import local_optimizer
+from theano.tensor import (DimShuffle, get_scalar_constant_value,
+                           NotScalarConstantError)
+from .basic_ops import GpuFromHost, HostFromGpu, host_from_gpu
+from .elemwise import GpuDimShuffle, GpuElemwise
+_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
+def grab_cpu_scalar(v, nd):
+    """
+    Return the scalar that was broadcast to `nd` dimensions to build
+    `v`, or None if `v` does not have that form.
+    Handles a (Gpu)DimShuffle adding `nd` broadcastable dimensions, a
+    GpuFromHost around such a value, and an ownerless Constant whose
+    `nd` dimensions are all broadcastable.
+    """
+    n = v.owner
+    if n is None:
+        if isinstance(v, Constant) and v.broadcastable == (True,) * nd:
+            return v.dimshuffle(())
+        return None
+    all_new_axes = ('x',) * nd
+    op = n.op
+    if isinstance(op, GpuDimShuffle) and op.new_order == all_new_axes:
+        # The scalar is on the GPU: transfer it back to the host.
+        return host_from_gpu(n.inputs[0])
+    if isinstance(op, DimShuffle) and op.new_order == all_new_axes:
+        return n.inputs[0]
+    if isinstance(op, GpuFromHost):
+        # Look through the transfer.
+        return grab_cpu_scalar(n.inputs[0], nd=nd)
+    return None
+def find_node(v, cls, ignore_clients=False):
+    """
+    Return the node producing `v` if its op is an instance of `cls`,
+    looking through a GpuFromHost(HostFromGpu(...)) round trip, or None
+    if no such node is found.
+    """
+    # This digs through possibly redundant transfers to look for the
+    # node that has the op class specified. If ignore_clients is False
+    # (the default) it will only dig through nodes that have a single
+    # client.
+    # v.clients is a list, so its length is what must be compared to 1.
+    if v.owner is not None and (ignore_clients or len(v.clients) == 1):
+        if isinstance(v.owner.op, cls):
+            return v.owner
+        elif (isinstance(v.owner.op, GpuFromHost) and
+              v.owner.inputs[0].owner is not None and
+              isinstance(v.owner.inputs[0].owner.op, HostFromGpu)):
+            # Keep the caller's choice about clients while digging.
+            return find_node(v.owner.inputs[0].owner.inputs[0], cls,
+                             ignore_clients=ignore_clients)
+        else:
+            return None
+    return None
+def is_equal(var, val):
+    """
+    Tell whether `var` is a scalar constant equal to the python value
+    `val`.  Returns False when `var` is not a scalar constant.
+    """
+    try:
+        const_value = get_scalar_constant_value(var)
+    except NotScalarConstantError:
+        return False
+    return const_value == val
+def alpha_merge(cls, alpha_in, beta_in, nd):
+    """
+    Decorator factory building a local optimizer that merges the
+    multiplication of a `cls` node by a scalar into that node.
+    `alpha_in` and `beta_in` are the positions of the two inputs that
+    get multiplied by the scalar; `nd` is the number of dimensions the
+    scalar was broadcast to.  The decorated function is called as
+    ``maker(node, *new_inputs)`` and its result is returned.
+    """
+    def wrapper(maker):
+        @local_optimizer([GpuElemwise])
+        @wraps(maker)
+        def opt(node):
+            is_binary_mul = (isinstance(node.op, GpuElemwise) and
+                             node.op.scalar_op == scal.mul and
+                             node.nin == 2)
+            if not is_binary_mul:
+                return None
+            # Either operand can be the target; the other one has to be
+            # the broadcast scalar.
+            targ = find_node(node.inputs[0], cls)
+            if targ is not None:
+                lr = grab_cpu_scalar(node.inputs[1], nd=nd)
+            else:
+                targ = find_node(node.inputs[1], cls)
+                lr = grab_cpu_scalar(node.inputs[0], nd=nd)
+            if lr is None or targ is None:
+                return None
+            inputs = list(targ.inputs)
+            for pos in (alpha_in, beta_in):
+                inputs[pos] = lr * targ.inputs[pos]
+            return maker(targ, *inputs)
+        return opt
+    return wrapper
+def output_merge(cls, alpha_in, beta_in, out_in, nd):
+    """
+    Decorator factory building a local optimizer that merges the
+    addition of a `cls` node and another value W into that node: W
+    becomes the input at position `out_in` and the input at position
+    `beta_in` becomes one.  Only applies when the current beta is the
+    constant 0 and W has the broadcastable pattern of the current
+    output input.  The decorated function is called as
+    ``maker(node, *new_inputs)`` and its result is returned.
+    """
+    def wrapper(maker):
+        @local_optimizer([GpuElemwise])
+        @wraps(maker)
+        def opt(node):
+            is_binary_add = (isinstance(node.op, GpuElemwise) and
+                             node.op.scalar_op == scal.add and
+                             node.nin == 2)
+            if not is_binary_add:
+                return None
+            # Either operand can be the target; the other one is W.
+            targ, W = find_node(node.inputs[0], cls), node.inputs[1]
+            if targ is None:
+                targ, W = find_node(node.inputs[1], cls), node.inputs[0]
+            if targ is None:
+                return None
+            if not is_equal(targ.inputs[beta_in], 0.0):
+                # other cases are too complex for now
+                return None
+            if W.broadcastable != targ.inputs[out_in].broadcastable:
+                # Would need to explicitly tile the output to fill
+                # the full shape here.  Disable for now.
+                return None
+            inputs = list(targ.inputs)
+            inputs[out_in] = W
+            inputs[beta_in] = _one.clone()
+            return maker(targ, *inputs)
+        return opt
+    return wrapper
--- a/theano/sandbox/gpuarray/pycuda_helper.py
+++ b/theano/sandbox/gpuarray/pycuda_helper.py
+try:
+    from pycuda.driver import Context
+    if not hasattr(Context, 'attach'):
+        raise ImportError('too old')
+except ImportError:
+    Context = None
+pycuda_initialized = False
+pycuda_context = None
+def ensure_pycuda_context():
+    """
+    Return the PyCUDA context, attaching it on the first call.
+    The first call also registers the detach of the context with
+    atexit.  Raises RuntimeError if PyCUDA is missing or has no
+    Context.attach.
+    """
+    global pycuda_context, pycuda_initialized
+    if pycuda_initialized:
+        return pycuda_context
+    if Context is None:
+        raise RuntimeError("PyCUDA not found or too old.")
+    pycuda_context = Context.attach()
+    import atexit
+    atexit.register(pycuda_context.detach)
+    pycuda_initialized = True
+    return pycuda_context
--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
@@ -40,6 +40,7 @@ from ..type import (GpuArrayType,
 from ..basic_ops import (
    host_from_gpu, gpu_from_host,
    gpu_alloc, GpuAlloc,
+    GpuAllocEmpty,
    gpu_from_cuda,
    cuda_from_gpu, HostFromGpu,
    GpuContiguous,
@@ -309,6 +310,25 @@ class TestAlloc(test_basic.TestAlloc):
    allocs = [GpuAlloc(), GpuAlloc(), T.Alloc()]
+def test_alloc_empty():
+    """
+    Check the shape and dtype of GpuAllocEmpty outputs, that it compiles
+    to a single node, and that two identical allocations end up as one
+    GpuAllocEmpty node in the compiled graph.
+    """
+    for dt in ['float32', 'int8']:
+        f = theano.function([], GpuAllocEmpty(dt)(2, 3))
+        assert len(f.maker.fgraph.apply_nodes) == 1
+        out = f()
+        assert out.shape == (2, 3)
+        assert out.dtype == dt
+    twin_outputs = [GpuAllocEmpty('uint64')(3, 2),
+                    GpuAllocEmpty('uint64')(3, 2)]
+    f = theano.function([], twin_outputs)
+    out = f()
+    for idx in (0, 1):
+        assert out[idx].shape == (3, 2)
+        assert out[idx].dtype == 'uint64'
+    alloc_nodes = [node for node in f.maker.fgraph.apply_nodes
+                   if isinstance(node.op, GpuAllocEmpty)]
+    assert len(alloc_nodes) == 1
 def test_shape():
    x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
    v = gpuarray.zeros((3, 4, 5), dtype='float32')

--- a/theano/sandbox/gpuarray/tests/test_nerv.py
+++ b/theano/sandbox/gpuarray/tests/test_nerv.py
+from nose.plugins.skip import SkipTest
+import numpy
+from theano import function
+from theano.tests import unittest_tools as utt
+from theano.tensor import vector, matrix, dot
+from .test_basic_ops import mode_with_gpu
+from ..nerv import Gemm16, nerv
+def test_gemm16_swap():
+    """
+    Check that only a float16 matrix x matrix dot is replaced by Gemm16.
+    """
+    if nerv is None:
+        raise SkipTest("nervanagpu not available")
+    def count_gemm16(fn):
+        # Number of Gemm16 nodes in the compiled graph of fn.
+        return len([node for node in fn.maker.fgraph.apply_nodes
+                    if isinstance(node.op, Gemm16)])
+    v = vector(dtype='float16')
+    m = matrix(dtype='float16')
+    m2 = matrix(dtype='float16')
+    m32 = matrix(dtype='float32')
+    # test that we don't try to replace anything but matrix x matrix in float16
+    f = function([v, m], dot(v, m), mode=mode_with_gpu)
+    assert count_gemm16(f) == 0
+    f = function([m32, m], dot(m32, m), mode=mode_with_gpu)
+    assert count_gemm16(f) == 0
+    f = function([m, m2], dot(m, m2), mode=mode_with_gpu)
+    assert count_gemm16(f) == 1
+def test_gemm16_value():
+    """
+    Compare a float16 dot compiled in GPU mode with numpy.dot on the
+    same random inputs.
+    """
+    if nerv is None:
+        raise SkipTest("nervanagpu not available")
+    lhs = matrix(dtype='float16')
+    rhs = matrix(dtype='float16')
+    f = function([lhs, rhs], dot(lhs, rhs), mode=mode_with_gpu)
+    v1 = numpy.random.random((3, 4)).astype('float16')
+    v2 = numpy.random.random((4, 2)).astype('float16')
+    gpu_result = f(v1, v2)
+    expected = numpy.dot(v1, v2)
+    utt.assert_allclose(gpu_result, expected)
--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -2401,14 +2401,9 @@ class Alloc(gof.Op):
    """
    __props__ = ()
-    def make_node(self, value, *shape):
+    def validate_shape(self, shape):
-        v = as_tensor_variable(value)
        sh = [as_tensor_variable(s) for s in shape]
        bcast = []
-        if v.ndim > len(sh):
-            raise TypeError("The Alloc value to use has more dimensions"
-                            " than the specified dimensions",
-                            v.ndim, len(sh))
        for i, s in enumerate(sh):
            if s.type.dtype[:3] not in ('int', 'uin'):
                if config.exception_verbosity == 'high':
@@ -2424,8 +2419,17 @@ class Alloc(gof.Op):
            except NotScalarConstantError:
                const_shp = None
            bcast.append(numpy.all(1 == const_shp))
+        return sh, bcast
+    def make_node(self, value, *shape):
+        v = as_tensor_variable(value)
+        sh, bcast = self.validate_shape(shape)
+        if v.ndim > len(sh):
+            raise TypeError("The Alloc value to use has more dimensions"
+                            " than the specified dimensions",
+                            v.ndim, len(sh))
        otype = TensorType(dtype=v.dtype, broadcastable=bcast)
-        return gof.Apply(self, ([v] + sh), [otype()])
+        return gof.Apply(self, [v] + sh, [otype()])
    def perform(self, node, inputs, out_):
        out, = out_