提交 9eeafd89 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Update nerv ops for type context.

This unfortunately renders pycuda unusable without a significant amount of work which I am not ready to do right now. Since we have C code for this op I just removed the python code.
上级 6c4bdd7d
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
/* Why do we need this? */ /* Why do we need this? */
size_t dim = 2048 * 32; size_t dim = 2048 * 32;
rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, pygpu_default_context(), rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, CONTEXT,
Py_None); Py_None);
if (rand_buf == NULL) { if (rand_buf == NULL) {
FAIL; FAIL;
......
...@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp ...@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp
from theano.scalar import as_scalar, constant from theano.scalar import as_scalar, constant
from . import opt from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty) from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name)
from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
from .pycuda_helper import ensure_pycuda_context
try: try:
from nervanagpu.nervanagpu import GPUTensor, NervanaGPU from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
...@@ -43,6 +43,7 @@ def ensure_float(val, name): ...@@ -43,6 +43,7 @@ def ensure_float(val, name):
class Gemm16(COp): class Gemm16(COp):
__props__ = ('relu', 'inplace') __props__ = ('relu', 'inplace')
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32', KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32', 'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
'tn_128x128', 'tn_128x64', 'tn_128x32', 'tn_128x128', 'tn_128x64', 'tn_128x32',
...@@ -61,10 +62,11 @@ class Gemm16(COp): ...@@ -61,10 +62,11 @@ class Gemm16(COp):
def make_node(self, C, alpha, A, B, beta): def make_node(self, C, alpha, A, B, beta):
if GPUTensor is None: if GPUTensor is None:
raise RuntimeError("Can't use Gemm16: nervanagpu not found") raise RuntimeError("Can't use Gemm16: nervanagpu not found")
ctx_name = infer_context_name(C, A, B)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B) B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C) C = as_gpuarray_variable(C, ctx_name)
alpha = ensure_float(alpha, 'alpha') alpha = ensure_float(alpha, 'alpha')
beta = ensure_float(beta, 'beta') beta = ensure_float(beta, 'beta')
...@@ -73,27 +75,8 @@ class Gemm16(COp): ...@@ -73,27 +75,8 @@ class Gemm16(COp):
return Apply(self, [C, alpha, A, B, beta], [C.type()]) return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs): def get_context(self, node):
ensure_pycuda_context() return node.inputs[0].type.context
C, alpha, A, B, beta = inputs
# The nervana code does not support the case where both inputs
are trans, so we need to copy one of them if that is the
# case. We copy the smaller one.
if A.flags.f_contiguous and B.flags.f_contiguous:
if A.size < B.size:
A = A.copy()
else:
B = B.copy()
inplace = self.inplace
if inplace and not C.flags.c_contiguous:
inplace = False
if not inplace:
C = C.copy()
At = to_gputensor(A)
Bt = to_gputensor(B)
Ct = to_gputensor(C)
nerv.dot(At, Bt, Ct, alpha=alpha, beta=beta, relu=False)
outputs[0][0] = C
def c_headers(self): def c_headers(self):
return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h', return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
...@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz, ...@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name)) codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
codel.append("const char *bcode;") codel.append("const char *bcode;")
codel.append("size_t sz;") codel.append("size_t sz;")
codel.append("PyGpuContextObject *c = pygpu_default_context();") codel.append("PyGpuContextObject *c = %s;" % (sub['context'],))
codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, " codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
"GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, " "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
"GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};") "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
...@@ -170,7 +153,8 @@ def local_dot_to_gemm16(node): ...@@ -170,7 +153,8 @@ def local_dot_to_gemm16(node):
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = node.inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16')( ctx_name = infer_context_name(A, B)
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
# Guarded import: PyCUDA is optional, and we additionally require a version
# new enough to provide Context.attach(). On any failure we record the
# absence by leaving Context as None.
try:
    from pycuda.driver import Context
    if not hasattr(Context, 'attach'):
        raise ImportError('too old')
except ImportError:
    Context = None

# Module-level lazy-init state: whether we have attached yet, and the
# attached context object (None until first successful call).
pycuda_initialized = False
pycuda_context = None


def ensure_pycuda_context():
    """Attach to the current PyCUDA context on first use and return it.

    Subsequent calls return the cached context. The matching ``detach`` is
    registered with ``atexit`` so the attachment is released at shutdown.

    Raises
    ------
    RuntimeError
        If PyCUDA is not installed or lacks ``Context.attach``.
    """
    global pycuda_context, pycuda_initialized
    if pycuda_initialized:
        return pycuda_context
    if Context is None:
        raise RuntimeError("PyCUDA not found or too old.")
    pycuda_context = Context.attach()
    import atexit
    atexit.register(pycuda_context.detach)
    pycuda_initialized = True
    return pycuda_context
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论