提交 3cb88534 作者: Arnaud Bergeron

Multiple changes in gpuarray/basic_ops.py:

- Convert GpuKernelBase to use contexts. - Start using an explicit context for all the ops. This may break code outside of basic_ops.py, since a context is now sometimes required, but everything else should keep working.
上级 cafe03af
...@@ -5,11 +5,13 @@ import numpy ...@@ -5,11 +5,13 @@ import numpy
from theano import Op, Apply, Type, Variable from theano import Op, Apply, Type, Variable
from theano import tensor, config from theano import tensor, config
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.tensor.basic import Alloc, Join, Split from theano.tensor.basic import Alloc, Join, Split, as_tensor_variable
from theano.gof import HideC from theano.gof import HideC
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.compat import PY3
from collections import deque
from six import string_types from six import string_types
from six.moves import xrange from six.moves import xrange
...@@ -23,23 +25,49 @@ from .type import GpuArrayType, gpu_context_type, get_context ...@@ -23,23 +25,49 @@ from .type import GpuArrayType, gpu_context_type, get_context
from .fp16_help import write_w from .fp16_help import write_w
def as_gpuarray_variable(x): def as_gpuarray_variable(x, context_name):
# This is a pre-optimization to reduce the number of useless
# transfers in the graph and reduce optimization time.
if getattr(x, 'owner', None): if getattr(x, 'owner', None):
if isinstance(x.owner.op, HostFromGpu): if (isinstance(x.owner.op, HostFromGpu) and
x.owner.inputs[0].type.context_name == context_name):
return x.owner.inputs[0] return x.owner.inputs[0]
elif (isinstance(x.owner.op, GpuFromHost) and elif (isinstance(x.owner.op, GpuFromHost) and
x.owner.inputs[0].owner and x.owner.inputs[0].owner and
isinstance(x.owner.inputs[0].owner.op, HostFromGpu)): isinstance(x.owner.inputs[0].owner.op, HostFromGpu) and
x.owner.inputs[0].owner.inputs[0].type.context_name == context_name):
return x.owner.inputs[0].owner.inputs[0] return x.owner.inputs[0].owner.inputs[0]
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
return x._as_GpuArrayVariable() return x._as_GpuArrayVariable(context_name)
# TODO we need to have the cuda -> gpu path taken care of.
tensor_x = tensor.as_tensor_variable(x)
return gpu_from_host(tensor_x)
tensor_x = as_tensor_variable(x)
return GpuFromHost(context_name)(tensor_x)
def as_gpuarray(x):
return gpuarray.array(x, copy=False) def infer_context_name(*vars):
"""
Infer the context name to use from the inputs given
"""
# We try to infer the closest context first
# TODO: What to do in case of context conflicts?
# We currently use a first found wins approach.
todo = deque()
todo.extendleft(vars)
while todo:
v = todo.pop()
if isinstance(v.type, GpuArrayType):
return v.type.context_name
if hasattr(v.tag, 'context_name'):
return v.tag.context_name
if v.owner:
if isinstance(v.owner.op, HostFromGpu):
return v.owner.inputs[0].type.context_name
if len(v.owner.inputs) == 1:
todo.extendleft(v.owner.inputs)
# If we can't find a context we infer None, which is the default
return None
class Kernel(object): class Kernel(object):
...@@ -111,10 +139,12 @@ class Kernel(object): ...@@ -111,10 +139,12 @@ class Kernel(object):
class GpuKernelBase(object): class GpuKernelBase(object):
context_type = gpu_context_type
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
""" """
This is the method to override. This should return an iterable of Kernel This is the method to override. This should return an iterable
objects that describe the kernels this op will need. of Kernel objects that describe the kernels this op will need.
""" """
raise MethodNotDefined('gpu_kernels') raise MethodNotDefined('gpu_kernels')
...@@ -126,8 +156,9 @@ class GpuKernelBase(object): ...@@ -126,8 +156,9 @@ class GpuKernelBase(object):
o = [] o = []
return o + ['gpuarray/types.h'] return o + ['gpuarray/types.h']
def _generate_kernel_bin(self, k): def _generate_kernel_bin(self, k, ctx):
gk = gpuarray.GpuKernel(k.code, k.name, k.params, **k.flags) gk = gpuarray.GpuKernel(k.code, k.name, k.params, context=ctx,
**k.flags)
bin = gk._binary bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin) bcode = ','.join(hex(ord(c)) for c in bin)
return ("""static const char %(bname)s[] = { %(bcode)s };""" % return ("""static const char %(bname)s[] = { %(bcode)s };""" %
...@@ -153,44 +184,57 @@ class GpuKernelBase(object): ...@@ -153,44 +184,57 @@ class GpuKernelBase(object):
def c_support_code_apply(self, node, name): def c_support_code_apply(self, node, name):
kernels = self.gpu_kernels(node, name) kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels) ctx = self.get_context(node)
bins = '\n'.join(self._generate_kernel_bin(k, ctx) for k in kernels)
codes = '\n'.join(self._generate_kernel_code(k) for k in kernels) codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels) return '\n'.join([bins, codes])
return '\n'.join([bins, codes, vars])
def _generate_kernel_init(self, k, err): def c_support_code_struct(self, node, name):
if PY3: kernels = self.gpu_kernels(node, name)
error_out = "NULL" return '\n'.join(self._generate_kernel_vars(k) for k in kernels)
else:
error_out = "" def _generate_zeros(self, k):
return """memset(&%(v)s, 0, sizeof(%(v)s));""" % dict(v=k.objvar)
def _generate_kernel_init(self, k, fail, ctx):
return """{ return """{
int err;
int types[%(numargs)u] = {%(types)s}; int types[%(numargs)u] = {%(types)s};
const char *bcode = %(bvar)s; const char *bcode = %(bvar)s;
size_t sz = sizeof(%(bvar)s); size_t sz = sizeof(%(bvar)s);
PyGpuContextObject *c = pygpu_default_context(); if (GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1, &bcode, &sz,
if (GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &bcode, &sz, "%(kname)s", "%(kname)s", %(numargs)u, types, GA_USE_BINARY, NULL)
%(numargs)u, types, GA_USE_BINARY, NULL) != GA_NO_ERROR) { != GA_NO_ERROR) {
if ((%(err)s = GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &%(cname)s, if ((%(err)s = GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1,
NULL, "%(kname)s", %(numargs)u, types, &%(cname)s, NULL, "%(kname)s", %(numargs)u,
%(flags)s, NULL)) != GA_NO_ERROR) { types, %(flags)s, NULL)) != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s", PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
%(err)s, Gpu_error(c->ops, c->ctx, %(err)s)); %(err)s, Gpu_error(%(ctx)s->ops, %(ctx)s->ctx, %(err)s));
return %(error_out)s; %(fail)s
} }
} }
}""" % dict(numargs=len(k.params), types=k._get_c_types(), bvar=k.binvar, }""" % dict(numargs=len(k.params), types=k._get_c_types(), bvar=k.binvar,
ovar=k.objvar, kname=k.name, err=err, cname=k.codevar, ovar=k.objvar, kname=k.name, cname=k.codevar,
flags=k._get_c_flags(), error_out=error_out) flags=k._get_c_flags(), fail=fail, ctx=ctx)
def c_init_code_struct(self, node, name, sub):
ctx = sub['context']
kernels = self.gpu_kernels(node, name)
inits_0 = '\n'.join(self._generate_zeros(k) for k in kernels)
inits = '\n'.join(self._generate_kernel_init(k, sub['fail'], ctx)
for k in kernels)
return '\n'.join([inits_0, inits])
def c_init_code_apply(self, node, name): def _generate_kernel_cleanup(self, k):
err = 'err_' + name return "GpuKernel_clear(&%(ovar)s);" % dict(ovar=k.objvar)
def c_cleanup_code_struct(self, node, name):
kernels = self.gpu_kernels(node, name) kernels = self.gpu_kernels(node, name)
inits = '\n'.join(self._generate_kernel_init(k, err) for k in kernels) cleanups = '\n'.join(self._generate_kernel_cleanup(k) for k in kernels)
return ("int %(err)s;\n" % dict(err=err)) + inits return cleanups
def _GpuKernelBase_version(self): def _GpuKernelBase_version(self):
ctx = gpuarray.get_default_context() return (3,)
return (2, ctx.kind, ctx.devname)
GpuKernelBase_version = property(_GpuKernelBase_version) GpuKernelBase_version = property(_GpuKernelBase_version)
...@@ -259,43 +303,51 @@ class HostFromGpu(Op): ...@@ -259,43 +303,51 @@ class HostFromGpu(Op):
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [gpu_from_host(gz)] return [GpuFromHost(inputs[0].type.context_name)(gz)]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
if isinstance(ev, tensor.TensorType): if isinstance(ev, tensor.TensorType):
return [gpu_from_host(ev)] return [GpuFromHost(inputs[0].type.context_name)(ev)]
else: else:
return [ev] return [ev]
def infer_shape(self, node, xshp): def infer_shape(self, node, xshp):
return xshp return xshp
host_from_gpu = HostFromGpu() host_from_gpu = HostFromGpu()
class GpuFromHost(Op): class GpuFromHost(Op):
__props__ = () __props__ = ('context_name',)
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
def __init__(self, context_name):
self.context_name = context_name
def __str__(self): def __str__(self):
return 'GpuFromHost(gpuarray)' return 'GpuFromHost<%s>' % (self.context_name,)
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, tensor.TensorType): if not isinstance(x.type, tensor.TensorType):
raise TypeError(x) raise TypeError(x)
return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable, return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
context_name=self.context_name,
dtype=x.dtype)()]) dtype=x.dtype)()])
def perform(self, node, inp, out): def get_context(self, node):
return get_context(self.context_name)
def perform(self, node, inp, out, ctx):
x, = inp x, = inp
z, = out z, = out
z[0] = gpuarray.array(x) z[0] = gpuarray.array(x, context=ctx)
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [host_from_gpu(as_gpuarray_variable(gz))] return [host_from_gpu(as_gpuarray_variable(
gz, context_name=self.context_name))]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
...@@ -314,24 +366,21 @@ class GpuFromHost(Op): ...@@ -314,24 +366,21 @@ class GpuFromHost(Op):
if (%(name)s_tmp == NULL) if (%(name)s_tmp == NULL)
%(fail)s %(fail)s
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp), %(out)s = pygpu_fromhostdata(PyArray_DATA(%(inp)s),
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)), get_typecode((PyObject *)PyArray_DESCR(%(inp)s)),
PyArray_NDIM(%(name)s_tmp), PyArray_NDIM(%(inp)s),
(size_t *)PyArray_DIMS(%(name)s_tmp), (size_t *)PyArray_DIMS(%(inp)s),
(ssize_t *)PyArray_STRIDES(%(name)s_tmp), (ssize_t *)PyArray_STRIDES(%(inp)s),
pygpu_default_context(), %(ctx)s,
Py_None); Py_None);
if (%(out)s == NULL) {
Py_DECREF(%(name)s_tmp);
if (%(out)s == NULL)
%(fail)s %(fail)s
""" % {'name': name, 'inp': inputs[0], }
""" % {'name': name, 'inp': inputs[0], 'ctx': sub['context'],
'out': outputs[0], 'fail': sub['fail']} 'out': outputs[0], 'fail': sub['fail']}
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (6,)
gpu_from_host = GpuFromHost()
class GpuAlloc(HideC, Alloc): class GpuAlloc(HideC, Alloc):
...@@ -339,28 +388,36 @@ class GpuAlloc(HideC, Alloc): ...@@ -339,28 +388,36 @@ class GpuAlloc(HideC, Alloc):
Parameters Parameters
---------- ----------
memset_0 context : context name
The name of the context in which to allocate memory
memset_0 : bool
This is only an optimization. If True, it means the This is only an optimization. If True, it means the
value is always 0, so the C code calls memset, which is faster. value is always 0, so the C code calls memset, which is faster.
""" """
__props__ = ('memset_0',) __props__ = ('memset_0', 'context_name')
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
def __init__(self, memset_0=False): def __init__(self, context_name, memset_0=False):
self.context_name = context_name
self.memset_0 = memset_0 self.memset_0 = memset_0
def get_context(self, node):
return get_context(self.context_name)
def __str__(self): def __str__(self):
# Hide the memset parameter when not used to prevent confusion. # Hide the memset parameter when not used to prevent confusion.
if self.memset_0: if self.memset_0:
s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0) m = "{memset_0=True}"
else: else:
s = self.__class__.__name__ m = ""
return s return "%s<%s>{memset_0=%s}" % (self.__class__.__name__,
self.context_name, m)
def make_node(self, value, *shape): def make_node(self, value, *shape):
value = as_gpuarray_variable(value) value = as_gpuarray_variable(value, context_name=self.context_name)
sh, bcast = self.validate_shape(shape) sh, bcast = self.validate_shape(shape)
if value.ndim > len(sh): if value.ndim > len(sh):
TypeError("The GpuAlloc value to use has more dimensions " TypeError("The GpuAlloc value to use has more dimensions "
...@@ -371,15 +428,15 @@ class GpuAlloc(HideC, Alloc): ...@@ -371,15 +428,15 @@ class GpuAlloc(HideC, Alloc):
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>'] return ['<numpy_compat.h>']
def perform(self, node, inputs, outs): def perform(self, node, inputs, outs, ctx):
out, = outs out, = outs
v = inputs[0] v = inputs[0]
sh = tuple(map(int, inputs[1:])) sh = tuple(map(int, inputs[1:]))
if out[0] is None or out[0].shape != sh: if out[0] is None or out[0].shape != sh:
if self.memset_0: if self.memset_0:
out[0] = gpuarray.zeros(sh, dtype=v.dtype) out[0] = gpuarray.zeros(sh, dtype=v.dtype, context=ctx)
else: else:
out[0] = gpuarray.empty(sh, dtype=v.dtype) out[0] = gpuarray.empty(sh, dtype=v.dtype, context=ctx)
out[0][...] = v out[0][...] = v
else: else:
out[0][...] = v out[0][...] = v
...@@ -414,7 +471,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -414,7 +471,7 @@ class GpuAlloc(HideC, Alloc):
Py_XDECREF(%(zz)s); Py_XDECREF(%(zz)s);
%(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape, %(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape,
%(vv)s->ga.typecode, GA_C_ORDER, %(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (!%(zz)s) { if (!%(zz)s) {
%(fail)s %(fail)s
} }
...@@ -423,7 +480,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -423,7 +480,7 @@ class GpuAlloc(HideC, Alloc):
Py_XDECREF(%(zz)s); Py_XDECREF(%(zz)s);
%(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape, %(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
%(vv)s->ga.typecode, GA_C_ORDER, %(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (!%(zz)s) { if (!%(zz)s) {
%(fail)s %(fail)s
} }
...@@ -446,7 +503,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -446,7 +503,7 @@ class GpuAlloc(HideC, Alloc):
%(fail)s %(fail)s
} }
} }
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv, """ % dict(name=name, ndim=ndim, zz=zz, vv=vv, ctx=sub['context'],
fail=sub['fail'], memset_0=memset_0) fail=sub['fail'], memset_0=memset_0)
if config.gpuarray.sync: if config.gpuarray.sync:
...@@ -455,7 +512,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -455,7 +512,7 @@ class GpuAlloc(HideC, Alloc):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (3,)
def do_constant_folding(self, node): def do_constant_folding(self, node):
from . import subtensor, blas from . import subtensor, blas
...@@ -488,29 +545,32 @@ class GpuAlloc(HideC, Alloc): ...@@ -488,29 +545,32 @@ class GpuAlloc(HideC, Alloc):
return True return True
gpu_alloc = GpuAlloc()
class GpuAllocEmpty(HideC, Alloc): class GpuAllocEmpty(HideC, Alloc):
__props__ = ('dtype',) __props__ = ('dtype', 'context_name')
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
def __init__(self, dtype): def __init__(self, dtype, context_name):
self.dtype = dtype self.dtype = dtype
self.context_name = context_name
def get_context(self, node):
return get_context(self.context_name)
def make_node(self, *shape): def make_node(self, *shape):
sh, bcast = self.validate_shape(shape) sh, bcast = self.validate_shape(shape)
output = GpuArrayType(dtype=self.dtype, broadcastable=bcast)() output = GpuArrayType(dtype=self.dtype, broadcastable=bcast,
context_name=self.context_name)()
output.tag.values_eq_approx = tensor.type.values_eq_approx_always_true output.tag.values_eq_approx = tensor.type.values_eq_approx_always_true
# The output can contain nan/inf. # The output can contain nan/inf.
output.type.filter_checks_isfinite = False output.type.filter_checks_isfinite = False
return Apply(self, sh, [output]) return Apply(self, sh, [output])
def perform(self, node, inputs, out_): def perform(self, node, inputs, out_, ctx):
out = out_[0] out = out_[0]
sh = [int(i) for i in inputs] sh = [int(i) for i in inputs]
if out[0] is None or out[0].shape != sh: if out[0] is None or out[0].shape != sh:
out[0] = pygpu.empty(sh, dtype=self.dtype) out[0] = pygpu.empty(sh, dtype=self.dtype, context=ctx)
# if out[0] is the right shape, we just return it # if out[0] is the right shape, we just return it
def c_headers(self): def c_headers(self):
...@@ -536,16 +596,16 @@ shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0]; ...@@ -536,16 +596,16 @@ shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
code.append(""" code.append("""
if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER, if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
pygpu_default_context())) { %(ctx)s)) {
%(fail)s %(fail)s
} }
""" % dict(zz=zz, ndim=ndim, type=gpuarray.dtype_to_typecode(self.dtype), """ % dict(zz=zz, ndim=ndim, type=gpuarray.dtype_to_typecode(self.dtype),
fail=fail)) fail=fail, ctx=sub['context']))
return ''.join(code) return ''.join(code)
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1,)
def do_constant_folding(self, node): def do_constant_folding(self, node):
return False return False
...@@ -559,7 +619,7 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER, ...@@ -559,7 +619,7 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
def empty_like(var): def empty_like(var):
return GpuAllocEmpty(var.type.dtype)(*var.shape) return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
class GpuContiguous(Op): class GpuContiguous(Op):
...@@ -568,7 +628,6 @@ class GpuContiguous(Op): ...@@ -568,7 +628,6 @@ class GpuContiguous(Op):
not already c contiguous. not already c contiguous.
""" """
__props__ = () __props__ = ()
view_map = {0: [0]} view_map = {0: [0]}
_f16_ok = True _f16_ok = True
...@@ -576,12 +635,13 @@ class GpuContiguous(Op): ...@@ -576,12 +635,13 @@ class GpuContiguous(Op):
def grad(self, inputs, dout): def grad(self, inputs, dout):
x, = inputs x, = inputs
dout, = dout dout, = dout
dout = as_gpuarray_variable(dout) dout = as_gpuarray_variable(dout, context_name=infer_context_name(x))
return [dout] return [dout]
def make_node(self, input): def make_node(self, input):
input = as_gpuarray_variable(input) input = as_gpuarray_variable(input,
context_name=infer_context_name(input))
return Apply(self, [input], [input.type()]) return Apply(self, [input], [input.type()])
def c_headers(self): def c_headers(self):
...@@ -633,10 +693,12 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -633,10 +693,12 @@ class GpuReshape(HideC, tensor.Reshape):
# __hash__, __eq__, __str__ come from tensor.Reshape # __hash__, __eq__, __str__ come from tensor.Reshape
def make_node(self, x, shp): def make_node(self, x, shp):
x = as_gpuarray_variable(x) ctx_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name=ctx_name)
res = host_from_gpu(x).reshape(shp, ndim=self.ndim) res = host_from_gpu(x).reshape(shp, ndim=self.ndim)
otype = GpuArrayType(dtype=res.dtype, otype = GpuArrayType(dtype=res.dtype,
broadcastable=res.broadcastable) broadcastable=res.broadcastable,
context_name=ctx_name)
return Apply(self, [x, shp], [otype()]) return Apply(self, [x, shp], [otype()])
def perform(self, node, inp, out_): def perform(self, node, inp, out_):
...@@ -744,22 +806,30 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -744,22 +806,30 @@ class GpuReshape(HideC, tensor.Reshape):
class GpuJoin(HideC, Join): class GpuJoin(HideC, Join):
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
def make_node(self, axis, *tensors): def make_node(self, axis, *tensors):
node = Join.make_node(self, axis, *tensors) node = Join.make_node(self, axis, *tensors)
return Apply(self, [node.inputs[0]] + list(map(as_gpuarray_variable, ctx_name = infer_context_name(*tensors)
tensors)),
def agv(v):
return as_gpuarray_variable(v, context_name=ctx_name)
return Apply(self, [node.inputs[0]] + list(map(agv, tensors)),
[GpuArrayType(broadcastable=node.outputs[0].broadcastable, [GpuArrayType(broadcastable=node.outputs[0].broadcastable,
dtype=node.outputs[0].dtype)()]) dtype=node.outputs[0].dtype,
context_name=ctx_name)()])
def get_context(self, node):
return node.outputs[0].type.context
def perform(self, node, axis_and_tensors, out_): def perform(self, node, axis_and_tensors, out_, ctx):
out, = out_ out, = out_
axis = int(axis_and_tensors[0]) axis = int(axis_and_tensors[0])
tensors = axis_and_tensors[1:] tensors = axis_and_tensors[1:]
out[0] = pygpu.concatenate(tensors, axis=axis).astype( out[0] = pygpu.concatenate(tensors, axis=axis, context=ctx).astype(
node.outputs[0].dtype) node.outputs[0].dtype)
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -793,15 +863,14 @@ if (axis < 0) { ...@@ -793,15 +863,14 @@ if (axis < 0) {
} }
%(out)s = pygpu_concatenate(als, %(n)s, axis, %(out)s = pygpu_concatenate(als, %(n)s, axis,
%(restype)s, (PyObject *)&PyGpuArrayType, %(restype)s, (PyObject *)&PyGpuArrayType,
pygpu_default_context()); %(ctx)s);
} }
PyMem_Free(als); PyMem_Free(als);
if (%(out)s == NULL) if (%(out)s == NULL)
%(fail)s %(fail)s
""" % dict(n=len(inputs[1:]), fail=sub['fail'], out=out_[0], """ % dict(n=len(inputs[1:]), fail=sub['fail'], out=out_[0],
axis=inputs[0], copy_inputs_to_list='\n'.join(copy_to_list), axis=inputs[0], copy_inputs_to_list='\n'.join(copy_to_list),
restype=restype) restype=restype, ctx=sub['context'])
gpu_join = GpuJoin() gpu_join = GpuJoin()
...@@ -809,21 +878,26 @@ gpu_join = GpuJoin() ...@@ -809,21 +878,26 @@ gpu_join = GpuJoin()
class GpuSplit(HideC, Split): class GpuSplit(HideC, Split):
def make_node(self, x, axis, splits): def make_node(self, x, axis, splits):
node = Split.make_node(self, x, axis, splits) node = Split.make_node(self, x, axis, splits)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x, infer_context_name(x))
outs = [GpuArrayType(dtype=o.dtype, broadcastable=o.broadcastable)() outs = [GpuArrayType(dtype=o.dtype, broadcastable=o.broadcastable,
context_name=x.type.context_name)()
for o in node.outputs] for o in node.outputs]
return Apply(self, [x] + node.inputs[1:], outs) return Apply(self, [x] + node.inputs[1:], outs)
# we reuse the perform of the CPU op, which is suitable # we reuse the perform of the CPU op, which is suitable
class GpuEye(GpuKernelBase, Op): class GpuEye(GpuKernelBase, Op):
__props__ = ('dtype',) __props__ = ('dtype', 'context_name')
_f16_ok = True _f16_ok = True
def __init__(self, dtype=None): def __init__(self, dtype=None, context_name=None):
if dtype is None: if dtype is None:
dtype = config.floatX dtype = config.floatX
self.dtype = dtype self.dtype = dtype
self.context_name = context_name
def get_context(self, node):
return get_context(self.context_name)
def make_node(self, n, m, k): def make_node(self, n, m, k):
n = tensor.as_tensor_variable(n) n = tensor.as_tensor_variable(n)
...@@ -833,7 +907,8 @@ class GpuEye(GpuKernelBase, Op): ...@@ -833,7 +907,8 @@ class GpuEye(GpuKernelBase, Op):
assert m.ndim == 0 assert m.ndim == 0
assert k.ndim == 0 assert k.ndim == 0
otype = GpuArrayType(dtype=self.dtype, otype = GpuArrayType(dtype=self.dtype,
broadcastable=(False, False)) broadcastable=(False, False),
context_name=self.context_name)
# k != 0 isn't implemented on the GPU yet. # k != 0 isn't implemented on the GPU yet.
assert tensor.get_scalar_constant_value(k) == 0 assert tensor.get_scalar_constant_value(k) == 0
...@@ -866,6 +941,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -866,6 +941,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
n, m = inp n, m = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
ctx = sub['context']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype) typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync) sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar kname = self.gpu_kernels(node, name)[0].objvar
...@@ -882,7 +958,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -882,7 +958,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
%(z)s = pygpu_zeros(2, dims, %(z)s = pygpu_zeros(2, dims,
%(typecode)s, %(typecode)s,
GA_C_ORDER, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (%(z)s == NULL) { if (%(z)s == NULL) {
%(fail)s %(fail)s
} }
...@@ -908,4 +984,4 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -908,4 +984,4 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (4, self.GpuKernelBase_version) return (5, self.GpuKernelBase_version)
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论