提交 a28251c6,作者:Arnaud Bergeron

Type context for elemwise.py

上级 58371141
...@@ -20,8 +20,8 @@ try: ...@@ -20,8 +20,8 @@ try:
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, HideC, from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
GpuKernelBase, Kernel) infer_context_name)
from .type import GpuArrayType from .type import GpuArrayType
from .fp16_help import load_w, write_w from .fp16_help import load_w, write_w
...@@ -67,12 +67,14 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -67,12 +67,14 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items) return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)
def make_node(self, *inputs): def make_node(self, *inputs):
ctx_name = infer_context_name(*inputs)
res = Elemwise.make_node(self, *inputs) res = Elemwise.make_node(self, *inputs)
outputs = [GpuArrayType(broadcastable=o.type.broadcastable, outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
context_name=ctx_name,
dtype=o.type.dtype)() for o in res.outputs] dtype=o.type.dtype)() for o in res.outputs]
if len(outputs) > 1: if len(outputs) > 1:
raise NotImplementedError() raise NotImplementedError()
inputs = [as_gpuarray_variable(i) for i in inputs] inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs]
node = Apply(self, inputs, outputs) node = Apply(self, inputs, outputs)
# Try to generate the kernel to catch SupportCodeErrors # Try to generate the kernel to catch SupportCodeErrors
...@@ -99,6 +101,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -99,6 +101,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return node return node
def get_context(self, node):
    """Return the context attached to the first input of ``node``.

    The value is read from ``node.inputs[0].type.context``; no other
    input is consulted.
    NOTE(review): presumably this is the GPU context the kernels of
    this op are built for -- confirm against GpuKernelBase.
    """
    return node.inputs[0].type.context
def generate_kernel(self, node, nodename): def generate_kernel(self, node, nodename):
inps = [make_argument(i, 'i%d' % (n,)) for n, i in inps = [make_argument(i, 'i%d' % (n,)) for n, i in
enumerate(node.inputs)] enumerate(node.inputs)]
...@@ -177,8 +182,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -177,8 +182,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return self.scalar_op.c_support_code() return self.scalar_op.c_support_code()
def _gpu_kernel_code(self, node, nodename): def _gpu_kernel_code(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
# This is useless by itself, but will serve an eventual c_code # This is useless by itself, but will serve an eventual c_code
# implementation # implementation
k = self.generate_kernel(node, nodename) k = self.generate_kernel(node, nodename)
...@@ -191,8 +194,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -191,8 +194,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return '\n'.join(res) return '\n'.join(res)
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
src = self._gpu_kernel_code(node, nodename) src = self._gpu_kernel_code(node, nodename)
nd = node.outputs[0].ndim nd = node.outputs[0].ndim
params = ['uintp'] params = ['uintp']
...@@ -214,12 +215,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -214,12 +215,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
objvar='elem_%d_%s' % (nd, nodename))] objvar='elem_%d_%s' % (nd, nodename))]
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
if pygpu.get_default_context().kind == 'opencl': if node.inputs[0].type.context.kind != 'cuda':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
nd = node.outputs[0].ndim nd = node.outputs[0].ndim
fail = sub["fail"] fail = sub["fail"]
initial_dims = ','.join('1' for i in xrange(nd)) initial_dims = ','.join('1' for i in xrange(nd))
opname = str(self.scalar_op) opname = str(self.scalar_op)
ctx = sub['context']
# check that all inputs have valid dimensions # check that all inputs have valid dimensions
emitted_inames = {} emitted_inames = {}
...@@ -264,7 +266,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -264,7 +266,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
if iname in emitted_inames: if iname in emitted_inames:
continue continue
code += """ code += """
//std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
if (%(nd)s != PyGpuArray_NDIM(%(iname)s)) if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
{ {
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
...@@ -279,7 +280,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -279,7 +280,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
PyGpuArray_DIMS(%(iname)s)[i] == 1)) && PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
(dims[i] != PyGpuArray_DIMS(%(iname)s)[i])) (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
{ {
//std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"GpuElemwise. Input dimension mis-match. Input" "GpuElemwise. Input dimension mis-match. Input"
" %(idx)d (indices start at 0) has shape[%%i] == %%i" " %(idx)d (indices start at 0) has shape[%%i] == %%i"
...@@ -314,15 +314,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -314,15 +314,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
{ {
%(oname)s = pygpu_empty(%(nd)d, dims, %(oname)s = pygpu_empty(%(nd)d, dims,
%(typecode)s, GA_C_ORDER, %(typecode)s, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (!%(oname)s) { if (!%(oname)s) {
//TODO, this check don't seam good. %(fail)s
//TODO, set exception?
%(fail)s
} }
} }
//std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals() """ % locals()
else: else:
input_idx = self.inplace_pattern[idx] input_idx = self.inplace_pattern[idx]
...@@ -348,8 +344,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -348,8 +344,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
%(fail)s; %(fail)s;
} }
} }
//std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals() """ % locals()
z = outputs[0] z = outputs[0]
code += """numEls = PyGpuArray_SIZE(%(z)s); code += """numEls = PyGpuArray_SIZE(%(z)s);
...@@ -367,7 +361,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -367,7 +361,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
if (threads_per_block * n_blocks < numEls) if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (size_t) 256); threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
//std::cerr << "calling callkernel returned\\n";
""" % locals() """ % locals()
kname = 'elem_%d_%s' % (nd, name) kname = 'elem_%d_%s' % (nd, name)
...@@ -588,7 +581,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -588,7 +581,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
quite possible that the GPU might be slower for some cases. quite possible that the GPU might be slower for some cases.
""" """
__props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op',
'pre_scalar_op')
_f16_ok = True _f16_ok = True
def __init__(self, scalar_op, axis=None, def __init__(self, scalar_op, axis=None,
...@@ -607,24 +601,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -607,24 +601,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if pre_scalar_op: if pre_scalar_op:
assert pre_scalar_op.nin == 1 assert pre_scalar_op.nin == 1
def __eq__(self, other):
    """Return True when ``other`` is the same kind of op with equal settings.

    Two ops compare equal when they have the same type and their
    ``axis``, ``reduce_mask``, ``dtype``, ``acc_dtype``, ``scalar_op``
    and ``pre_scalar_op`` attributes all compare equal.
    """
    # The type check comes first so that the attribute lookups below are
    # never attempted on an unrelated object.
    return (type(self) == type(other) and
            self.axis == other.axis and
            self.reduce_mask == other.reduce_mask and
            self.dtype == other.dtype and
            self.acc_dtype == other.acc_dtype and
            self.scalar_op == other.scalar_op and
            self.pre_scalar_op == other.pre_scalar_op)
def __hash__(self):
    """Return a hash consistent with ``__eq__``.

    The hash is built from the op type, ``axis``, ``reduce_mask``,
    ``dtype``, ``acc_dtype`` and the *types* of ``scalar_op`` and
    ``pre_scalar_op``. Only the types of the two scalar ops contribute,
    so ops that compare equal always hash equal.
    """
    # Hash a tuple instead of XOR-ing the individual hashes: with XOR,
    # two fields with equal hashes (e.g. dtype == acc_dtype) cancel each
    # other out and the result no longer depends on either of them.
    return hash((type(self),
                 self.axis,
                 self.reduce_mask,
                 self.dtype,
                 self.acc_dtype,
                 type(self.scalar_op),
                 type(self.pre_scalar_op)))
def __str__(self): def __str__(self):
pre = "" pre = ""
if self.pre_scalar_op: if self.pre_scalar_op:
...@@ -641,7 +617,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -641,7 +617,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
self.pre_scalar_op = None self.pre_scalar_op = None
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x, infer_context_name(x))
if x.type.context.kind != 'cuda':
raise TypeError("GpuCAReduceCuda doesn't work for non-cuda devices")
ret = super(GpuCAReduceCuda, self).make_node(x) ret = super(GpuCAReduceCuda, self).make_node(x)
self = copy.copy(self) self = copy.copy(self)
self.axis = ret.op.axis self.axis = ret.op.axis
...@@ -666,7 +644,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -666,7 +644,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"complex" in self._acc_dtype(x.dtype)): "complex" in self._acc_dtype(x.dtype)):
raise NotImplementedError("We don't support complex in gpu reduction") raise NotImplementedError("We don't support complex in gpu reduction")
return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype, return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
ret.outputs[0].type.broadcastable)()]) ret.outputs[0].type.broadcastable,
context_name=x.type.context_name)()])
def get_context(self, node):
    """Return the context attached to the single input of ``node``.

    The value is read from ``node.inputs[0].type.context``.
    NOTE(review): presumably this is the GPU context the reduction
    kernels are built for -- confirm against GpuKernelBase.
    """
    return node.inputs[0].type.context
def perform(self, node, inp, out):
    """Always raise ``MethodNotDefined``.

    No Python implementation is provided by this method; the arguments
    are accepted but never used.
    """
    raise MethodNotDefined("")
...@@ -1914,7 +1896,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1914,7 +1896,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
version = [17] # the version corresponding to the c code in this Op version = [17] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
version.extend(self.scalar_op.c_code_cache_version()) scalar_node = Apply(
self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))
for i in node.inputs + node.outputs: for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version()) version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
if all(version): if all(version):
...@@ -2676,8 +2662,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2676,8 +2662,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
def make_thunk(self, node, storage_map, compute_map, no_recycling):
    """Build the thunk for ``node`` after calling the kernel cache.

    ``get_kernel_cache(node)`` is invoked first and its return value is
    ignored; the thunk itself comes from the parent class
    ``make_thunk``, called with the same arguments and returned
    unchanged.
    """
    # cache the kernel object
    self.get_kernel_cache(node)
    return super(GpuCAReduceCPY, self).make_thunk(
        node, storage_map, compute_map, no_recycling)
def get_kernel_cache(self, node): def get_kernel_cache(self, node):
attr = '@cache_reduction_k' attr = '@cache_reduction_k'
...@@ -2776,33 +2762,33 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2776,33 +2762,33 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
j += 1 j += 1
code += """ code += """
if (need_out) { if (need_out) {
%(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, pygpu_default_context(), Py_None); %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, %(ctx)s, Py_None);
if (!%(output)s) { if (!%(output)s) {
%(fail)s %(fail)s
} }
} }
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'], """ % dict(output=output, nd_out=nd_out, fail=sub['fail'],
ctx=sub['context'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype)) out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else: else:
code += """ code += """
if (%(output)s == NULL || %(output)s->ga.nd != 0) { if (%(output)s == NULL || %(output)s->ga.nd != 0) {
Py_XDECREF(%(output)s); Py_XDECREF(%(output)s);
%(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER, %(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (!%(output)s) { if (!%(output)s) {
%(fail)s %(fail)s
} }
} }
""" % dict(output=output, fail=sub['fail'], """ % dict(output=output, fail=sub['fail'], ctx=sub['context'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype)) out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype: if acc_dtype != node.outputs[0].type.dtype:
code += """ code += """
tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions, tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
%(acc_type)s, GA_C_ORDER, pygpu_default_context(), %(acc_type)s, GA_C_ORDER, %(ctx)s, Py_None);
Py_None);
if (!tmp) %(fail)s if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], """ % dict(output=output, fail=sub['fail'], ctx=sub['context'],
acc_type=dtype_to_typecode(acc_dtype)) acc_type=dtype_to_typecode(acc_dtype))
else: else:
code += """ code += """
...@@ -2893,7 +2879,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2893,7 +2879,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
reduce_expr = "a * b" reduce_expr = "a * b"
else: else:
raise NotImplementedError() raise NotImplementedError()
return ReductionKernel(pygpu.get_default_context(), odtype, return ReductionKernel(node.inputs[0].type.context, odtype,
self.scalar_op.identity, reduce_expr, redux, self.scalar_op.identity, reduce_expr, redux,
arguments=[make_argument(node.inputs[0], 'a')], arguments=[make_argument(node.inputs[0], 'a')],
init_nd=node.inputs[0].ndim) init_nd=node.inputs[0].ndim)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论