Commit a28251c6 authored by Arnaud Bergeron

Type context for elemwise.py

Parent 58371141
@@ -20,8 +20,8 @@ try:
except ImportError:
pass
from .basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase, Kernel)
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
infer_context_name)
from .type import GpuArrayType
from .fp16_help import load_w, write_w
@@ -67,12 +67,14 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)
def make_node(self, *inputs):
ctx_name = infer_context_name(*inputs)
res = Elemwise.make_node(self, *inputs)
outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
context_name=ctx_name,
dtype=o.type.dtype)() for o in res.outputs]
if len(outputs) > 1:
raise NotImplementedError()
inputs = [as_gpuarray_variable(i) for i in inputs]
inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs]
node = Apply(self, inputs, outputs)
# Try to generate the kernel to catch SupportCodeErrors
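
A minimal sketch of the context-threading pattern this hunk introduces, written against the theano.sandbox.gpuarray module paths of this era. The context name 'dev0' is a hypothetical placeholder assumed to be already bound to a GPU context; the snippet itself is not part of the commit.

from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable,
                                               infer_context_name)
from theano.sandbox.gpuarray.type import GpuArrayType

# Two symbolic matrices that already live on the (hypothetical) context 'dev0'.
x = GpuArrayType('float32', (False, False), context_name='dev0')('x')
y = GpuArrayType('float32', (False, False), context_name='dev0')('y')

# infer_context_name resolves the single context name shared by the inputs;
# as_gpuarray_variable then coerces every input onto that context.
ctx_name = infer_context_name(x, y)
x, y = [as_gpuarray_variable(i, ctx_name) for i in (x, y)]

# Outputs are typed on the same context, mirroring GpuElemwise.make_node above.
out = GpuArrayType(broadcastable=x.type.broadcastable,
                   context_name=ctx_name,
                   dtype=x.type.dtype)()
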
@@ -99,6 +101,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return node
def get_context(self, node):
return node.inputs[0].type.context
def generate_kernel(self, node, nodename):
inps = [make_argument(i, 'i%d' % (n,)) for n, i in
enumerate(node.inputs)]
@@ -177,8 +182,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return self.scalar_op.c_support_code()
def _gpu_kernel_code(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
# This is useless by itself, but will serve an eventual c_code
# implementation
k = self.generate_kernel(node, nodename)
@@ -191,8 +194,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return '\n'.join(res)
def gpu_kernels(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
src = self._gpu_kernel_code(node, nodename)
nd = node.outputs[0].ndim
params = ['uintp']
@@ -214,12 +215,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
objvar='elem_%d_%s' % (nd, nodename))]
def c_code(self, node, name, inputs, outputs, sub):
if pygpu.get_default_context().kind == 'opencl':
if node.inputs[0].type.context.kind != 'cuda':
raise MethodNotDefined('cuda only')
nd = node.outputs[0].ndim
fail = sub["fail"]
initial_dims = ','.join('1' for i in xrange(nd))
opname = str(self.scalar_op)
ctx = sub['context']
# check that all inputs have valid dimensions
emitted_inames = {}
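
The guard that previously lived in _gpu_kernel_code and gpu_kernels moves into c_code and changes meaning: instead of asking the process-wide default context, it asks the context the node's own inputs live on, and the generated C now receives that context through sub['context']. A side-by-side sketch of the two guards (the wrapper function names are illustrative only):

import pygpu
from theano.gof.utils import MethodNotDefined

def old_guard():
    # pre-commit: one global context decided for every apply
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')

def new_guard(node):
    # post-commit: each apply is judged by the context its inputs live on
    if node.inputs[0].type.context.kind != 'cuda':
        raise MethodNotDefined('cuda only')
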
@@ -264,7 +266,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
if iname in emitted_inames:
continue
code += """
//std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
{
PyErr_Format(PyExc_TypeError,
@@ -279,7 +280,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
(dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
{
//std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
PyErr_Format(PyExc_ValueError,
"GpuElemwise. Input dimension mis-match. Input"
" %(idx)d (indices start at 0) has shape[%%i] == %%i"
@@ -314,15 +314,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
{
%(oname)s = pygpu_empty(%(nd)d, dims,
%(typecode)s, GA_C_ORDER,
pygpu_default_context(), Py_None);
%(ctx)s, Py_None);
if (!%(oname)s) {
//TODO, this check don't seam good.
//TODO, set exception?
%(fail)s
%(fail)s
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals()
else:
input_idx = self.inplace_pattern[idx]
@@ -348,8 +344,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
%(fail)s;
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals()
z = outputs[0]
code += """numEls = PyGpuArray_SIZE(%(z)s);
@@ -367,7 +361,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
//std::cerr << "calling callkernel returned\\n";
""" % locals()
kname = 'elem_%d_%s' % (nd, name)
@@ -588,7 +581,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
quite possible that the GPU might be slower for some cases.
"""
__props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op',
'pre_scalar_op')
_f16_ok = True
def __init__(self, scalar_op, axis=None,
@@ -607,24 +601,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if pre_scalar_op:
assert pre_scalar_op.nin == 1
def __eq__(self, other):
return (type(self) == type(other) and
self.axis == other.axis and
self.reduce_mask == other.reduce_mask and
self.dtype == other.dtype and
self.acc_dtype == other.acc_dtype and
self.scalar_op == other.scalar_op and
self.pre_scalar_op == other.pre_scalar_op)
def __hash__(self):
return (hash(type(self)) ^
hash(self.axis) ^
hash(self.reduce_mask) ^
hash(self.dtype) ^
hash(self.acc_dtype) ^
hash(type(self.scalar_op)) ^
hash(type(self.pre_scalar_op)))
def __str__(self):
pre = ""
if self.pre_scalar_op:
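
The hand-written __eq__ and __hash__ removed above are made redundant by the __props__ tuple added a few hunks earlier: Theano's Op base class derives equality and hashing from the attributes named in __props__. A minimal sketch of that mechanism, independent of this commit:

from theano.gof import Op

class DemoOp(Op):
    # __eq__ and __hash__ are generated from these attribute names.
    __props__ = ('axis', 'dtype')

    def __init__(self, axis, dtype):
        self.axis = axis
        self.dtype = dtype

assert DemoOp(0, 'float32') == DemoOp(0, 'float32')
assert hash(DemoOp(1, 'int8')) == hash(DemoOp(1, 'int8'))
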
@@ -641,7 +617,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
self.pre_scalar_op = None
def make_node(self, x):
x = as_gpuarray_variable(x)
x = as_gpuarray_variable(x, infer_context_name(x))
if x.type.context.kind != 'cuda':
raise TypeError("GpuCAReduceCuda doesn't work for non-cuda devices")
ret = super(GpuCAReduceCuda, self).make_node(x)
self = copy.copy(self)
self.axis = ret.op.axis
@@ -666,7 +644,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"complex" in self._acc_dtype(x.dtype)):
raise NotImplementedError("We don't support complex in gpu reduction")
return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
ret.outputs[0].type.broadcastable)()])
ret.outputs[0].type.broadcastable,
context_name=x.type.context_name)()])
def get_context(self, node):
return node.inputs[0].type.context
def perform(self, node, inp, out):
raise MethodNotDefined("")
@@ -1914,7 +1896,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
version = [17] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
version.extend(self.scalar_op.c_code_cache_version())
scalar_node = Apply(
self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))
for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
if all(version):
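
The cache-version hunk swaps a plain c_code_cache_version() call on the scalar op for c_code_cache_version_apply() on a throw-away scalar Apply node, so a scalar op whose generated C depends on the concrete dtypes of this apply can report a different version per dtype signature. A minimal sketch of the idea, assuming the default *_apply implementation simply forwards to c_code_cache_version when an op does not specialize it:

from theano.gof import Apply
from theano.scalar import Scalar

def scalar_cache_version(scalar_op, node):
    # Build a scalar Apply that mirrors the dtypes of the gpuarray apply,
    # then ask the scalar op for a version specific to that signature.
    scalar_node = Apply(
        scalar_op,
        [Scalar(dtype=i.type.dtype)() for i in node.inputs],
        [Scalar(dtype=o.type.dtype)() for o in node.outputs])
    return scalar_op.c_code_cache_version_apply(scalar_node)
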
@@ -2676,8 +2662,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
def make_thunk(self, node, storage_map, compute_map, no_recycling):
# cache the kernel object
self.get_kernel_cache(node)
return super(GpuCAReduceCPY, self).make_thunk(node, storage_map,
compute_map, no_recycling)
return super(GpuCAReduceCPY, self).make_thunk(
node, storage_map, compute_map, no_recycling)
def get_kernel_cache(self, node):
attr = '@cache_reduction_k'
@@ -2776,33 +2762,33 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
j += 1
code += """
if (need_out) {
%(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, pygpu_default_context(), Py_None);
%(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, %(ctx)s, Py_None);
if (!%(output)s) {
%(fail)s
}
}
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
ctx=sub['context'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else:
code += """
if (%(output)s == NULL || %(output)s->ga.nd != 0) {
Py_XDECREF(%(output)s);
%(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
pygpu_default_context(), Py_None);
%(ctx)s, Py_None);
if (!%(output)s) {
%(fail)s
}
}
""" % dict(output=output, fail=sub['fail'],
""" % dict(output=output, fail=sub['fail'], ctx=sub['context'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype:
code += """
tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
%(acc_type)s, GA_C_ORDER, pygpu_default_context(),
Py_None);
%(acc_type)s, GA_C_ORDER, %(ctx)s, Py_None);
if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'],
""" % dict(output=output, fail=sub['fail'], ctx=sub['context'],
acc_type=dtype_to_typecode(acc_dtype))
else:
code += """
@@ -2893,7 +2879,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
reduce_expr = "a * b"
else:
raise NotImplementedError()
return ReductionKernel(pygpu.get_default_context(), odtype,
return ReductionKernel(node.inputs[0].type.context, odtype,
self.scalar_op.identity, reduce_expr, redux,
arguments=[make_argument(node.inputs[0], 'a')],
init_nd=node.inputs[0].ndim)