提交 a536464a 作者: Frédéric Bastien

Merge pull request #4323 from abergeron/gpua_newelem

Use the new GpuElemwise from libgpuarray
...@@ -42,7 +42,7 @@ register_transfer(transfer) ...@@ -42,7 +42,7 @@ register_transfer(transfer)
def init_dev(dev, name=None): def init_dev(dev, name=None):
v = pygpu.gpuarray.api_version() v = pygpu.gpuarray.api_version()
if v[0] != -10000: if v[0] != -9999:
raise RuntimeError("Wrong major API version for gpuarray:", v[0], raise RuntimeError("Wrong major API version for gpuarray:", v[0],
"Make sure Theano and libgpuarray/pygpu " "Make sure Theano and libgpuarray/pygpu "
"are in sync.") "are in sync.")
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import copy import copy
from theano.compat import izip
import numpy import numpy
import theano import theano
from theano import Apply, scalar, config from theano import Apply, scalar, config, Op
from theano import scalar as scal
from six.moves import StringIO, xrange from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.scalar import Scalar from theano.scalar import Scalar
...@@ -14,41 +12,20 @@ from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype) ...@@ -14,41 +12,20 @@ from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
try: try:
import pygpu import pygpu
from pygpu import gpuarray from pygpu import gpuarray
from pygpu.tools import ScalarArg, ArrayArg from pygpu.tools import ArrayArg
from pygpu.elemwise import ElemwiseKernel
from pygpu.reduction import ReductionKernel from pygpu.reduction import ReductionKernel
from pygpu.gpuarray import dtype_to_typecode, dtype_to_ctype from pygpu.gpuarray import dtype_to_typecode
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
infer_context_name) infer_context_name)
from .type import GpuArrayType from .type import GpuArrayType, gpu_context_type
from .fp16_help import load_w, write_w from .fp16_help import load_w, write_w
def _is_scalar(v):
False
def make_argument(v, name): def make_argument(v, name):
if _is_scalar(v): return ArrayArg(numpy.dtype(v.type.dtype), name)
return ScalarArg(numpy.dtype(v.type.dtype), name)
else:
return ArrayArg(numpy.dtype(v.type.dtype), name)
def ensure_allocated(storage, shape, dtype, ctx):
odat = storage[0]
if odat is not None:
if odat.shape != shape:
# It is unsafe to try to resize odat,
# we have to allocate output storage.
odat = None
if odat is None:
odat = pygpu.empty(shape, dtype=dtype, context=ctx)
storage[0] = odat
return odat
def as_C_string_const(s): def as_C_string_const(s):
...@@ -56,11 +33,12 @@ def as_C_string_const(s): ...@@ -56,11 +33,12 @@ def as_C_string_const(s):
for l in s.split('\n')) for l in s.split('\n'))
class GpuElemwise(GpuKernelBase, HideC, Elemwise): class GpuElemwise(HideC, Elemwise):
""" """
Elemwise on the GPU. Elemwise on the GPU.
""" """
params_type = gpu_context_type
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
_f16_ok = True _f16_ok = True
...@@ -109,20 +87,21 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -109,20 +87,21 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
def get_params(self, node): def get_params(self, node):
return node.inputs[0].type.context return node.inputs[0].type.context
def generate_kernel(self, node, nodename): def _get_vnames(self, node):
inps = [make_argument(i, 'i%d' % (n,)) for n, i in inps = ['i%d' % (n,) for n, _ in enumerate(node.inputs)]
enumerate(node.inputs)] outs = ['o%d' % (n,) for n, _ in enumerate(node.outputs) if n not in self.inplace_pattern]
scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs] return inps, outs
outs = [make_argument(o, 'o%d' % (n,)) for n, o in def _generate_op_string(self, node):
enumerate(node.outputs) if n not in self.inplace_pattern] scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs] scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
inps, outs = self._get_vnames(node)
fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins], fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
[o() for o in scal_v_outs]) [o() for o in scal_v_outs])
scal_in = [i.name + '[i]' if i.dtype != 'float16' else scal_in = [i if si.dtype != 'float16' else
'__half2float(' + i.name + '[i])' for i in inps] 'load_half(&' + i + ')' for i, si in zip(inps, scal_v_ins)]
scal_out = [] scal_out = []
oi = 0 oi = 0
...@@ -133,13 +112,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -133,13 +112,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
else: else:
arg = outs[oi] arg = outs[oi]
oi += 1 oi += 1
if arg.dtype == 'float16': if node.outputs[n].dtype == 'float16':
scal_f16.append(('tmpf16%i' % (len(scal_f16),), arg)) scal_f16.append(('tmpf16%i' % (len(scal_f16),), arg))
scal_out.append(scal_f16[-1][0]) scal_out.append(scal_f16[-1][0])
else: else:
scal_out.append(arg.name + '[i]') scal_out.append(arg)
kop = self.scalar_op.c_code(fake_node, nodename + '_scalar', kop = self.scalar_op.c_code(fake_node, 'elem_scalar',
scal_in, scal_out, scal_in, scal_out,
dict(fail='return;')) dict(fail='return;'))
...@@ -154,7 +133,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -154,7 +133,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
# variables in the middle are float32 # variables in the middle are float32
code.append(kop.replace('npy_float16', 'ga_float')) code.append(kop.replace('npy_float16', 'ga_float'))
for f in scal_f16: for f in scal_f16:
code.append('%s[i] = __float2half_rn(%s);' % (f[1].name, f[0])) code.append('store_half(&%s, %s);' % (f[1], f[0]))
code.append('}') code.append('}')
kop = '\n'.join(code) kop = '\n'.join(code)
...@@ -178,76 +157,74 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -178,76 +157,74 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
("npy_float64", "ga_double"), ("npy_float64", "ga_double"),
]: ]:
kop = kop.replace(npy, ga) kop = kop.replace(npy, ga)
return ElemwiseKernel(self.get_params(node), inps + outs, kop, return support_code, kop
preamble=support_code)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>',
'<gpuarray/elemwise.h>']
def c_support_code(self):
return self.scalar_op.c_support_code() def c_support_code_struct(self, node, name):
return "\nGpuElemwise *ge;\n"
def _gpu_kernel_code(self, node, nodename):
# This is useless by itself, but will serve an eventual c_code def c_init_code_struct(self, node, name, sub):
# implementation inps, outs = self._get_vnames(node)
k = self.generate_kernel(node, nodename) nargs = len(inps) + len(outs)
nd = node.inputs[0].type.ndim support_code, kop = self._generate_op_string(node)
res = [] res = """
for i in range(0, nd + 1): gpuelemwise_arg args[%(nargs)s] = {{0}};
res.append(k.render_basic(i, name="elem_" + str(i)) + ';') """ % dict(nargs=nargs)
res.append(k.contig_src + ';')
for n, (i, name) in enumerate(zip(node.inputs, inps)):
res += """
args[%(n)s].name = %(name)s;
args[%(n)s].typecode = %(typecode)s;
args[%(n)s].flags = GE_READ;
""" % dict(n=n, name='"%s"' % (name,),
typecode=i.type.typecode)
p = 0
for n, o in enumerate(node.outputs):
if n in self.inplace_pattern:
assert(len(node.outputs) == 1)
res += "\nargs[%(n)s].flags |= GE_WRITE;\n" % dict(n=self.inplace_pattern[n])
else:
nn = len(inps) + p
name = outs[p]
p += 1
res += """
args[%(n)s].name = %(name)s;
args[%(n)s].typecode = %(typecode)s;
args[%(n)s].flags = GE_WRITE;
""" % dict(n=nn, name='"%s"' % (name,),
typecode=o.type.typecode)
res += """
ge = GpuElemwise_new(%(ctx)s->ops, %(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, 0);
if (ge == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support");
%(fail)s
}
""" % dict(nargs=nargs, ctx=sub['params'], fail=sub['fail'],
support=as_C_string_const(support_code),
kop=as_C_string_const(kop), nd=node.inputs[0].ndim)
return '\n'.join(res) return res
def gpu_kernels(self, node, nodename):
src = self._gpu_kernel_code(node, nodename)
nd = node.outputs[0].ndim
params = ['uintp']
params.extend('uintp' for _ in range(nd))
num_inputs = len(node.inputs)
num_outputs = len(node.outputs)
for n in range(num_inputs + num_outputs):
if (n - len(node.inputs)) in self.inplace_pattern:
continue
params.extend([gpuarray.GpuArray, 'uintp'])
params.extend('intp' for _ in range(nd))
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
return [Kernel(code=src, name="elem_%d" % nd, params=params,
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objvar='elem_%d_%s' % (nd, nodename))]
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise MethodNotDefined('cuda only')
nd = node.outputs[0].ndim nd = node.outputs[0].ndim
fail = sub["fail"] fail = sub["fail"]
initial_dims = ','.join('1' for i in xrange(nd)) initial_dims = ','.join('1' for i in xrange(nd))
opname = str(self.scalar_op) opname = str(self.scalar_op)
ctx = sub['params'] ctx = sub['params']
nargs = len(node.inputs) + len(node.outputs) - len(self.inplace_pattern)
# check that all inputs have valid dimensions # check that all inputs have valid dimensions
emitted_inames = {} emitted_inames = {}
num_kernel_params = 1 + nd + len(inputs + outputs) * (2 + nd)
code = """ code = """
size_t n_blocks = 0; // +1 is so that MSVC is happy when nd == 0
size_t threads_per_block = 0; size_t dims[%(nd)s+1] = {%(initial_dims)s};
size_t numEls = 0; void *rargs[%(nargs)s] = {0};
const ssize_t zero = 0;
void *kernel_params[%(num_kernel_params)d] = {0};
int err;
""" % locals() """ % locals()
if nd > 0:
code += """
size_t dims[%(nd)s] = {%(initial_dims)s};
""" % locals()
else:
code += """
size_t *dims = NULL;
"""
for idx, iname in enumerate(inputs): for idx, iname in enumerate(inputs):
if iname in emitted_inames: if iname in emitted_inames:
assert emitted_inames[iname] is node.inputs[idx] assert emitted_inames[iname] is node.inputs[idx]
...@@ -256,19 +233,15 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -256,19 +233,15 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
broadcasts = map(int, node.inputs[idx].broadcastable) broadcasts = map(int, node.inputs[idx].broadcastable)
broadcasts = ', '.join(map(str, broadcasts)) broadcasts = ', '.join(map(str, broadcasts))
nd = node.inputs[idx].ndim nd = node.inputs[idx].ndim
if nd > 0: code += """
code += """ int broadcasts_%(iname)s[%(nd)s+1] = {%(broadcasts)s};
int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s}; """ % locals()
""" % locals()
else:
code += """
int *broadcasts_%(iname)s = NULL;
""" % locals()
emitted_inames[iname] = node.inputs[idx] emitted_inames[iname] = node.inputs[idx]
# check that all inputs have valid dimensions # check that all inputs have valid dimensions
emitted_inames = {} emitted_inames = {}
for idx, iname in enumerate(inputs): for idx, iname in enumerate(inputs):
code += "rargs[%(idx)s] = &%(iname)s->ga;\n" % dict(idx=idx, iname=iname)
if iname in emitted_inames: if iname in emitted_inames:
continue continue
code += """ code += """
...@@ -300,6 +273,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -300,6 +273,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
""" % locals() """ % locals()
emitted_inames[iname] = True emitted_inames[iname] = True
# check that all outputs have valid dimensions # check that all outputs have valid dimensions
p = len(node.inputs)
for idx, oname in enumerate(outputs): for idx, oname in enumerate(outputs):
typecode = dtype_to_typecode(node.outputs[idx].dtype) typecode = dtype_to_typecode(node.outputs[idx].dtype)
if idx not in self.inplace_pattern.keys(): if idx not in self.inplace_pattern.keys():
...@@ -325,7 +299,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -325,7 +299,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
%(fail)s %(fail)s
} }
} }
""" % locals() rargs[%(p)s] = &%(oname)s->ga;
""" % locals()
p += 1
else: else:
input_idx = self.inplace_pattern[idx] input_idx = self.inplace_pattern[idx]
iname = inputs[input_idx] iname = inputs[input_idx]
...@@ -351,92 +327,35 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -351,92 +327,35 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
} }
} }
""" % locals() """ % locals()
z = outputs[0]
code += """numEls = PyGpuArray_SIZE(%(z)s);
//first use at least a full warp
threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE
//next start adding multiprocessors
// UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
n_blocks = std::min(numEls/threads_per_block +
(numEls %% threads_per_block?1:0),
(size_t)30);
// next start adding more warps per multiprocessor
if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
""" % locals()
kname = 'elem_%d_%s' % (nd, name)
param = ["(void *)&numEls"]
for i in range(nd):
param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern:
continue
dtype = dtype_to_ctype(var.dtype)
param.append("(void *)%(name)s->ga.data" % locals())
param.append("(void *)&%(name)s->ga.offset" % locals())
for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? (void *)&zero: (void *)&PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
for n, p in enumerate(param):
code += "kernel_params[%(n)d] = %(p)s;\n" % locals()
code += """ code += """
err = GpuKernel_call(&%(kname)s, 1, &threads_per_block, &n_blocks, 0, kernel_params); if (GpuElemwise_call(ge, rargs, GE_BROADCAST) != GA_NO_ERROR) {
if (err != GA_NO_ERROR) { PyErr_SetString(PyExc_RuntimeError, "Error in the elemwise call");
PyErr_Format(PyExc_RuntimeError, %(fail)s
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
} }
""" % dict(kname=kname, fail=fail) """ % dict(fail=sub['fail'])
if config.gpuarray.sync: if config.gpuarray.sync:
z = outputs[0]
code += """ code += """
err = GpuArray_sync(&%(z)s->ga); err = GpuArray_sync(&%(z)s->ga);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.", "gpuarray error: %%s.",
GpuKernel_error(&%(kname)s, err)); GpuArray_error(&%(z)s->ga, err));
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
return str(code)
def perform(self, node, inputs, output_storage, ctx): return str(code)
# Try to reuse the kernel from a previous call to hopefully
# avoid recompiling
if not hasattr(node, '_cache_elemwise_k'):
node._cache_elemwise_k = self.generate_kernel(node, "kcode")
out_shape = []
for values in izip(*[input.shape for input in inputs]):
if any(v == 0 for v in values):
# All non-broadcasted dimensions should be zero
assert max(values) <= 1
out_shape.append(0)
else:
out_shape.append(max(values))
out_shape = tuple(out_shape)
args = copy.copy(inputs)
for n, (stor, out) in enumerate(izip(output_storage, node.outputs)):
if n in self.inplace_pattern:
stor[0] = inputs[self.inplace_pattern[n]]
else:
args.append(ensure_allocated(stor, out_shape, out.type.dtype, ctx))
node._cache_elemwise_k(*args, broadcast=True) # To disable the superclass perform.
if config.gpuarray.sync: perform = Op.perform
output_storage[0][0].sync()
def c_code_cache_version(self): def c_code_cache_version(self):
ver = self.scalar_op.c_code_cache_version() ver = self.scalar_op.c_code_cache_version()
if ver: if ver:
return (4, ver) return (6, ver)
else: else:
return ver return ver
...@@ -585,7 +504,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -585,7 +504,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
This op was recently upgraded from just GpuSum to a general CAReduce. Not This op was recently upgraded from just GpuSum to a general CAReduce. Not
many code cases are supported for scalar_op being anything other than many code cases are supported for scalar_op being anything other than
scal.Add instances yet. scalar.Add instances yet.
Important note: if you implement new cases for this op, be sure to Important note: if you implement new cases for this op, be sure to
benchmark them and make sure that they actually result in a speedup. benchmark them and make sure that they actually result in a speedup.
...@@ -735,7 +654,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -735,7 +654,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# It might be nice to use a property of the op class to do this, # It might be nice to use a property of the op class to do this,
# but tensor.elemwise.CAReduce has this exact same check so I guess # but tensor.elemwise.CAReduce has this exact same check so I guess
# this is OK to do # this is OK to do
if self.scalar_op in [scal.minimum, scal.maximum]: if self.scalar_op in [scalar.minimum, scalar.maximum]:
conds = ["(PyGpuArray_DIMS(%s)[%d] == 0)" % (x, i) conds = ["(PyGpuArray_DIMS(%s)[%d] == 0)" % (x, i)
for i in xrange(nd_in) for i in xrange(nd_in)
if self.reduce_mask[i]] if self.reduce_mask[i]]
...@@ -1060,13 +979,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1060,13 +979,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if hasattr(self.scalar_op, 'identity'): if hasattr(self.scalar_op, 'identity'):
return str(self.scalar_op.identity) return str(self.scalar_op.identity)
else: else:
assert isinstance(self.scalar_op, (scal.Maximum, assert isinstance(self.scalar_op, (scalar.Maximum,
scal.Minimum)) scalar.Minimum))
if self.pre_scalar_op: # TODO: multiple dtypes if self.pre_scalar_op: # TODO: multiple dtypes
# dtype = node.inputs[0].dtype # dtype = node.inputs[0].dtype
dtype = 'float32' dtype = 'float32'
dummy_var = scal.Scalar(dtype=dtype)() dummy_var = scalar.Scalar(dtype=dtype)()
dummy_node = self.pre_scalar_op.make_node(dummy_var) dummy_node = self.pre_scalar_op.make_node(dummy_var)
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import os import os
import copy
import numpy import numpy
from six import integer_types from six import integer_types
from six.moves import StringIO from six.moves import StringIO
import theano
from theano import tensor, gof from theano import tensor, gof
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace
try: try:
import pygpu import pygpu
...@@ -18,10 +15,9 @@ try: ...@@ -18,10 +15,9 @@ try:
except ImportError: except ImportError:
pass pass
from .type import GpuArrayType from .type import GpuArrayType, gpu_context_type
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
infer_context_name) infer_context_name)
from .elemwise import GpuElemwise
class GpuSubtensor(HideC, Subtensor): class GpuSubtensor(HideC, Subtensor):
...@@ -168,7 +164,7 @@ class GpuSubtensor(HideC, Subtensor): ...@@ -168,7 +164,7 @@ class GpuSubtensor(HideC, Subtensor):
return (6,) return (6,)
class GpuIncSubtensor(GpuKernelBase, IncSubtensor): class GpuIncSubtensor(IncSubtensor):
""" """
Implement IncSubtensor on the gpu. Implement IncSubtensor on the gpu.
...@@ -181,45 +177,20 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -181,45 +177,20 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
:meth:`copy_of_x`, etc. specialize the c_code for this Op. :meth:`copy_of_x`, etc. specialize the c_code for this Op.
""" """
_f16_ok = True
@property params_type = gpu_context_type
def _f16_ok(self):
return self.iadd_node.op._f16_ok
def c_headers(self):
return self.iadd_node.op.c_headers()
def c_init_code(self):
return self.iadd_node.op.c_init_code()
def gpu_kernels(self, node, nodename):
subname = nodename + "_add_to_zview"
return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)
def make_node(self, x, y, *inputs): def make_node(self, x, y, *inputs):
ctx_name = infer_context_name(x, y) ctx_name = infer_context_name(x, y)
x = as_gpuarray_variable(x, ctx_name) x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name) y = as_gpuarray_variable(y, ctx_name)
rval = tensor.IncSubtensor.make_node(self, x, y, *inputs) rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
op = copy.copy(self) ret = gof.Apply(self, [x, y] + rval.inputs[2:], [x.type()])
ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
op.create_iadd_node(ret)
return ret return ret
def get_params(self, node): def get_params(self, node):
return node.outputs[0].type.context return node.outputs[0].type.context
def create_iadd_node(self, node):
# We store a iadd_node in the op that contain the info needed
# for the inplace add.
cop = theano.tensor.inplace.add_inplace
gop = GpuElemwise(cop.scalar_op, copy.copy(cop.inplace_pattern),
"Gpu" + cop.name, cop.nfunc_spec)
y = node.inputs[1]
xview = y.type()
iadd_node = gop(xview, y).owner
self.iadd_node = iadd_node
def perform(self, node, inputs, out_, ctx): def perform(self, node, inputs, out_, ctx):
out, = out_ out, = out_
x, y = inputs[:2] x, y = inputs[:2]
...@@ -261,18 +232,6 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -261,18 +232,6 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
x.__setitem__(cdata, y) x.__setitem__(cdata, y)
out[0] = x out[0] = x
def __setstate__(self, d):
self.__dict__.update(d)
owner = getattr(self, "owner", None)
if owner:
self.create_iadd_node(owner)
def __getstate__(self):
d = copy.copy(self.__dict__)
if "iadd_node" in d:
d.pop('iadd_node')
return d
def do_type_checking(self, node): def do_type_checking(self, node):
""" """
Should raise NotImplementedError if c_code does not support Should raise NotImplementedError if c_code does not support
...@@ -365,47 +324,52 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -365,47 +324,52 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
""" """
return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals() return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>',
'<gpuarray/elemwise.h>']
def c_support_code_struct(self, node, nodename): def c_support_code_struct(self, node, nodename):
gop = self.iadd_node.op return "\nGpuElemwise *iadd;\n"
sub_name = nodename + "_add_to_zview"
ret = gop.c_support_code_struct(self.iadd_node, sub_name)
ret += """
PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
PyGpuArrayObject* src){
PyGpuArrayObject* ret = NULL;
""" % locals()
inputs = ["dst", "src"]
outputs = ["ret"]
sub = {"fail": "return NULL;", "params": "dst->context"}
ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
ret += """
return ret;
def c_init_code_struct(self, node, name, sub):
return """
gpuelemwise_arg args[2] = {{0}};
args[0].name = "a";
args[0].typecode = %(type1)s;
args[0].flags = GE_READ|GE_WRITE;
args[1].name = "b";
args[1].typecode = %(type2)s;
args[1].flags = GE_READ;
iadd = GpuElemwise_new(%(ctx)s->ops, %(ctx)s->ctx, "", "a += b",
2, args, %(nd)s, 0);
if (iadd == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
%(fail)s
} }
""" """ % dict(ctx=sub['params'], fail=sub['fail'],
return ret type1=node.inputs[0].type.typecode,
type2=node.inputs[1].type.typecode,
nd=node.inputs[1].ndim)
def add_to_zview(self, nodename, x, fail): def add_to_zview(self, nodename, x, fail):
return """ return """
PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s);
if (! add_result )
{ {
void *args[2];
args[0] = &zview->ga;
args[1] = &%(x)s->ga;
if (GpuElemwise_call(iadd, args, GE_BROADCAST) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Error doing inplace add");
Py_DECREF(zview); Py_DECREF(zview);
%(fail)s; %(fail)s
} }
else
{
Py_DECREF(add_result);
} }
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
parent_version = super(GpuIncSubtensor, self).c_code_cache_version() parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
elemwise_version = self.iadd_node.c_code_cache_version() if not parent_version:
if not parent_version or not elemwise_version:
return return
return parent_version + elemwise_version + (3,) return parent_version + (5,)
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1): class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
......
...@@ -18,40 +18,18 @@ from pygpu import ndgpuarray as gpuarray ...@@ -18,40 +18,18 @@ from pygpu import ndgpuarray as gpuarray
# This is actually a test for GpuElemwise # This is actually a test for GpuElemwise
class test_gpu_Broadcast(test_elemwise.test_Broadcast): class test_gpu_Broadcast(test_elemwise.test_Broadcast):
op = GpuElemwise
type = GpuArrayType
cop = GpuElemwise cop = GpuElemwise
ctype = GpuArrayType ctype = GpuArrayType
# The order is important # The order is important
linkers = [gof.PerformLinker, gof.CLinker] linkers = [gof.PerformLinker, gof.CLinker]
def setUp(self):
if get_context(test_ctx_name).kind != 'cuda':
self.linkers = [gof.PerformLinker]
def rand_val(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray))
def rand_cval(self, shp): def rand_cval(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray)) return rand_gpuarray(*shp, **dict(cls=gpuarray))
def test_c(self):
if get_context(test_ctx_name).kind != 'cuda':
raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c()
def test_c_inplace(self):
if get_context(test_ctx_name).kind != 'cuda':
raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c_inplace()
def test_elemwise_pow(): def test_elemwise_pow():
# Test that GpuElemwise(pow) can compile with any combination of integer # Test that GpuElemwise(pow) can compile with any combination of integer
# or float input dtype. # or float input dtype.
if get_context(test_ctx_name).kind != 'cuda':
raise SkipTest("Cuda specific tests")
dtypes = ["uint8", "uint16", "uint32", "uint64", dtypes = ["uint8", "uint16", "uint32", "uint64",
"int8", "int16", "int32", "int64", "int8", "int16", "int32", "int64",
"float16", "float32", "float64"] "float16", "float32", "float64"]
...@@ -65,10 +43,10 @@ def test_elemwise_pow(): ...@@ -65,10 +43,10 @@ def test_elemwise_pow():
output = base ** exp output = base ** exp
f = theano.function([base, exp], output) f = theano.function([base, exp], output)
# Call the function to make sure the output is valid
base_val = numpy.random.randint(0, 5, size=10).astype(dtype_base) base_val = numpy.random.randint(0, 5, size=10).astype(dtype_base)
exp_val = numpy.random.randint(0, 3, size=10).astype(dtype_exp) exp_val = numpy.random.randint(0, 3, size=10).astype(dtype_exp)
# Call the function to make sure the output is valid
out = f(base_val, exp_val) out = f(base_val, exp_val)
expected_out = base_val ** exp_val expected_out = base_val ** exp_val
assert_allclose(out, expected_out) assert_allclose(out, expected_out)
......
...@@ -166,10 +166,12 @@ class test_Broadcast(unittest.TestCase): ...@@ -166,10 +166,12 @@ class test_Broadcast(unittest.TestCase):
linkers = [gof.PerformLinker, gof.CLinker] linkers = [gof.PerformLinker, gof.CLinker]
def rand_val(self, shp): def rand_val(self, shp):
return numpy.asarray(numpy.random.rand(*shp)) return numpy.asarray(numpy.random.rand(*shp),
dtype=theano.config.floatX)
def rand_cval(self, shp): def rand_cval(self, shp):
return numpy.asarray(numpy.random.rand(*shp)) return numpy.asarray(numpy.random.rand(*shp),
dtype=theano.config.floatX)
def setUp(self): def setUp(self):
unittest_tools.seed_rng() unittest_tools.seed_rng()
...@@ -189,8 +191,10 @@ class test_Broadcast(unittest.TestCase): ...@@ -189,8 +191,10 @@ class test_Broadcast(unittest.TestCase):
((2, 3, 4, 5), (1, 3, 1, 5)), ((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)), ((2, 3, 4, 5), (1, 1, 1, 1)),
((), ())]: ((), ())]:
x = type('float64', [(entry == 1) for entry in xsh])('x') x = type(theano.config.floatX,
y = type('float64', [(entry == 1) for entry in ysh])('y') [(entry == 1) for entry in xsh])('x')
y = type(theano.config.floatX,
[(entry == 1) for entry in ysh])('y')
e = op(scalar.add)(x, y) e = op(scalar.add)(x, y)
f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function() f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
xv = rand_val(xsh) xv = rand_val(xsh)
...@@ -202,8 +206,10 @@ class test_Broadcast(unittest.TestCase): ...@@ -202,8 +206,10 @@ class test_Broadcast(unittest.TestCase):
# test Elemwise.infer_shape # test Elemwise.infer_shape
# the Shape op doesn't implement c_code! # the Shape op doesn't implement c_code!
if isinstance(linker, gof.PerformLinker): if isinstance(linker, gof.PerformLinker):
x = type('float64', [(entry == 1) for entry in xsh])('x') x = type(theano.config.floatX,
y = type('float64', [(entry == 1) for entry in ysh])('y') [(entry == 1) for entry in xsh])('x')
y = type(theano.config.floatX,
[(entry == 1) for entry in ysh])('y')
e = op(scalar.add)(x, y) e = op(scalar.add)(x, y)
f = copy(linker).accept(FunctionGraph( f = copy(linker).accept(FunctionGraph(
[x, y], [e.shape])).make_function() [x, y], [e.shape])).make_function()
...@@ -218,8 +224,10 @@ class test_Broadcast(unittest.TestCase): ...@@ -218,8 +224,10 @@ class test_Broadcast(unittest.TestCase):
((2, 3, 4, 5), (1, 3, 1, 5)), ((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)), ((2, 3, 4, 5), (1, 1, 1, 1)),
((), ())]: ((), ())]:
x = type('float64', [(entry == 1) for entry in xsh])('x') x = type(theano.config.floatX,
y = type('float64', [(entry == 1) for entry in ysh])('y') [(entry == 1) for entry in xsh])('x')
y = type(theano.config.floatX,
[(entry == 1) for entry in ysh])('y')
e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y) e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function() f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
xv = rand_val(xsh) xv = rand_val(xsh)
...@@ -232,8 +240,10 @@ class test_Broadcast(unittest.TestCase): ...@@ -232,8 +240,10 @@ class test_Broadcast(unittest.TestCase):
# test Elemwise.infer_shape # test Elemwise.infer_shape
# the Shape op doesn't implement c_code! # the Shape op doesn't implement c_code!
if isinstance(linker, gof.PerformLinker): if isinstance(linker, gof.PerformLinker):
x = type('float64', [(entry == 1) for entry in xsh])('x') x = type(theano.config.floatX,
y = type('float64', [(entry == 1) for entry in ysh])('y') [(entry == 1) for entry in xsh])('x')
y = type(theano.config.floatX,
[(entry == 1) for entry in ysh])('y')
e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y) e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
f = copy(linker).accept(FunctionGraph( f = copy(linker).accept(FunctionGraph(
[x, y], [e.shape])).make_function() [x, y], [e.shape])).make_function()
...@@ -267,13 +277,15 @@ class test_Broadcast(unittest.TestCase): ...@@ -267,13 +277,15 @@ class test_Broadcast(unittest.TestCase):
def test_fill(self): def test_fill(self):
if not theano.config.cxx: if not theano.config.cxx:
raise SkipTest("G++ not available, so we need to skip this test.") raise SkipTest("G++ not available, so we need to skip this test.")
x = self.ctype('float64', [0, 0])('x') for linker, op, t, rval in zip(self.linkers, [self.op, self.cop],
y = self.ctype('float64', [1, 1])('y') [self.type, self.ctype],
for linker, op in zip(self.linkers, [self.op, self.cop]): [self.rand_val, self.rand_cval]):
x = t(theano.config.floatX, [0, 0])('x')
y = t(theano.config.floatX, [1, 1])('y')
e = op(scalar.Second(scalar.transfer_type(0)), {0: 0})(x, y) e = op(scalar.Second(scalar.transfer_type(0)), {0: 0})(x, y)
f = linker().accept(FunctionGraph([x, y], [e])).make_function() f = linker().accept(FunctionGraph([x, y], [e])).make_function()
xv = self.rand_cval((5, 5)) xv = rval((5, 5))
yv = self.rand_cval((1, 1)) yv = rval((1, 1))
f(xv, yv) f(xv, yv)
assert (xv == yv).all() assert (xv == yv).all()
...@@ -292,24 +304,28 @@ class test_Broadcast(unittest.TestCase): ...@@ -292,24 +304,28 @@ class test_Broadcast(unittest.TestCase):
def test_weird_strides(self): def test_weird_strides(self):
if not theano.config.cxx: if not theano.config.cxx:
raise SkipTest("G++ not available, so we need to skip this test.") raise SkipTest("G++ not available, so we need to skip this test.")
x = self.ctype('float64', [0, 0, 0, 0, 0])('x') for linker, op, t, rval in zip(self.linkers, [self.op, self.cop],
y = self.ctype('float64', [0, 0, 0, 0, 0])('y') [self.type, self.ctype],
for linker, op in zip(self.linkers, [self.op, self.cop]): [self.rand_val, self.rand_cval]):
x = t(theano.config.floatX, [0, 0, 0, 0, 0])('x')
y = t(theano.config.floatX, [0, 0, 0, 0, 0])('y')
e = op(scalar.add)(x, y) e = op(scalar.add)(x, y)
f = linker().accept(FunctionGraph([x, y], [e])).make_function() f = linker().accept(FunctionGraph([x, y], [e])).make_function()
xv = self.rand_cval((2, 2, 2, 2, 2)) xv = rval((2, 2, 2, 2, 2))
yv = self.rand_cval((2, 2, 2, 2, 2)).transpose(4, 0, 3, 1, 2) yv = rval((2, 2, 2, 2, 2)).transpose(4, 0, 3, 1, 2)
zv = xv + yv zv = xv + yv
assert (f(xv, yv) == zv).all() assert (f(xv, yv) == zv).all()
def test_same_inputs(self): def test_same_inputs(self):
if not theano.config.cxx: if not theano.config.cxx:
raise SkipTest("G++ not available, so we need to skip this test.") raise SkipTest("G++ not available, so we need to skip this test.")
x = self.ctype('float64', [0, 0])('x') for linker, op, t, rval in zip(self.linkers, [self.op, self.cop],
for linker, op in zip(self.linkers, [self.op, self.cop]): [self.type, self.ctype],
[self.rand_val, self.rand_cval]):
x = t(theano.config.floatX, [0, 0])('x')
e = op(scalar.add)(x, x) e = op(scalar.add)(x, x)
f = linker().accept(FunctionGraph([x], [e])).make_function() f = linker().accept(FunctionGraph([x], [e])).make_function()
xv = self.rand_cval((2, 2)) xv = rval((2, 2))
zv = xv + xv zv = xv + xv
assert (f(xv) == zv).all() assert (f(xv) == zv).all()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论