提交 e05e801f authored 作者: abergeron's avatar abergeron

Merge pull request #1664 from nouiz/gpu_inc_sub

GpuIncSubtensor
...@@ -2794,20 +2794,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2794,20 +2794,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals() return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
def set_view_base(self, x, fail): def add_to_zview(self, name, x, fail):
return """
//Set the base only now
if(CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(zview),
%(x)s)){
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set"
" the base of the view array");
Py_XDECREF(zview);
%(fail)s;
}""" % locals()
def add_to_zview(self, x, fail):
return """ return """
PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview, PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
......
...@@ -149,9 +149,19 @@ class GpuElemwise(HideC, Elemwise): ...@@ -149,9 +149,19 @@ class GpuElemwise(HideC, Elemwise):
#define ga_double double #define ga_double double
#define ga_half uint16_t #define ga_half uint16_t
#include <Python.h>
#include <numpy/npy_common.h>
""" """
# Map NumPy dtype names appearing in the kernel source to libgpuarray's
# ga_* typedefs so the generated code compiles against the GPU headers.
# BUG FIX: "npy_uin32"/"npy_uin64" were typos (missing 't'), so the
# unsigned 32/64-bit integer types were never substituted and the
# kernel source kept unresolved npy_* names for those dtypes.
for npy, ga in [("npy_uint8", "ga_ubyte"),
                ("npy_uint16", "ga_ushort"),
                ("npy_uint32", "ga_uint"),
                ("npy_uint64", "ga_ulong"),
                ("npy_int8", "ga_byte"),
                ("npy_int16", "ga_short"),
                ("npy_int32", "ga_int"),
                ("npy_int64", "ga_long"),
                ("npy_float32", "ga_float"),
                ("npy_float64", "ga_double"),
                ]:
    kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code) return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_headers(self): def c_headers(self):
...@@ -165,9 +175,34 @@ class GpuElemwise(HideC, Elemwise): ...@@ -165,9 +175,34 @@ class GpuElemwise(HideC, Elemwise):
# implementation # implementation
k = self.generate_kernel(node, nodename) k = self.generate_kernel(node, nodename)
nd = node.inputs[0].type.ndim nd = node.inputs[0].type.ndim
import pycuda._cluda CLUDA_PREAMBLE = """
#define local_barrier() __syncthreads();
#define WITHIN_KERNEL __device__
#define KERNEL extern "C" __global__
#define GLOBAL_MEM /* empty */
#define LOCAL_MEM __shared__
#define LOCAL_MEM_ARG /* empty */
#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y*Z, 1)
#define LID_0 threadIdx.x
#define LID_1 threadIdx.y
#define LID_2 threadIdx.z
#define GID_0 blockIdx.x
#define GID_1 blockIdx.y
#define GID_2 blockIdx.z
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z
#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
res = ["CUdeviceptr (*cuda_get_ptr)(gpudata *g);", res = ["CUdeviceptr (*cuda_get_ptr)(gpudata *g);",
pycuda._cluda.CLUDA_PREAMBLE] CLUDA_PREAMBLE]
for i in range(0, nd + 1): for i in range(0, nd + 1):
res.append(k.render_basic(i, name="elem_" + str(i)) + ';') res.append(k.render_basic(i, name="elem_" + str(i)) + ';')
res.append(k.contig_src + ';') res.append(k.contig_src + ';')
...@@ -338,8 +373,8 @@ class GpuElemwise(HideC, Elemwise): ...@@ -338,8 +373,8 @@ class GpuElemwise(HideC, Elemwise):
node.inputs + node.outputs)): node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern: if (n - len(inputs)) in self.inplace_pattern:
continue continue
dtype = var.dtype dtype = dtype_to_ctype(var.dtype)
param.append("(npy_%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals()) param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
param.append("%(name)s->ga.offset" % locals()) param.append("%(name)s->ga.offset" % locals())
for i in range(nd): for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals()) param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
......
...@@ -24,7 +24,7 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi ...@@ -24,7 +24,7 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi
GpuCrossentropySoftmax1HotWithBiasDx) GpuCrossentropySoftmax1HotWithBiasDx)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import GpuArrayConstant from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
...@@ -234,6 +234,14 @@ def local_gpua_subtensor(node): ...@@ -234,6 +234,14 @@ def local_gpua_subtensor(node):
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(node.op.idx_list)
@register_opt()
@op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node):
    """Lift a host IncSubtensor Op to its GPU counterpart.

    Rebuilds a GpuIncSubtensor carrying over the original op's index
    list and inplace/set/alias-tolerance configuration unchanged.
    """
    host_op = node.op
    return GpuIncSubtensor(host_op.idx_list,
                           host_op.inplace,
                           host_op.set_instead_of_inc,
                           host_op.destroyhandler_tolerate_aliased)
@register_opt() @register_opt()
@op_lifter([tensor.CAReduce, tensor.Sum]) @op_lifter([tensor.CAReduce, tensor.Sum])
def local_gpua_careduce(node): def local_gpua_careduce(node):
......
import copy
import StringIO import StringIO
import numpy import numpy
import theano import theano
from theano import tensor, gof from theano import tensor, gof
from theano.tensor.subtensor import Subtensor, get_idx_list
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
import pygpu import pygpu
...@@ -16,6 +17,7 @@ except ImportError: ...@@ -16,6 +17,7 @@ except ImportError:
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
from theano.sandbox.gpuarray.elemwise import GpuElemwise
class GpuSubtensor(HideC, Subtensor): class GpuSubtensor(HideC, Subtensor):
...@@ -154,3 +156,203 @@ class GpuSubtensor(HideC, Subtensor): ...@@ -154,3 +156,203 @@ class GpuSubtensor(HideC, Subtensor):
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (5,)
class GpuIncSubtensor(IncSubtensor):
    """Implement IncSubtensor on the gpu.

    Note: The optimization to make this inplace is in tensor/opt.
    The same optimization handles IncSubtensor and GpuIncSubtensor.
    This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
    The helper methods like do_type_checking, copy_of_x, etc. specialize
    the c_code for this Op.

    The inplace-add part of the generated C code is delegated to an
    internal GpuElemwise Apply node stored as ``self.iadd_node`` (built
    by `create_iadd_node`), so several c_* methods forward to that op.
    """
    def c_headers(self):
        # Forward to the embedded GpuElemwise op: the generated C code
        # includes its kernel, so it needs the same headers.
        return self.iadd_node.op.c_headers()

    def c_compiler(self):
        return self.iadd_node.op.c_compiler()

    def c_init_code(self):
        return self.iadd_node.op.c_init_code()

    def make_node(self, x, y, *inputs):
        """Build the Apply node, moving x and y to GPU variables.

        A copy of the op is stored on the Apply so that the per-node
        iadd_node (which depends on y's type) does not leak between
        graphs.
        """
        x = as_gpuarray_variable(x)
        y = as_gpuarray_variable(y)
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        op = copy.copy(self)
        ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
        op.create_iadd_node(ret)
        return ret

    def create_iadd_node(self, node):
        """Build and cache the GpuElemwise add_inplace Apply node.

        We store an iadd_node in the op that contains the info needed
        for the inplace add; its C code is reused in
        `c_support_code_apply`.
        """
        cop = theano.tensor.inplace.add_inplace
        gop = GpuElemwise(cop.scalar_op, copy.copy(cop.inplace_pattern),
                          "Gpu" + cop.name, cop.nfunc_spec)
        y = node.inputs[1]
        # The view of x that will be incremented has y's type
        # (same dtype/broadcastable pattern).
        xview = y.type()
        iadd_node = gop(xview, y).owner
        self.iadd_node = iadd_node

    def perform(self, node, inputs, out_):
        """Python fallback: x[idx] += y (or = y) on pygpu arrays.

        inputs is [x, y, *index_scalars]; the scalar inputs fill the
        Type placeholders of self.idx_list in order.
        """
        out, = out_
        x, y = inputs[:2]
        # Reversed so convert() can pop the next index in O(1).
        indices = list(reversed(inputs[2:]))

        def convert(entry):
            # idx_list entries are constants, slices, or Type markers;
            # Type markers are replaced by the runtime index inputs.
            if isinstance(entry, gof.Type):
                rval = indices.pop()
                return rval
            elif isinstance(entry, slice):
                return slice(convert(entry.start),
                             convert(entry.stop),
                             convert(entry.step))
            else:
                return entry

        cdata = tuple(map(convert, self.idx_list))
        if len(cdata) == 1:
            cdata = cdata[0]
        if not self.inplace:
            x = x.copy()
        sub_x = x.__getitem__(cdata)
        if sub_x.shape:
            # we've sliced out an N-D tensor with N > 0
            if not self.set_instead_of_inc:
                # sub_x += y
                pygpu.elemwise.ielemwise2(sub_x, '+', y, broadcast=False)
            else:
                # sub_x[...] = y
                x.__setitem__(cdata, y)
        else:
            # scalar case
            if not self.set_instead_of_inc:
                # x[cdata] = sub_x + y
                tmp = pygpu.elemwise.elemwise2(sub_x, '+', y, sub_x,
                                               broadcast=False)
                x.__setitem__(cdata, tmp)
            else:
                x.__setitem__(cdata, y)
        out[0] = x

    def __setstate__(self, d):
        """Restore state and rebuild iadd_node (dropped by __getstate__).

        BUG FIX: the original used ``getattr(self.__dict__, "owner",
        None)`` — an attribute lookup on a dict, which always returns
        None — and then called the undefined name ``op``, so the
        rebuild could never run (and would NameError if it did).
        """
        self.__dict__.update(d)
        # NOTE(review): "owner" is expected to be the Apply node this
        # op copy belongs to; nothing in this file is seen to set it —
        # confirm against the pickling path.
        owner = self.__dict__.get("owner", None)
        if owner:
            self.create_iadd_node(owner)

    def __getstate__(self):
        # iadd_node holds graph objects that should not be pickled;
        # it is rebuilt lazily in __setstate__.
        d = copy.copy(self.__dict__)
        if "iadd_node" in d:
            d.pop('iadd_node')
        return d

    def do_type_checking(self, node):
        """ Should raise NotImplementedError if c_code does not support
        the types involved in this node.
        """
        if not isinstance(node.inputs[0].type, GpuArrayType):
            raise NotImplementedError()

    def copy_of_x(self, x):
        """
        :param x: a string giving the name of a C variable
            pointing to an array

        :return: C code expression to make a copy of x

        Base class uses `PyArrayObject *`, subclasses may override for
        different types of arrays.
        """
        return """pygpu_copy(%(x)s, GA_ANY_ORDER)""" % locals()

    def decl_view(self):
        # C declaration for the view variable used by helper_c_code.
        return "PyGpuArrayObject* zview = NULL;"

    def make_view_array(self, x, view_ndim):
        """Return C code building a view of x as a PyGpuArrayObject.

        :param x: a string identifying an array to be viewed
        :param view_ndim: a string specifying the number of dimensions
            to have in the view

        This doesn't need to actually set up the view with the
        right indexing; we'll do that manually later.  It relies on
        xview_dims/xview_strides/xview_offset computed by the caller's
        generated code.
        """
        ret = """
        size_t dims[%(view_ndim)s];
        for(int i=0; i<%(view_ndim)s; i++)
            dims[i] = xview_dims[i];
        zview = pygpu_fromgpudata(%(x)s->ga.data,
                                  xview_offset,
                                  %(x)s->ga.typecode,
                                  %(view_ndim)s,
                                  dims,
                                  xview_strides,
                                  pygpu_default_context(),
                                  1,
                                  (PyObject *)%(x)s,
                                  (PyObject *)&PyGpuArrayType);
        """ % locals()
        return ret

    def get_helper_c_code_args(self):
        """ Return a dictionary of arguments to use with helper_c_code"""
        return {'c_prefix': 'PyGpuArray',
                'strides_mul': 1
                }

    def copy_into(self, view, source):
        """
        view: string, C code expression for an array
        source: string, C code expression for an array

        returns a C code expression to copy source into view, and
        return 0 on success
        """
        return """GpuArray_move(&%(view)s->ga, &%(source)s->ga)""" % locals()

    def c_support_code_apply(self, node, nodename):
        """Emit the GpuElemwise support code plus a helper C function
        ``inc_sub_iadd_<nodename>`` doing dst += src via the embedded
        add_inplace node's c_code."""
        gop = self.iadd_node.op
        sub_name = nodename + "_add_to_zview"
        ret = gop.c_support_code_apply(self.iadd_node, sub_name)
        ret += """
        PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
                                                    PyGpuArrayObject* src){
            PyGpuArrayObject* ret = NULL;
        """ % locals()
        # Splice in the elemwise c_code with our local variable names;
        # failure inside it returns NULL to the caller.
        inputs = ["dst", "src"]
        outputs = ["ret"]
        sub = {"fail": "return NULL;"}
        ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
        ret += """
            return dst;
        }
        """
        return ret

    def add_to_zview(self, nodename, x, fail):
        """Return C code adding x to zview via the generated helper;
        DECREFs zview on failure as the base class requires."""
        return """
        PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s);

        if (! add_result )
        {
            Py_DECREF(zview);
            %(fail)s;
        }
        else
        {
            Py_DECREF(add_result);
        }
        """ % locals()

    def c_code_cache_version(self):
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
        # BUG FIX: c_code_cache_version lives on the op, not on the
        # Apply node; the original called it on self.iadd_node (an
        # Apply), which would raise AttributeError.
        elemwise_version = self.iadd_node.op.c_code_cache_version()
        if not parent_version or not elemwise_version:
            return
        return parent_version + elemwise_version + (0,)
from theano.tensor.tests.test_subtensor import T_subtensor from theano.tensor.tests.test_subtensor import T_subtensor
from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost) from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
...@@ -11,6 +11,7 @@ from theano.compile import DeepCopyOp ...@@ -11,6 +11,7 @@ from theano.compile import DeepCopyOp
from theano import tensor from theano import tensor
class G_subtensor(T_subtensor): class G_subtensor(T_subtensor):
def shortDescription(self): def shortDescription(self):
return None return None
...@@ -19,8 +20,10 @@ class G_subtensor(T_subtensor): ...@@ -19,8 +20,10 @@ class G_subtensor(T_subtensor):
T_subtensor.__init__(self, name, T_subtensor.__init__(self, name,
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuSubtensor, sub=GpuSubtensor,
inc_sub=GpuIncSubtensor,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', dtype='float32',
ignore_topo=(HostFromGpu,GpuFromHost,DeepCopyOp)) ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp))
assert self.sub == GpuSubtensor assert self.sub == GpuSubtensor
...@@ -1255,7 +1255,7 @@ class IncSubtensor(Op): ...@@ -1255,7 +1255,7 @@ class IncSubtensor(Op):
copy_into = self.copy_into("zview", y) copy_into = self.copy_into("zview", y)
add_to_zview = self.add_to_zview(y, fail) add_to_zview = self.add_to_zview(name, y, fail)
make_modification = """ make_modification = """
if (%(op_is_set)s) if (%(op_is_set)s)
...@@ -1353,7 +1353,7 @@ class IncSubtensor(Op): ...@@ -1353,7 +1353,7 @@ class IncSubtensor(Op):
""" """
return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals() return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals()
def add_to_zview(self, x, fail): def add_to_zview(self, name, x, fail):
""" Return C code to add x to zview. Should DECREF zview if the """ Return C code to add x to zview. Should DECREF zview if the
add fails.""" add fails."""
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论