提交 03bb1866 authored 作者: abergeron's avatar abergeron

Merge pull request #1642 from nouiz/gpua_elemwise

Gpua elemwise
...@@ -362,7 +362,7 @@ def get_module_hash(src_code, key): ...@@ -362,7 +362,7 @@ def get_module_hash(src_code, key):
# it changes, then the module hash should be different. # it changes, then the module hash should be different.
# We start with the source code itself (stripping blanks might avoid # We start with the source code itself (stripping blanks might avoid
# recompiling after a basic indentation fix for instance). # recompiling after a basic indentation fix for instance).
to_hash = map(str.strip, src_code.split('\n')) to_hash = [l.strip() for l in src_code.split('\n')]
# Get the version part of the key (ignore if unversioned). # Get the version part of the key (ignore if unversioned).
if key[0]: if key[0]:
to_hash += map(str, key[0]) to_hash += map(str, key[0])
......
...@@ -4,12 +4,14 @@ from itertools import izip ...@@ -4,12 +4,14 @@ from itertools import izip
import numpy import numpy
from theano import Op, Apply, scalar, config from theano import Op, Apply, scalar, config
from theano.tensor.elemwise import Elemwise, DimShuffle, CAReduceDtype from theano.tensor.elemwise import Elemwise, DimShuffle, CAReduceDtype
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
import pygpu import pygpu
from pygpu.tools import ScalarArg, ArrayArg from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel from pygpu.elemwise import ElemwiseKernel
from pygpu.reduction import ReductionKernel from pygpu.reduction import ReductionKernel
from pygpu.gpuarray import dtype_to_typecode
except ImportError: except ImportError:
pass pass
...@@ -63,10 +65,35 @@ class GpuElemwise(HideC, Elemwise): ...@@ -63,10 +65,35 @@ class GpuElemwise(HideC, Elemwise):
outputs = [GpuArrayType(broadcastable=o.type.broadcastable, outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
dtype=o.type.dtype)() for o in res.outputs] dtype=o.type.dtype)() for o in res.outputs]
inputs = [as_gpuarray_variable(i) for i in inputs] inputs = [as_gpuarray_variable(i) for i in inputs]
res = Apply(self, inputs, outputs) node = Apply(self, inputs, outputs)
# Try to generate the kernel to catch SupportCodeErrors # Try to generate the kernel to catch SupportCodeErrors
k = self.generate_kernel(res, 'test') try:
return res inps = [make_argument(i, 'i%d' % (n,)) for n, i in
enumerate(node.inputs)]
scal_ins = [scalar.Scalar(i.dtype) for i in node.inputs]
outs = [make_argument(o, 'o%d' % (n,)) for n, o in
enumerate(node.outputs) if not n in self.inplace_pattern]
scal_out = [scalar.Scalar(o.dtype) for o in node.outputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
[o() for o in scal_out])
code = self.scalar_op.c_support_code_apply(fake_node, "test")
if code:
raise SupportCodeError(code)
except MethodNotDefined:
pass
try:
support_code = self.scalar_op.c_support_code()
if (support_code.strip() != "#define THEANO_MACRO_MOD(x,y) (x % y)" and
support_code.strip() != ""):
# The macro is fine, the C++ struct is not.
raise SupportCodeError(support_code)
except MethodNotDefined:
pass
return node
def generate_kernel(self, node, nodename): def generate_kernel(self, node, nodename):
inps = [make_argument(i, 'i%d' % (n,)) for n, i in inps = [make_argument(i, 'i%d' % (n,)) for n, i in
...@@ -80,27 +107,9 @@ class GpuElemwise(HideC, Elemwise): ...@@ -80,27 +107,9 @@ class GpuElemwise(HideC, Elemwise):
fake_node = Apply(self.scalar_op, [i() for i in scal_ins], fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
[o() for o in scal_out]) [o() for o in scal_out])
try:
code = self.scalar_op.c_support_code_apply(fake_node, nodename)
if code:
raise SupportCodeError(code)
except MethodNotDefined:
pass
support_code = ""
try:
support_code = self.scalar_op.c_support_code()
except MethodNotDefined:
pass
if (support_code.strip() != "#define THEANO_MACRO_MOD(x,y) (x % y)" and
support_code.strip() != ""):
# The macro is fine, the C++ struct is not.
raise SupportCodeError(support_code)
scal_out = [] scal_out = []
oi = 0 oi = 0
for n in range(len(fake_node.outputs)): for n in range(len(node.outputs)):
if n in self.inplace_pattern: if n in self.inplace_pattern:
scal_out.append(inps[self.inplace_pattern[n]].name+'[i]') scal_out.append(inps[self.inplace_pattern[n]].name+'[i]')
else: else:
...@@ -113,37 +122,229 @@ class GpuElemwise(HideC, Elemwise): ...@@ -113,37 +122,229 @@ class GpuElemwise(HideC, Elemwise):
dict(fail='return;')) dict(fail='return;'))
# Translate types for scalar composite ops (except complex). # Translate types for scalar composite ops (except complex).
support_code += """ support_code = """
#define npy_float64 ga_double #ifdef _MSC_VER
#define npy_float32 ga_float #define signed __int8 int8_t
#define npy_uint8 ga_ubyte #define unsigned __int8 uint8_t
#define npy_int8 ga_byte #define signed __int16 int16_t
#define npy_uint16 ga_ushort #define unsigned __int16 uint16_t
#define npy_int16 ga_short #define signed __int32 int32_t
#define npy_uint32 ga_uint #define unsigned __int32 uint32_t
#define npy_int32 ga_int #define signed __int64 int64_t
#define npy_uint64 ga_ulong #define unsigned __int64 uint64_t
#define npy_int64 ga_long #else
#include <stdint.h>
#endif
#define ga_bool uint8_t
#define ga_byte int8_t
#define ga_ubyte uint8_t
#define ga_short int16_t
#define ga_ushort uint16_t
#define ga_int int32_t
#define ga_uint uint32_t
#define ga_long int64_t
#define ga_ulong uint64_t
#define ga_float float
#define ga_double double
#define ga_half uint16_t
""" """
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code) return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_headers(self):
    """Headers needed by the generated host code.

    ``cuda.h`` for the driver API types (CUdeviceptr) and the compyte
    headers for the extension lookup and numpy-compatibility macros.
    """
    headers = ['cuda.h']
    headers.append('<compyte/extension.h>')
    headers.append('<compyte/numpy_compat.h>')
    return headers
def c_compiler(self):
    # The generated C code contains CUDA kernel launches (<<<...>>> in
    # c_code), so it must be built with nvcc rather than the default
    # host C++ compiler.
    return NVCC_compiler
def c_support_code_apply(self, node, nodename):
    """Render the per-node kernel sources as C support code.

    Emits the CLUDA preamble, a declaration for the ``cuda_get_ptr``
    extension hook, one ``elem_<rank>`` kernel for every rank from 0 up
    to the node's ndim, and the contiguous-case kernel source.

    This is useless by itself, but will serve an eventual c_code
    implementation.
    """
    import pycuda._cluda
    kernel = self.generate_kernel(node, nodename)
    ndim = node.inputs[0].type.ndim
    parts = ["CUdeviceptr (*cuda_get_ptr)(gpudata *g);",
             pycuda._cluda.CLUDA_PREAMBLE]
    for rank in range(ndim + 1):
        parts.append(kernel.render_basic(rank, name="elem_" + str(rank)) + ';')
    parts.append(kernel.contig_src + ';')
    return '\n'.join(parts)
def c_init_code(self):
    """C statements run once at module initialization.

    Fetches the ``cuda_get_ptr`` extension function from compyte so the
    generated code can translate a ``gpudata`` handle into a raw
    ``CUdeviceptr`` for kernel launches.
    """
    fetch_ext = ('cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))'
                 'compyte_get_extension("cuda_get_ptr");')
    return [fetch_ext]
def c_code(self, node, name, inputs, outputs, sub):
    """Generate the C (CUDA host) code computing this elemwise node.

    The emitted code:
      1. checks every input has the expected number of dimensions and
         that non-broadcasted dimensions agree,
      2. allocates C-contiguous outputs (or aliases an input for
         in-place outputs, validating its shape),
      3. chooses a launch configuration and launches the ``elem_<nd>``
         kernel rendered by c_support_code_apply.
    """
    nd = node.outputs[0].ndim
    fail = sub["fail"]
    initial_dims = ','.join('1' for i in xrange(nd))
    opname = str(self.scalar_op)

    # Launch-configuration variables and the output-shape accumulator
    # (NULL pointer for the 0-d case).
    emitted_inames = {}
    code = """
        int n_blocks = 0;
        int threads_per_block = 0;
        size_t numEls = 0;
        """
    if nd > 0:
        code += """
        size_t dims[%(nd)s] = {%(initial_dims)s};
        """ % locals()
    else:
        code += """
        size_t *dims = NULL;
        """

    # Emit one broadcast-pattern array per distinct input variable.
    for idx, iname in enumerate(inputs):
        if iname in emitted_inames:
            assert emitted_inames[iname] is node.inputs[idx]
            continue
        broadcasts = map(int, node.inputs[idx].broadcastable)
        broadcasts = ', '.join(map(str, broadcasts))
        nd = node.inputs[idx].ndim
        if nd > 0:
            code += """
        int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
        """ % locals()
        else:
            code += """
        int *broadcasts_%(iname)s = NULL;
        """ % locals()
        emitted_inames[iname] = node.inputs[idx]

    #check that all inputs have valid dimensions
    emitted_inames = {}
    for idx, iname in enumerate(inputs):
        if iname in emitted_inames:
            continue
        code += """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
        {
            PyErr_Format(PyExc_TypeError,
                         "need %(nd)s dims, not %%i",
                         PyGpuArray_NDIM(%(iname)s));
            %(fail)s;
        }
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? PyGpuArray_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] &&
                 PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
                             " %(idx)d (indices start at 0) has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(iname)s)[i],
                             dims[i]
                            );
                %(fail)s;
            }
        }
        """ % locals()
        emitted_inames[iname] = True

    #check that all outputs have valid dimensions
    for idx, oname in enumerate(outputs):
        typecode = dtype_to_typecode(node.outputs[idx].dtype)
        if idx not in self.inplace_pattern.keys():
            # Fresh output: drop any reused storage whose shape or
            # contiguity no longer matches, then allocate C-contiguous.
            code += """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS))
        {
            Py_XDECREF(%(oname)s);
            %(oname)s = NULL;
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = pygpu_empty(%(nd)d, dims,
                                    %(typecode)s, GA_C_ORDER,
                                    pygpu_default_context(), Py_None);
            if (!%(oname)s) {
                //TODO, this check don't seam good.
                //TODO, set exception?
                %(fail)s
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        else:
            # In-place output: alias the designated input, but still
            # verify its shape against the broadcasted dims.
            input_idx = self.inplace_pattern[idx]
            iname = inputs[input_idx]
            code += """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
                             " on input %(input_idx)s, has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(oname)s)[i],
                             dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()

    z = outputs[0]
    code += """numEls = PyGpuArray_SIZE(%(z)s);
        //first use at least a full warp
        threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE
        //next start adding multiprocessors
        // UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
        n_blocks = std::min(numEls/threads_per_block +
                               (numEls %% threads_per_block?1:0),
                           (size_t)30);
        // next start adding more warps per multiprocessor
        if (threads_per_block * n_blocks < numEls)
            threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
        //std::cerr << "calling callkernel returned\\n";
        """ % locals()

    code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
    param = []
    for i in range(nd):
        param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
                                                          i=i))
    for n, (name, var) in enumerate(zip(inputs + outputs,
                                        node.inputs + node.outputs)):
        # In-place outputs alias an input whose pointer/strides are
        # already in the parameter list, so skip them here.
        if (n - len(inputs)) in self.inplace_pattern:
            continue
        dtype = var.dtype
        param.append("(npy_%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
        param.append("%(name)s->ga.offset" % locals())
        for i in range(nd):
            # A broadcasted (size-1) dimension gets stride 0 so the same
            # element is read for every index along that axis.
            param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
    code += ',\n'.join(param) + ");\n"
    if config.gpuarray.sync:
        # Bug fix: this was `% dict(zz=zz)`, but no local `zz` exists
        # (the output variable is `z`), so enabling config.gpuarray.sync
        # raised NameError.  Bind the template's `zz` key to `z`.
        code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=z)
    return str(code)
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
# Try to reuse the kernel from a previous call to hopefully # Try to reuse the kernel from a previous call to hopefully
# avoid recompiling # avoid recompiling
...@@ -167,11 +368,17 @@ class GpuElemwise(HideC, Elemwise): ...@@ -167,11 +368,17 @@ class GpuElemwise(HideC, Elemwise):
else: else:
args.append(ensure_allocated(stor, out_shape, out.type.dtype)) args.append(ensure_allocated(stor, out_shape, out.type.dtype))
# the dict call is there to avoid a syntax error in python < 2.6 node._cache_elemwise_k(*args, broadcast=True)
node._cache_elemwise_k(*args, **dict(broadcast=True))
if config.gpuarray.sync: if config.gpuarray.sync:
output_storage[0][0].sync() output_storage[0][0].sync()
def c_code_cache_version(self):
    """Cache-version tuple for the generated C code.

    Piggybacks on the scalar op's own version; a falsy version from the
    scalar op (meaning "do not cache") is propagated unchanged.
    """
    scalar_version = self.scalar_op.c_code_cache_version()
    return (1, scalar_version) if scalar_version else scalar_version
class SupportCodeError(Exception): class SupportCodeError(Exception):
""" """
......
...@@ -15,11 +15,14 @@ from theano.sandbox.gpuarray.type import GpuArrayType ...@@ -15,11 +15,14 @@ from theano.sandbox.gpuarray.type import GpuArrayType
from pygpu.array import gpuarray from pygpu.array import gpuarray
# This is actually a test for GpuElemwise # This is actually a test for GpuElemwise
class test_gpu_Broadcast(test_Broadcast): class test_gpu_Broadcast(test_Broadcast):
op = GpuElemwise op = GpuElemwise
type = GpuArrayType type = GpuArrayType
cop = GpuElemwise
ctype = GpuArrayType
def rand_val(self, shp): def rand_val(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray)) return rand_gpuarray(*shp, **dict(cls=gpuarray))
...@@ -27,8 +30,8 @@ class test_gpu_Broadcast(test_Broadcast): ...@@ -27,8 +30,8 @@ class test_gpu_Broadcast(test_Broadcast):
#cop = GpuElemwise #cop = GpuElemwise
#ctype = GpuArrayType #ctype = GpuArrayType
#def rand_cval(self, shp): def rand_cval(self, shp):
# return rand_gpuarray(*shp, **dict(cls=gpuarray)) return rand_gpuarray(*shp, **dict(cls=gpuarray))
class test_GpuDimShuffle(test_DimShuffle): class test_GpuDimShuffle(test_DimShuffle):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论