提交 5aa6945a authored 作者: James Bergstra's avatar James Bergstra

adding some faster Elemwise kernels, but more work to do still

上级 a3758920
import StringIO import StringIO, sys
import numpy import numpy
from theano import Op, Type, Apply, Variable, Constant from theano import Op, Type, Apply, Variable, Constant
...@@ -338,7 +338,7 @@ class GpuElemwise(Op): ...@@ -338,7 +338,7 @@ class GpuElemwise(Op):
print >> sio, "// Input ", ipos, str(i.type) print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type) print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s_%s(unsigned int numEls" %(self.scalar_op.__class__.__name__,nodename) print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %(self.scalar_op.__class__.__name__,nodename, id(self))
if (nd): if (nd):
print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
if (nd): if (nd):
...@@ -411,11 +411,178 @@ class GpuElemwise(Op): ...@@ -411,11 +411,178 @@ class GpuElemwise(Op):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print >> sio, "}" print >> sio, "}"
#print sio.getvalue() print sio.getvalue()
return sio.getvalue()
def naivealgo_c_src_kernel_tiling(self, node, nodename):
""" The kernel applies to problems with <= 5 dimensions """
#The kernel is intended to be structured roughly like this:
"""
static __global__ void kernel()
{
for (int v = blockIdx.y; v < dim0; v += gridDim.x)
{
for (int w = blockIdx.y; w < dim1; w += gridDim.y)
{
for (int x = threadIdx.x; x < dim2; x += blockDim.x)
{
for (int y = threadIdx.y; y < dim3; y += blockDim.y)
{
for (int z = threadIdx.z; z < dim4; z += blockDim.z)
{
out[v * out_stride[0] + ...] = f(in1[...], in2[...])
}
}
}
}
}
}
"""
nd = node.outputs[0].type.ndim
sio = StringIO.StringIO()
#print 'C_SRC_KERNEL', sio.getvalue()
def _logical_scalar(x):
return all(x.type.broadcastable)
if nd in (4,):
# print some leading comments to make the code easier to read
for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s_%s_%s_%s(unsigned int numEls" %(
self.scalar_op.__class__.__name__,
nodename,
id(self),
'tiling%i'%nd)
if (nd):
print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
if (nd):
print >> sio, "\t,", ", ".join("const int log2_dim%i" % i for i in xrange(nd))
#declare inputs
for ipos, i in enumerate(node.inputs):
s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", s
#declare outputs
for ipos, i in enumerate(node.outputs):
s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", s
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos
print >> sio, "\t)\n{"
# For each input that is a scalar which has been broadcasted to a tensor,
# load it into a local variable
print >> sio, " __shared__ float value0[%i];" % len(node.inputs)
print >> sio, " if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
for ipos, i in enumerate(node.inputs):
if _logical_scalar(i):
print >> sio, " value0[%i] = i%i_data[0];" % (ipos, ipos)
print >> sio, " }"
if (nd == 4):
print >> sio, """
for (int pos0 = blockIdx.x; pos0 < dim0; pos0 += gridDim.x)
{
for (int pos1 = blockIdx.y; pos1 < dim1; pos1 += gridDim.y)
{
//for (int pos2 = threadIdx.x; pos2 < dim2; pos2 += blockDim.x)
for (int pos2 = threadIdx.y; pos2 < dim2; pos2 += blockDim.y)
{
//for (int pos3 = threadIdx.y; pos3 < dim3; pos3 += blockDim.y)
for (int pos3 = threadIdx.x; pos3 < dim3; pos3 += blockDim.x)
{
"""
else:
raise NotImplementedError()
for ipos, i in enumerate(node.inputs):
if not _logical_scalar(i):
print >> sio, " const float * ii_i%i_data = i%i_data;" % (ipos, ipos)
for ipos, i in enumerate(node.outputs):
print >> sio, " float * ii_o%i_data = o%i_data;" % (ipos, ipos)
for d in xrange(nd):
for ipos, i in enumerate(node.inputs):
if not _logical_scalar(i):
print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)
for ipos, i in enumerate(node.outputs):
print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)
# perform the scalar operation on the input and output references
#TODO: What if the scalar_op needs support_code??
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[scalar.Scalar(dtype = input.type.dtype)() for input in node.inputs],
[scalar.Scalar(dtype = output.type.dtype)() for output in node.outputs])
, nodename + '_scalar_'
, [('value0[%i]' if _logical_scalar(i) else 'ii_i%i_data[0]')%ipos for ipos, i in enumerate(node.inputs)]
, ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)]
, sub=dict(fail='return;')) #TODO: set a failure code somehow!!!
print >> sio, " ", task_code
print >> sio, " }" * nd
#TODO: insert runtime stride checks that select the best loop order either here, or in
# the host code that launched the kernel (host code probably better spot)
#indent = " "*(4*d+7)
#for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print >> sio, "}"
print sio.getvalue()
return sio.getvalue()
def naivealgo_c_src_kernel_Ccontiguous(self, node, nodename):
nd = node.outputs[0].type.ndim
sio = StringIO.StringIO()
#print 'C_SRC_KERNEL', sio.getvalue()
def _logical_scalar(x):
return all(x.type.broadcastable)
for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s_%s_Ccontiguous (unsigned int numEls" %(self.scalar_op.__class__.__name__,nodename)
#declare inputs
for ipos, i in enumerate(node.inputs):
print >> sio, "\t,", "const float * i%i_data" % ipos
#declare outputs
for ipos, i in enumerate(node.outputs):
print >> sio, "\t,", "float * o%i_data" % ipos
print >> sio, "\t)\n{"
print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
print >> sio, " const int numThreads = blockDim.x * gridDim.x;"
#loop over the elements to be treated by this kernel call
print >> sio, " for (int i = idx; i < numEls; i += numThreads) {"
# perform the scalar operation on the input and output references
#TODO: What if the scalar_op needs support_code??
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[scalar.Scalar(dtype = input.type.dtype)() for input in node.inputs],
[scalar.Scalar(dtype = output.type.dtype)() for output in node.outputs])
, nodename + '_scalar_'
, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)]
, ['o%i_data[i]'%ipos for ipos, i in enumerate(node.outputs)]
, sub=dict(fail='return;')) #TODO: set a failure code somehow!!!
print >> sio, " ", task_code
print >> sio, " }"
print >> sio, "}"
print sio.getvalue()
return sio.getvalue() return sio.getvalue()
def naivealgo_c_src_callkernel(self, node, nodename): def naivealgo_c_src_callkernel(self, node, nodename):
nd = node.outputs[0].type.ndim nd = node.outputs[0].type.ndim
id_self = id(self)
d = dict() d = dict()
#input_params and output_params go into the function declaration/definition #input_params and output_params go into the function declaration/definition
input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos) input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos)
...@@ -431,6 +598,73 @@ class GpuElemwise(Op): ...@@ -431,6 +598,73 @@ class GpuElemwise(Op):
prod_dims = '*'.join("dims[%i]"%di for di in xrange(nd)) prod_dims = '*'.join("dims[%i]"%di for di in xrange(nd))
scalar_op=self.scalar_op.__class__.__name__
### NOTE WELL: log2_dims is not initialized on input to this function... it is meant as
### storage space where the log2_dims *could* be computed and stored.
sio = StringIO.StringIO()
print >> sio, """
static inline bool
_is_c_contiguous_%(nodename)s(const int nd, const int * dims, const int * strides)
{
bool c_contiguous = true;
int size = 1;
for (int i = nd-1; (i >= 0) and c_contiguous; --i)
{
if (dims[i] == 1)
continue;
if (strides[i] != size)
{
c_contiguous = false;
}
size = size * dims[i];
}
return c_contiguous;
}
static void callkernel_%(nodename)s(unsigned int numEls, const int d,
const int * dims, const int * log2_dims,
%(input_params)s,
%(output_params)s)
{
numEls = %(prod_dims)s;
std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s_%(id_self)s w numEls" << numEls << "\\n";
""" %locals()
# DEBUGPRINT
print >> sio, 'std::cerr << ' + " << ' ' << ".join(['" "']+list("dims[%i]"%di
for di in xrange(nd)) + ["'\\n';"])
# DEBUGPRINT
for ipos in xrange(len(node.inputs)):
print >> sio, """
std::cerr << " %(ipos)s " <<
""" %locals() + " << ' ' << ".join(["i%i_data"%ipos]
+ list("i%i_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; '''
# Try to launch the Ccontiguous version
kernel_call_args = ["numEls"]
for ipos in xrange(len(node.inputs)):
kernel_call_args.append("i%i_data"%ipos)
for ipos in xrange(len(node.outputs)):
kernel_call_args.append("o%i_data"%ipos)
kernel_call_args = ", ".join(kernel_call_args)
print >> sio, "if (" \
+ " && ".join(["_is_c_contiguous_%s(%i, dims, i%i_str)" % (nodename, nd, ipos) for ipos in xrange(len(node.inputs))]) \
+ ')'
print >> sio, """
{
std::cerr << " Running Ccontiguous version \\n";
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
//TODO: check error value. If success, return.
}
""" %locals()
#
# Try to launch a general version
#
# kernel_call_args are used to invoke the cuda kernel # kernel_call_args are used to invoke the cuda kernel
kernel_call_args = ["numEls"] kernel_call_args = ["numEls"]
kernel_call_args.extend("dims[%i]"%di for di in xrange(nd)) kernel_call_args.extend("dims[%i]"%di for di in xrange(nd))
...@@ -449,38 +683,57 @@ class GpuElemwise(Op): ...@@ -449,38 +683,57 @@ class GpuElemwise(Op):
#kernel_call_args.append( "%s, o%i_data" % (strides, ipos)) #kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
kernel_call_args = ", ".join(kernel_call_args) kernel_call_args = ", ".join(kernel_call_args)
d.update(locals()) if (nd == 4): # tiling kernel
d["scalar_op"]=self.scalar_op.__class__.__name__ print >> sio, """
else
{
std::cerr << " Running tiling 4D \\n";
dim3 gridDim(dims[0], dims[1]);
dim3 blockDim;
if (0) {
blockDim.y = std::min(dims[3], NUM_VECTOR_OP_THREADS_PER_BLOCK);
blockDim.x = std::min(dims[2], (int)(NUM_VECTOR_OP_THREADS_PER_BLOCK/ blockDim.y));
}
else {
blockDim.x = std::min(dims[3], NUM_VECTOR_OP_THREADS_PER_BLOCK);
blockDim.y = std::min(dims[2], (int)(NUM_VECTOR_OP_THREADS_PER_BLOCK/ blockDim.x));
}
### NOTE WELL: log2_dims is not initialized on input to this function... it is meant as kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_tiling4<<<gridDim, blockDim>>>(%(kernel_call_args)s);
### storage space where the log2_dims *could* be computed and stored.
rval = """
static void callkernel_%(nodename)s(unsigned int numEls, const int d, if( cudaSuccess != cudaGetLastError())
const int * dims, const int * log2_dims, {
%(input_params)s, std::cerr << " DEBUG: tiling4 call failure... falling back to general version \\n";
%(output_params)s) int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
{ int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
numEls = %(prod_dims)s; kernel_%(scalar_op)s_%(nodename)s_%(id_self)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); }
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS); }
kernel_%(scalar_op)s_%(nodename)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
//std::cerr << "ADDCALL a str" << i0_str[0] << " "<< i0_str[1] << "\\n";
//std::cerr << "ADDCALL a data" << i0_data << "\\n";
//std::cerr << "ADDCALL b str" << i1_str[0] << " "<< i1_str[1] << "\\n";
//std::cerr << "ADDCALL b data" << i1_data << "\\n";
//std::cerr << "ADDCALL z str" << o0_str[0] << " "<< o0_str[1] << "\\n";
//std::cerr << "ADDCALL z data" << o0_data << "\\n";
} }
""" %d """ %locals()
return rval else:
print >> sio, """
else
{
std::cerr << " Running general version \\n";
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
}
}
""" %locals()
#N.B. cudaGetLastError is called by c_code
return sio.getvalue()
def naivealgo_c_support_code_apply(self, node, nodename): def naivealgo_c_support_code_apply(self, node, nodename):
return self.naivealgo_c_src_kernel(node, nodename) + self.naivealgo_c_src_callkernel(node, nodename) return self.naivealgo_c_src_kernel(node, nodename) \
+ self.naivealgo_c_src_kernel_Ccontiguous(node, nodename) \
+ self.naivealgo_c_src_kernel_tiling(node, nodename) \
+ self.naivealgo_c_src_callkernel(node, nodename)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
#rval = self.naivealgo_c_support_code_apply(node, nodename) rval = self.naivealgo_c_support_code_apply(node, nodename)
rval = self.recalgo_c_support_code_apply(node, nodename) #rval = self.recalgo_c_support_code_apply(node, nodename)
#print rval #print rval
return rval return rval
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论