Commit bf825258 authored by James Bergstra

merge

import StringIO
import StringIO, sys
import numpy
from theano import Op, Type, Apply, Variable, Constant
@@ -143,7 +143,7 @@ class GpuElemwise(Op):
#define INTMOD_POW2(a, b) (a & ((1<<b)-1))
"""
def c_src_kernel(self, node, nodename):
def recalgo_c_src_kernel(self, node, nodename):
nd = node.outputs[0].type.ndim
sio = StringIO.StringIO()
#print 'C_SRC_KERNEL', sio.getvalue()
@@ -151,7 +151,6 @@ class GpuElemwise(Op):
def _logical_scalar(x):
return all(x.type.broadcastable)
print >> sio, "// Elemwise kernel for ", str(self.scalar_op)
for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs):
@@ -227,11 +226,7 @@ class GpuElemwise(Op):
#print sio.getvalue()
return sio.getvalue()
def c_support_code_apply(self, node, nodename):
return self.c_src_kernel(node, nodename) + \
self.c_src_callkernel(node, nodename)
def c_src_callkernel(self, node, nodename):
def recalgo_c_src_callkernel(self, node, nodename):
nd = node.outputs[0].type.ndim
d = dict()
#input_params and output_params go into the function declaration/definition
@@ -328,6 +323,420 @@ class GpuElemwise(Op):
}
""" %d
def recalgo_c_support_code_apply(self, node, nodename):
return self.recalgo_c_src_kernel(node, nodename) + self.recalgo_c_src_callkernel(node, nodename)
def naivealgo_c_src_kernel(self, node, nodename):
nd = node.outputs[0].type.ndim
sio = StringIO.StringIO()
#print 'C_SRC_KERNEL', sio.getvalue()
def _logical_scalar(x):
return all(x.type.broadcastable)
for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %(self.scalar_op.__class__.__name__,nodename, id(self))
if (nd):
print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
if (nd):
print >> sio, "\t,", ", ".join("const int log2_dim%i" % i for i in xrange(nd))
#declare inputs
for ipos, i in enumerate(node.inputs):
s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", s
#declare outputs
for ipos, i in enumerate(node.outputs):
s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", s
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos
print >> sio, "\t)\n{"
print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
print >> sio, " const int numThreads = blockDim.x * gridDim.x;"
# For each input that is a scalar which has been broadcasted to a tensor,
# load it into a local variable
for ipos, i in enumerate(node.inputs):
if _logical_scalar(i):
print >> sio, " const float ii_i%i_value = i%i_data[0];" % (ipos, ipos)
#TODO: insert code to check for strides of 1, and use a different loop
#loop over the elements to be treated by this kernel call
print >> sio, " for (int i = idx; i < numEls; i += numThreads) {"
# calculate the data pointers for all arguments
print >> sio, " int ii = i;"
for ipos, i in enumerate(node.inputs):
if not _logical_scalar(i):
print >> sio, " const float * ii_i%i_data = i%i_data;" % (ipos, ipos)
for ipos, i in enumerate(node.outputs):
print >> sio, " float * ii_o%i_data = o%i_data;" % (ipos, ipos)
for d in xrange(nd-1, -1, -1):
if d > 0:
#print >> sio, " int pos%i = INTMOD_POW2(ii, log2_dim%i);" %(d, d)
#print >> sio, " ii = INTDIV_POW2(ii, log2_dim%i);" %d
print >> sio, " int pos%i = ii %% dim%i;" %(d, d)
print >> sio, " ii = ii / dim%i;" %d
else:
print >> sio, " int pos%i = ii;" %d
for ipos, i in enumerate(node.inputs):
if not _logical_scalar(i):
print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)
for ipos, i in enumerate(node.outputs):
print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)
# perform the scalar operation on the input and output references
#TODO: What if the scalar_op needs support_code??
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[scalar.Scalar(dtype = input.type.dtype)() for input in node.inputs],
[scalar.Scalar(dtype = output.type.dtype)() for output in node.outputs])
, nodename + '_scalar_'
, [('ii_i%i_value' if _logical_scalar(i) else 'ii_i%i_data[0]')%ipos for ipos, i in enumerate(node.inputs)]
, ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)]
, sub=dict(fail='return;')) #TODO: set a failure code somehow!!!
print >> sio, " ", task_code
print >> sio, " }"
#TODO: insert runtime stride checks that select the best loop order either here, or in
# the host code that launched the kernel (host code probably better spot)
#indent = " "*(4*d+7)
#for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print >> sio, "}"
print sio.getvalue()
return sio.getvalue()
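For concreteness, here is a hand-written sketch of the kind of source naivealgo_c_src_kernel emits, for a hypothetical 2-D addition node with two strided inputs and one output. The Add/node0/0 naming is invented for illustration; real kernels carry the scalar op class name, the node name, and id(self):

// Sketch (illustrative): generated naive kernel for o0 = i0 + i1 on a 2-D tensor.
static __global__ void kernel_Add_node0_0(unsigned int numEls
    , const int dim0, const int dim1
    , const int log2_dim0, const int log2_dim1  // reserved for a power-of-2 path, unused here
    , const float * i0_data, int i0_str_0, int i0_str_1
    , const float * i1_data, int i1_str_0, int i1_str_1
    , float * o0_data, int o0_str_0, int o0_str_1
    )
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int numThreads = blockDim.x * gridDim.x;
    for (int i = idx; i < numEls; i += numThreads) {
        // decompose the flat index, innermost dimension first,
        // and advance each pointer by pos * stride
        int ii = i;
        const float * ii_i0_data = i0_data;
        const float * ii_i1_data = i1_data;
        float * ii_o0_data = o0_data;
        int pos1 = ii % dim1;
        ii = ii / dim1;
        ii_i0_data += pos1 * i0_str_1;
        ii_i1_data += pos1 * i1_str_1;
        ii_o0_data += pos1 * o0_str_1;
        int pos0 = ii;
        ii_i0_data += pos0 * i0_str_0;
        ii_i1_data += pos0 * i1_str_0;
        ii_o0_data += pos0 * o0_str_0;
        ii_o0_data[0] = ii_i0_data[0] + ii_i1_data[0];  // the scalar_op task code
    }
}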
def naivealgo_c_src_kernel_tiling(self, node, nodename):
""" The kernel applies to problems with <= 5 dimensions """
#The kernel is intended to be structured roughly like this:
"""
static __global__ void kernel()
{
for (int v = blockIdx.x; v < dim0; v += gridDim.x)
{
for (int w = blockIdx.y; w < dim1; w += gridDim.y)
{
for (int x = threadIdx.x; x < dim2; x += blockDim.x)
{
for (int y = threadIdx.y; y < dim3; y += blockDim.y)
{
for (int z = threadIdx.z; z < dim4; z += blockDim.z)
{
out[v * out_stride[0] + ...] = f(in1[...], in2[...])
}
}
}
}
}
}
"""
nd = node.outputs[0].type.ndim
sio = StringIO.StringIO()
#print 'C_SRC_KERNEL', sio.getvalue()
def _logical_scalar(x):
return all(x.type.broadcastable)
if nd in (4,):
# print some leading comments to make the code easier to read
for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s_%s_%s_%s(unsigned int numEls" %(
self.scalar_op.__class__.__name__,
nodename,
id(self),
'tiling%i'%nd)
if (nd):
print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
if (nd):
print >> sio, "\t,", ", ".join("const int log2_dim%i" % i for i in xrange(nd))
#declare inputs
for ipos, i in enumerate(node.inputs):
s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", s
#declare outputs
for ipos, i in enumerate(node.outputs):
s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", s
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos
print >> sio, "\t)\n{"
# For each input that is a scalar which has been broadcasted to a tensor,
# load it into a local variable
print >> sio, " __shared__ float value0[%i];" % len(node.inputs)
print >> sio, " if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
for ipos, i in enumerate(node.inputs):
if _logical_scalar(i):
print >> sio, " value0[%i] = i%i_data[0];" % (ipos, ipos)
print >> sio, " }"
if (nd == 4):
print >> sio, """
for (int pos0 = blockIdx.x; pos0 < dim0; pos0 += gridDim.x)
{
for (int pos1 = blockIdx.y; pos1 < dim1; pos1 += gridDim.y)
{
//for (int pos2 = threadIdx.x; pos2 < dim2; pos2 += blockDim.x)
for (int pos2 = threadIdx.y; pos2 < dim2; pos2 += blockDim.y)
{
//for (int pos3 = threadIdx.y; pos3 < dim3; pos3 += blockDim.y)
for (int pos3 = threadIdx.x; pos3 < dim3; pos3 += blockDim.x)
{
"""
else:
raise NotImplementedError()
for ipos, i in enumerate(node.inputs):
if not _logical_scalar(i):
print >> sio, " const float * ii_i%i_data = i%i_data;" % (ipos, ipos)
for ipos, i in enumerate(node.outputs):
print >> sio, " float * ii_o%i_data = o%i_data;" % (ipos, ipos)
for d in xrange(nd):
for ipos, i in enumerate(node.inputs):
if not _logical_scalar(i):
print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)
for ipos, i in enumerate(node.outputs):
print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)
# perform the scalar operation on the input and output references
#TODO: What if the scalar_op needs support_code??
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[scalar.Scalar(dtype = input.type.dtype)() for input in node.inputs],
[scalar.Scalar(dtype = output.type.dtype)() for output in node.outputs])
, nodename + '_scalar_'
, [('value0[%i]' if _logical_scalar(i) else 'ii_i%i_data[0]')%ipos for ipos, i in enumerate(node.inputs)]
, ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)]
, sub=dict(fail='return;')) #TODO: set a failure code somehow!!!
print >> sio, " ", task_code
print >> sio, " }" * nd
#TODO: insert runtime stride checks that select the best loop order either here, or in
# the host code that launched the kernel (host code probably better spot)
#indent = " "*(4*d+7)
#for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print >> sio, "}"
print sio.getvalue()
return sio.getvalue()
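A matching sketch of what the 4-D tiling generator emits for the same invented add node. The __shared__ value0 preload (and its barrier) is omitted because this example has no broadcasted scalar inputs, and numEls / log2_dim* are accepted but unused, as in the generated code:

// Sketch (illustrative): 4-D tiling kernel for o0 = i0 + i1.
static __global__ void kernel_Add_node0_0_tiling4(unsigned int numEls
    , const int dim0, const int dim1, const int dim2, const int dim3
    , const int log2_dim0, const int log2_dim1, const int log2_dim2, const int log2_dim3
    , const float * i0_data, int i0_str_0, int i0_str_1, int i0_str_2, int i0_str_3
    , const float * i1_data, int i1_str_0, int i1_str_1, int i1_str_2, int i1_str_3
    , float * o0_data, int o0_str_0, int o0_str_1, int o0_str_2, int o0_str_3
    )
{
    // blocks tile the two outer dimensions, threads the two inner ones;
    // threadIdx.x runs over the innermost dimension for coalesced access
    for (int pos0 = blockIdx.x; pos0 < dim0; pos0 += gridDim.x)
    {
        for (int pos1 = blockIdx.y; pos1 < dim1; pos1 += gridDim.y)
        {
            for (int pos2 = threadIdx.y; pos2 < dim2; pos2 += blockDim.y)
            {
                for (int pos3 = threadIdx.x; pos3 < dim3; pos3 += blockDim.x)
                {
                    const float * ii_i0_data = i0_data + pos0*i0_str_0 + pos1*i0_str_1 + pos2*i0_str_2 + pos3*i0_str_3;
                    const float * ii_i1_data = i1_data + pos0*i1_str_0 + pos1*i1_str_1 + pos2*i1_str_2 + pos3*i1_str_3;
                    float * ii_o0_data = o0_data + pos0*o0_str_0 + pos1*o0_str_1 + pos2*o0_str_2 + pos3*o0_str_3;
                    ii_o0_data[0] = ii_i0_data[0] + ii_i1_data[0];
                }
            }
        }
    }
}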
def naivealgo_c_src_kernel_Ccontiguous(self, node, nodename):
nd = node.outputs[0].type.ndim
sio = StringIO.StringIO()
#print 'C_SRC_KERNEL', sio.getvalue()
def _logical_scalar(x):
return all(x.type.broadcastable)
for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s_%s_Ccontiguous (unsigned int numEls" %(self.scalar_op.__class__.__name__,nodename)
#declare inputs
for ipos, i in enumerate(node.inputs):
print >> sio, "\t,", "const float * i%i_data" % ipos
#declare outputs
for ipos, i in enumerate(node.outputs):
print >> sio, "\t,", "float * o%i_data" % ipos
print >> sio, "\t)\n{"
print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
print >> sio, " const int numThreads = blockDim.x * gridDim.x;"
#loop over the elements to be treated by this kernel call
print >> sio, " for (int i = idx; i < numEls; i += numThreads) {"
# perform the scalar operation on the input and output references
#TODO: What if the scalar_op needs support_code??
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[scalar.Scalar(dtype = input.type.dtype)() for input in node.inputs],
[scalar.Scalar(dtype = output.type.dtype)() for output in node.outputs])
, nodename + '_scalar_'
, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)]
, ['o%i_data[i]'%ipos for ipos, i in enumerate(node.outputs)]
, sub=dict(fail='return;')) #TODO: set a failure code somehow!!!
print >> sio, " ", task_code
print >> sio, " }"
print >> sio, "}"
print sio.getvalue()
return sio.getvalue()
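The contiguous specialization collapses all of the stride handling to a flat grid-stride loop; a sketch for the same invented add node:

// Sketch (illustrative): contiguous kernel for o0 = i0 + i1.
static __global__ void kernel_Add_node0_Ccontiguous(unsigned int numEls
    , const float * i0_data
    , const float * i1_data
    , float * o0_data
    )
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int numThreads = blockDim.x * gridDim.x;
    // strides are known to match a dense row-major layout, so each thread
    // can index the raw buffers directly
    for (int i = idx; i < numEls; i += numThreads) {
        o0_data[i] = i0_data[i] + i1_data[i];
    }
}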
def naivealgo_c_src_callkernel(self, node, nodename):
nd = node.outputs[0].type.ndim
id_self = id(self)
d = dict()
#input_params and output_params go into the function declaration/definition
input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos)
for ipos in xrange(len(node.inputs)))
output_params = ", ".join("float * o%i_data, const int * o%i_str"%(ipos, ipos)
for ipos in xrange(len(node.outputs)))
#input_args and output_args go into the recursive call.
input_args = ", ".join("i%i_data, i%i_str"%(ipos, ipos)
for ipos in xrange(len(node.inputs)))
output_args = ", ".join("o%i_data, o%i_str"%(ipos, ipos)
for ipos in xrange(len(node.outputs)))
prod_dims = '*'.join("dims[%i]"%di for di in xrange(nd))
scalar_op=self.scalar_op.__class__.__name__
### NOTE WELL: log2_dims is not initialized on input to this function... it is meant as
### storage space where the log2_dims *could* be computed and stored.
sio = StringIO.StringIO()
print >> sio, """
static inline bool
_is_c_contiguous_%(nodename)s(const int nd, const int * dims, const int * strides)
{
bool c_contiguous = true;
int size = 1;
for (int i = nd-1; (i >= 0) and c_contiguous; --i)
{
if (dims[i] == 1)
continue;
if (strides[i] != size)
{
c_contiguous = false;
}
size = size * dims[i];
}
return c_contiguous;
}
static void callkernel_%(nodename)s(unsigned int numEls, const int d,
const int * dims, const int * log2_dims,
%(input_params)s,
%(output_params)s)
{
numEls = %(prod_dims)s;
std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s_%(id_self)s w numEls" << numEls << "\\n";
""" %locals()
# DEBUGPRINT
print >> sio, 'std::cerr << ' + " << ' ' << ".join(['" "']+list("dims[%i]"%di
for di in xrange(nd)) + ["'\\n';"])
# DEBUGPRINT
for ipos in xrange(len(node.inputs)):
print >> sio, """
std::cerr << " %(ipos)s " <<
""" %locals() + " << ' ' << ".join(["i%i_data"%ipos]
+ list("i%i_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; '''
# Try to launch the Ccontiguous version
kernel_call_args = ["numEls"]
for ipos in xrange(len(node.inputs)):
kernel_call_args.append("i%i_data"%ipos)
for ipos in xrange(len(node.outputs)):
kernel_call_args.append("o%i_data"%ipos)
kernel_call_args = ", ".join(kernel_call_args)
print >> sio, "if (" \
+ " && ".join(["_is_c_contiguous_%s(%i, dims, i%i_str)" % (nodename, nd, ipos) for ipos in xrange(len(node.inputs))]) \
+ ')'
print >> sio, """
{
std::cerr << " Running Ccontiguous version \\n";
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
//TODO: check error value. If success, return.
}
""" %locals()
#
# Try to launch a general version
#
# kernel_call_args are used to invoke the cuda kernel
kernel_call_args = ["numEls"]
kernel_call_args.extend("dims[%i]"%di for di in xrange(nd))
kernel_call_args.extend("log2_dims[%i]"%di for di in xrange(nd))
for ipos in xrange(len(node.inputs)):
kernel_call_args.append(
", ".join(["i%i_data"%ipos] + list("i%i_str[%i]"%(ipos, di) for di in xrange(nd)))
)
#strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(nd))
#kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
for ipos in xrange(len(node.outputs)):
kernel_call_args.append(
", ".join(["o%i_data"%ipos] + list("o%i_str[%i]"%(ipos, di) for di in xrange(nd)))
)
#strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(nd))
#kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
kernel_call_args = ", ".join(kernel_call_args)
if (nd == 4): # tiling kernel
print >> sio, """
else
{
std::cerr << " Running tiling 4D \\n";
dim3 gridDim(dims[0], dims[1]);
dim3 blockDim;
if (0) {
blockDim.y = std::min(dims[3], NUM_VECTOR_OP_THREADS_PER_BLOCK);
blockDim.x = std::min(dims[2], (int)(NUM_VECTOR_OP_THREADS_PER_BLOCK/ blockDim.y));
}
else {
blockDim.x = std::min(dims[3], NUM_VECTOR_OP_THREADS_PER_BLOCK);
blockDim.y = std::min(dims[2], (int)(NUM_VECTOR_OP_THREADS_PER_BLOCK/ blockDim.x));
}
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_tiling4<<<gridDim, blockDim>>>(%(kernel_call_args)s);
if( cudaSuccess != cudaGetLastError())
{
std::cerr << " DEBUG: tiling4 call failure... falling back to general version \\n";
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
}
}
}
""" %locals()
else:
print >> sio, """
else
{
std::cerr << " Running general version \\n";
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
}
}
""" %locals()
#N.B. cudaGetLastError is called by c_code
return sio.getvalue()
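A worked example of the launch arithmetic shared by all three branches above. The 256-thread and 4096-block caps are stand-in values, not the module's actual NUM_VECTOR_OP_* constants:

#include <algorithm>
// Launch-configuration sketch; the two caps are assumed, not taken from the source.
static void launch_config(unsigned int numEls,
                          unsigned int &threads_per_block,
                          unsigned int &n_blocks)
{
    const unsigned int MAX_THREADS = 256;   // stand-in for NUM_VECTOR_OP_THREADS_PER_BLOCK
    const unsigned int MAX_BLOCKS  = 4096;  // stand-in for NUM_VECTOR_OP_BLOCKS
    threads_per_block = std::min(numEls, MAX_THREADS);
    // round up so every element is covered; the grid-stride loop in the
    // kernel picks up any leftover when the block cap truncates the grid
    n_blocks = std::min(numEls / threads_per_block
                            + (numEls % threads_per_block ? 1u : 0u),
                        MAX_BLOCKS);
    // e.g. numEls = 1000000 -> threads_per_block = 256, n_blocks = 3907
}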
def naivealgo_c_support_code_apply(self, node, nodename):
return self.naivealgo_c_src_kernel(node, nodename) \
+ self.naivealgo_c_src_kernel_Ccontiguous(node, nodename) \
+ self.naivealgo_c_src_kernel_tiling(node, nodename) \
+ self.naivealgo_c_src_callkernel(node, nodename)
def c_support_code_apply(self, node, nodename):
rval = self.naivealgo_c_support_code_apply(node, nodename)
#rval = self.recalgo_c_support_code_apply(node, nodename)
#print rval
return rval
def c_code(self, node, nodename, inputs, outputs, sub):
d = dict(sub)
nd = node.outputs[0].type.ndim
@@ -432,10 +841,12 @@ class GpuElemwise(Op):
}
//std::cerr << "C_CODE %(opname)s END\\n";
""" % locals()
#print sio.getvalue()
return sio.getvalue()
def c_code_cache_version(self):
return (1,0)
#return (1,0)
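# An empty tuple means "no version": Theano then recompiles instead of reusing
# a cached module, which is convenient while the generated code is in flux.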
return ()
class GpuDimShuffle(Op):
def __init__(self, input_broadcastable, new_order):
@@ -20,7 +20,8 @@ def print_mode(mode):
mode.print_summary()
def run_nnet(use_gpu):
n_batch = 16
#n_batch = 16
n_batch = 60 #Fred recommends a nice big batch
n_in = 1024
n_hid = 2048
n_out = 10
@@ -213,19 +214,20 @@ def test_conv_nnet2():
print rval_cpu[0], rval_gpu[0],rval_cpu[0]-rval_gpu[0]
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-4)
def run_conv_nnet2_classif(shared_fn): # pretend we are training LeNet for MNIST
def run_conv_nnet2_classif(shared_fn, isize, ksize):
n_batch = 60
shape_img = (n_batch, 1, 32, 32)
shape_img = (n_batch, 1, isize, isize)
n_kern = 20
shape_kern = (n_kern, 1, 5, 5)
n_kern = 20 # 6 were used in LeNet5
shape_kern = (n_kern, 1, ksize, ksize)
n_kern1 = 30
shape_kern1 = (n_kern1, n_kern, 5, 5)
n_kern1 = 30 # 16 were used in LeNet5
shape_kern1 = (n_kern1, n_kern, ksize, ksize)
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((32, 32), (5, 5), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), (5, 5), 'valid')
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((isize, isize), (ksize, ksize), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2,
logical_hid_shape[1]/2), (ksize, ksize), 'valid')
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
@@ -246,8 +248,8 @@ def run_conv_nnet2_classif(shared_fn): # pretend we are training LeNet for MNIST
hid = tensor.tanh(conv_op(x, w0)+b0)
hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1)
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr)
print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
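In equation form, assuming crossentropy_categorical_1hot(p, t) returns the per-row negative log-probability $-\log p_{n, t_n}$ (its usual definition), the new objective replaces the squared error with

$\mathrm{loss} = lr \cdot \sum_{n=1}^{n\_batch} -\log\big(\mathrm{softmax}(h^{flat}_n V + c)\big)_{y_n}, \qquad y_n = \operatorname{argmax}_j\, y_{nj}$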
@@ -270,10 +272,21 @@ def run_conv_nnet2_classif(shared_fn): # pretend we are training LeNet for MNIST
print_mode(mode)
return rval
def test_conv_nnet2_classif():
numpy.random.seed(23456)
rval_cpu = run_conv_nnet2(shared)
numpy.random.seed(23456)
rval_gpu = run_conv_nnet2(tcn.shared_constructor)
def run_test_conv_nnet2_classif(seed, isize, ksize):
numpy.random.seed(seed)
rval_cpu = run_conv_nnet2_classif(shared, isize, ksize)
numpy.random.seed(seed)
rval_gpu = run_conv_nnet2_classif(tcn.shared_constructor, isize, ksize)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def test_lenet_28(): #MNIST
run_test_conv_nnet2_classif(23485, 28, 5)
def test_lenet_32(): #CIFAR10 / Shapeset
run_test_conv_nnet2_classif(23485, 32, 5)
def test_lenet_108(): # NORB
run_test_conv_nnet2_classif(23485, 108, 7)
def test_lenet_256(): # ImageNet
run_test_conv_nnet2_classif(23485, 256, 9)