提交 bf825258 authored 作者: James Bergstra's avatar James Bergstra

merge

import StringIO import StringIO, sys
import numpy import numpy
from theano import Op, Type, Apply, Variable, Constant from theano import Op, Type, Apply, Variable, Constant
...@@ -143,7 +143,7 @@ class GpuElemwise(Op): ...@@ -143,7 +143,7 @@ class GpuElemwise(Op):
#define INTMOD_POW2(a, b) (a & ((1<<b)-1)) #define INTMOD_POW2(a, b) (a & ((1<<b)-1))
""" """
def c_src_kernel(self, node, nodename): def recalgo_c_src_kernel(self, node, nodename):
nd = node.outputs[0].type.ndim nd = node.outputs[0].type.ndim
sio = StringIO.StringIO() sio = StringIO.StringIO()
#print 'C_SRC_KERNEL', sio.getvalue() #print 'C_SRC_KERNEL', sio.getvalue()
...@@ -151,7 +151,6 @@ class GpuElemwise(Op): ...@@ -151,7 +151,6 @@ class GpuElemwise(Op):
def _logical_scalar(x): def _logical_scalar(x):
return all(x.type.broadcastable) return all(x.type.broadcastable)
print >> sio, "// Elemwise kernel for ", str(self.scalar_op)
for ipos, i in enumerate(node.inputs): for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type) print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
...@@ -227,11 +226,7 @@ class GpuElemwise(Op): ...@@ -227,11 +226,7 @@ class GpuElemwise(Op):
#print sio.getvalue() #print sio.getvalue()
return sio.getvalue() return sio.getvalue()
def c_support_code_apply(self, node, nodename): def recalgo_c_src_callkernel(self, node, nodename):
return self.c_src_kernel(node, nodename) + \
self.c_src_callkernel(node, nodename)
def c_src_callkernel(self, node, nodename):
nd = node.outputs[0].type.ndim nd = node.outputs[0].type.ndim
d = dict() d = dict()
#input_params and output_params go into the function declaration/definition #input_params and output_params go into the function declaration/definition
...@@ -328,6 +323,420 @@ class GpuElemwise(Op): ...@@ -328,6 +323,420 @@ class GpuElemwise(Op):
} }
""" %d """ %d
def recalgo_c_support_code_apply(self, node, nodename):
return self.recalgo_c_src_kernel(node, nodename) + self.recalgo_c_src_callkernel(node, nodename)
def naivealgo_c_src_kernel(self, node, nodename):
nd = node.outputs[0].type.ndim
sio = StringIO.StringIO()
#print 'C_SRC_KERNEL', sio.getvalue()
def _logical_scalar(x):
return all(x.type.broadcastable)
for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %(self.scalar_op.__class__.__name__,nodename, id(self))
if (nd):
print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
if (nd):
print >> sio, "\t,", ", ".join("const int log2_dim%i" % i for i in xrange(nd))
#declare inputs
for ipos, i in enumerate(node.inputs):
s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", s
#declare outputs
for ipos, i in enumerate(node.outputs):
s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", s
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos
print >> sio, "\t)\n{"
print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
print >> sio, " const int numThreads = blockDim.x * gridDim.x;"
# For each input that is a scalar which has been broadcasted to a tensor,
# load it into a local variable
for ipos, i in enumerate(node.inputs):
if _logical_scalar(i):
print >> sio, " const float ii_i%i_value = i%i_data[0];" % (ipos, ipos)
#TODO: insert code to check for strides of 1, and use a different loop
#loop over the elements to be treated by this kernel call
print >> sio, " for (int i = idx; i < numEls; i += numThreads) {"
# calculate the data pointers for all arguments
print >> sio, " int ii = i;"
for ipos, i in enumerate(node.inputs):
if not _logical_scalar(i):
print >> sio, " const float * ii_i%i_data = i%i_data;" % (ipos, ipos)
for ipos, i in enumerate(node.outputs):
print >> sio, " float * ii_o%i_data = o%i_data;" % (ipos, ipos)
for d in xrange(nd-1, -1, -1):
if d > 0:
#print >> sio, " int pos%i = INTMOD_POW2(ii, log2_dim%i);" %(d, d)
#print >> sio, " ii = INTDIV_POW2(ii, log2_dim%i);" %d
print >> sio, " int pos%i = ii %% dim%i;" %(d, d)
print >> sio, " ii = ii / dim%i;" %d
else:
print >> sio, " int pos%i = ii;" %d
for ipos, i in enumerate(node.inputs):
if not _logical_scalar(i):
print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)
for ipos, i in enumerate(node.outputs):
print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)
# perform the scalar operation on the input and output references
#TODO: What if the scalar_op needs support_code??
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[scalar.Scalar(dtype = input.type.dtype)() for input in node.inputs],
[scalar.Scalar(dtype = output.type.dtype)() for output in node.outputs])
, nodename + '_scalar_'
, [('ii_i%i_value' if _logical_scalar(i) else 'ii_i%i_data[0]')%ipos for ipos, i in enumerate(node.inputs)]
, ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)]
, sub=dict(fail='return;')) #TODO: set a failure code somehow!!!
print >> sio, " ", task_code
print >> sio, " }"
#TODO: insert runtime stride checks that select the best loop order either here, or in
# the host code that launched the kernel (host code probably better spot)
#indent = " "*(4*d+7)
#for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print >> sio, "}"
print sio.getvalue()
return sio.getvalue()
def naivealgo_c_src_kernel_tiling(self, node, nodename):
""" The kernel applies to problems with <= 5 dimensions """
#The kernel is intended to be structured roughly like this:
"""
static __global__ void kernel()
{
for (int v = blockIdx.y; v < dim0; v += gridDim.x)
{
for (int w = blockIdx.y; w < dim1; w += gridDim.y)
{
for (int x = threadIdx.x; x < dim2; x += blockDim.x)
{
for (int y = threadIdx.y; y < dim3; y += blockDim.y)
{
for (int z = threadIdx.z; z < dim4; z += blockDim.z)
{
out[v * out_stride[0] + ...] = f(in1[...], in2[...])
}
}
}
}
}
}
"""
nd = node.outputs[0].type.ndim
sio = StringIO.StringIO()
#print 'C_SRC_KERNEL', sio.getvalue()
def _logical_scalar(x):
return all(x.type.broadcastable)
if nd in (4,):
# print some leading comments to make the code easier to read
for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s_%s_%s_%s(unsigned int numEls" %(
self.scalar_op.__class__.__name__,
nodename,
id(self),
'tiling%i'%nd)
if (nd):
print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
if (nd):
print >> sio, "\t,", ", ".join("const int log2_dim%i" % i for i in xrange(nd))
#declare inputs
for ipos, i in enumerate(node.inputs):
s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", s
#declare outputs
for ipos, i in enumerate(node.outputs):
s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", s
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos
print >> sio, "\t)\n{"
# For each input that is a scalar which has been broadcasted to a tensor,
# load it into a local variable
print >> sio, " __shared__ float value0[%i];" % len(node.inputs)
print >> sio, " if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
for ipos, i in enumerate(node.inputs):
if _logical_scalar(i):
print >> sio, " value0[%i] = i%i_data[0];" % (ipos, ipos)
print >> sio, " }"
if (nd == 4):
print >> sio, """
for (int pos0 = blockIdx.x; pos0 < dim0; pos0 += gridDim.x)
{
for (int pos1 = blockIdx.y; pos1 < dim1; pos1 += gridDim.y)
{
//for (int pos2 = threadIdx.x; pos2 < dim2; pos2 += blockDim.x)
for (int pos2 = threadIdx.y; pos2 < dim2; pos2 += blockDim.y)
{
//for (int pos3 = threadIdx.y; pos3 < dim3; pos3 += blockDim.y)
for (int pos3 = threadIdx.x; pos3 < dim3; pos3 += blockDim.x)
{
"""
else:
raise NotImplementedError()
for ipos, i in enumerate(node.inputs):
if not _logical_scalar(i):
print >> sio, " const float * ii_i%i_data = i%i_data;" % (ipos, ipos)
for ipos, i in enumerate(node.outputs):
print >> sio, " float * ii_o%i_data = o%i_data;" % (ipos, ipos)
for d in xrange(nd):
for ipos, i in enumerate(node.inputs):
if not _logical_scalar(i):
print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)
for ipos, i in enumerate(node.outputs):
print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)
# perform the scalar operation on the input and output references
#TODO: What if the scalar_op needs support_code??
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[scalar.Scalar(dtype = input.type.dtype)() for input in node.inputs],
[scalar.Scalar(dtype = output.type.dtype)() for output in node.outputs])
, nodename + '_scalar_'
, [('value0[%i]' if _logical_scalar(i) else 'ii_i%i_data[0]')%ipos for ipos, i in enumerate(node.inputs)]
, ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)]
, sub=dict(fail='return;')) #TODO: set a failure code somehow!!!
print >> sio, " ", task_code
print >> sio, " }" * nd
#TODO: insert runtime stride checks that select the best loop order either here, or in
# the host code that launched the kernel (host code probably better spot)
#indent = " "*(4*d+7)
#for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print >> sio, "}"
print sio.getvalue()
return sio.getvalue()
def naivealgo_c_src_kernel_Ccontiguous(self, node, nodename):
nd = node.outputs[0].type.ndim
sio = StringIO.StringIO()
#print 'C_SRC_KERNEL', sio.getvalue()
def _logical_scalar(x):
return all(x.type.broadcastable)
for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s_%s_Ccontiguous (unsigned int numEls" %(self.scalar_op.__class__.__name__,nodename)
#declare inputs
for ipos, i in enumerate(node.inputs):
print >> sio, "\t,", "const float * i%i_data" % ipos
#declare outputs
for ipos, i in enumerate(node.outputs):
print >> sio, "\t,", "float * o%i_data" % ipos
print >> sio, "\t)\n{"
print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
print >> sio, " const int numThreads = blockDim.x * gridDim.x;"
#loop over the elements to be treated by this kernel call
print >> sio, " for (int i = idx; i < numEls; i += numThreads) {"
# perform the scalar operation on the input and output references
#TODO: What if the scalar_op needs support_code??
task_code = self.scalar_op.c_code(
Apply(self.scalar_op,
[scalar.Scalar(dtype = input.type.dtype)() for input in node.inputs],
[scalar.Scalar(dtype = output.type.dtype)() for output in node.outputs])
, nodename + '_scalar_'
, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)]
, ['o%i_data[i]'%ipos for ipos, i in enumerate(node.outputs)]
, sub=dict(fail='return;')) #TODO: set a failure code somehow!!!
print >> sio, " ", task_code
print >> sio, " }"
print >> sio, "}"
print sio.getvalue()
return sio.getvalue()
def naivealgo_c_src_callkernel(self, node, nodename):
nd = node.outputs[0].type.ndim
id_self = id(self)
d = dict()
#input_params and output_params go into the function declaration/definition
input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos)
for ipos in xrange(len(node.inputs)))
output_params = ", ".join("float * o%i_data, const int * o%i_str"%(ipos, ipos)
for ipos in xrange(len(node.outputs)))
#input_args and output_args go into the recursive call.
input_args = ", ".join("i%i_data, i%i_str"%(ipos, ipos)
for ipos in xrange(len(node.inputs)))
output_args = ", ".join("o%i_data, o%i_str"%(ipos, ipos)
for ipos in xrange(len(node.outputs)))
prod_dims = '*'.join("dims[%i]"%di for di in xrange(nd))
scalar_op=self.scalar_op.__class__.__name__
### NOTE WELL: log2_dims is not initialized on input to this function... it is meant as
### storage space where the log2_dims *could* be computed and stored.
sio = StringIO.StringIO()
print >> sio, """
static inline bool
_is_c_contiguous_%(nodename)s(const int nd, const int * dims, const int * strides)
{
bool c_contiguous = true;
int size = 1;
for (int i = nd-1; (i >= 0) and c_contiguous; --i)
{
if (dims[i] == 1)
continue;
if (strides[i] != size)
{
c_contiguous = false;
}
size = size * dims[i];
}
return c_contiguous;
}
static void callkernel_%(nodename)s(unsigned int numEls, const int d,
const int * dims, const int * log2_dims,
%(input_params)s,
%(output_params)s)
{
numEls = %(prod_dims)s;
std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s_%(id_self)s w numEls" << numEls << "\\n";
""" %locals()
# DEBUGPRINT
print >> sio, 'std::cerr << ' + " << ' ' << ".join(['" "']+list("dims[%i]"%di
for di in xrange(nd)) + ["'\\n';"])
# DEBUGPRINT
for ipos in xrange(len(node.inputs)):
print >> sio, """
std::cerr << " %(ipos)s " <<
""" %locals() + " << ' ' << ".join(["i%i_data"%ipos]
+ list("i%i_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; '''
# Try to launch the Ccontiguous version
kernel_call_args = ["numEls"]
for ipos in xrange(len(node.inputs)):
kernel_call_args.append("i%i_data"%ipos)
for ipos in xrange(len(node.outputs)):
kernel_call_args.append("o%i_data"%ipos)
kernel_call_args = ", ".join(kernel_call_args)
print >> sio, "if (" \
+ " && ".join(["_is_c_contiguous_%s(%i, dims, i%i_str)" % (nodename, nd, ipos) for ipos in xrange(len(node.inputs))]) \
+ ')'
print >> sio, """
{
std::cerr << " Running Ccontiguous version \\n";
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
//TODO: check error value. If success, return.
}
""" %locals()
#
# Try to launch a general version
#
# kernel_call_args are used to invoke the cuda kernel
kernel_call_args = ["numEls"]
kernel_call_args.extend("dims[%i]"%di for di in xrange(nd))
kernel_call_args.extend("log2_dims[%i]"%di for di in xrange(nd))
for ipos in xrange(len(node.inputs)):
kernel_call_args.append(
", ".join(["i%i_data"%ipos] + list("i%i_str[%i]"%(ipos, di) for di in xrange(nd)))
)
#strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(nd))
#kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
for ipos in xrange(len(node.outputs)):
kernel_call_args.append(
", ".join(["o%i_data"%ipos] + list("o%i_str[%i]"%(ipos, di) for di in xrange(nd)))
)
#strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(nd))
#kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
kernel_call_args = ", ".join(kernel_call_args)
if (nd == 4): # tiling kernel
print >> sio, """
else
{
std::cerr << " Running tiling 4D \\n";
dim3 gridDim(dims[0], dims[1]);
dim3 blockDim;
if (0) {
blockDim.y = std::min(dims[3], NUM_VECTOR_OP_THREADS_PER_BLOCK);
blockDim.x = std::min(dims[2], (int)(NUM_VECTOR_OP_THREADS_PER_BLOCK/ blockDim.y));
}
else {
blockDim.x = std::min(dims[3], NUM_VECTOR_OP_THREADS_PER_BLOCK);
blockDim.y = std::min(dims[2], (int)(NUM_VECTOR_OP_THREADS_PER_BLOCK/ blockDim.x));
}
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_tiling4<<<gridDim, blockDim>>>(%(kernel_call_args)s);
if( cudaSuccess != cudaGetLastError())
{
std::cerr << " DEBUG: tiling4 call failure... falling back to general version \\n";
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
}
}
}
""" %locals()
else:
print >> sio, """
else
{
std::cerr << " Running general version \\n";
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
}
}
""" %locals()
#N.B. cudaGetLastError is called by c_code
return sio.getvalue()
def naivealgo_c_support_code_apply(self, node, nodename):
return self.naivealgo_c_src_kernel(node, nodename) \
+ self.naivealgo_c_src_kernel_Ccontiguous(node, nodename) \
+ self.naivealgo_c_src_kernel_tiling(node, nodename) \
+ self.naivealgo_c_src_callkernel(node, nodename)
def c_support_code_apply(self, node, nodename):
rval = self.naivealgo_c_support_code_apply(node, nodename)
#rval = self.recalgo_c_support_code_apply(node, nodename)
#print rval
return rval
def c_code(self, node, nodename, inputs, outputs, sub): def c_code(self, node, nodename, inputs, outputs, sub):
d = dict(sub) d = dict(sub)
nd = node.outputs[0].type.ndim nd = node.outputs[0].type.ndim
...@@ -432,10 +841,12 @@ class GpuElemwise(Op): ...@@ -432,10 +841,12 @@ class GpuElemwise(Op):
} }
//std::cerr << "C_CODE %(opname)s END\\n"; //std::cerr << "C_CODE %(opname)s END\\n";
""" % locals() """ % locals()
#print sio.getvalue()
return sio.getvalue() return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,0) #return (1,0)
return ()
class GpuDimShuffle(Op): class GpuDimShuffle(Op):
def __init__(self, input_broadcastable, new_order): def __init__(self, input_broadcastable, new_order):
......
...@@ -20,7 +20,8 @@ def print_mode(mode): ...@@ -20,7 +20,8 @@ def print_mode(mode):
mode.print_summary() mode.print_summary()
def run_nnet(use_gpu): def run_nnet(use_gpu):
n_batch = 16 #n_batch = 16
n_batch = 60 #Fred recommends a nice big batch
n_in = 1024 n_in = 1024
n_hid = 2048 n_hid = 2048
n_out = 10 n_out = 10
...@@ -213,19 +214,20 @@ def test_conv_nnet2(): ...@@ -213,19 +214,20 @@ def test_conv_nnet2():
print rval_cpu[0], rval_gpu[0],rval_cpu[0]-rval_gpu[0] print rval_cpu[0], rval_gpu[0],rval_cpu[0]-rval_gpu[0]
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-4) assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-4)
def run_conv_nnet2_classif(shared_fn): # pretend we are training LeNet for MNIST def run_conv_nnet2_classif(shared_fn, isize, ksize):
n_batch = 60 n_batch = 60
shape_img = (n_batch, 1, 32, 32) shape_img = (n_batch, 1, isize, isize)
n_kern = 20 n_kern = 20 # 6 were used in LeNet5
shape_kern = (n_kern, 1, 5, 5) shape_kern = (n_kern, 1, ksize, ksize)
n_kern1 = 30 n_kern1 = 30 # 16 were used in LeNet5
shape_kern1 = (n_kern1, n_kern, 5, 5) shape_kern1 = (n_kern1, n_kern, ksize, ksize)
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((32, 32), (5, 5), 'valid') logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((isize, isize), (ksize, ksize), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), (5, 5), 'valid') logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2,
logical_hid_shape[1]/2), (ksize, ksize), 'valid')
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1] n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10 n_out = 10
...@@ -246,8 +248,8 @@ def run_conv_nnet2_classif(shared_fn): # pretend we are training LeNet for MNIST ...@@ -246,8 +248,8 @@ def run_conv_nnet2_classif(shared_fn): # pretend we are training LeNet for MNIST
hid = tensor.tanh(conv_op(x, w0)+b0) hid = tensor.tanh(conv_op(x, w0)+b0)
hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1) hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1)
hid_flat = hid1.reshape((n_batch, n_hid)) hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c) out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr) loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr)
print 'loss type', loss.type print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c] params = [w0, b0, w1, b1, v, c]
...@@ -270,10 +272,21 @@ def run_conv_nnet2_classif(shared_fn): # pretend we are training LeNet for MNIST ...@@ -270,10 +272,21 @@ def run_conv_nnet2_classif(shared_fn): # pretend we are training LeNet for MNIST
print_mode(mode) print_mode(mode)
return rval return rval
def test_conv_nnet2_classif(): def run_test_conv_nnet2_classif(seed, isize, ksize):
numpy.random.seed(23456) numpy.random.seed(seed)
rval_cpu = run_conv_nnet2(shared) rval_cpu = run_conv_nnet2_classif(shared, isize, ksize)
numpy.random.seed(23456) numpy.random.seed(seed)
rval_gpu = run_conv_nnet2(tcn.shared_constructor) rval_gpu = run_conv_nnet2_classif(tcn.shared_constructor, isize, ksize)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6) assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def test_lenet_28(): #MNIST
run_test_conv_nnet2_classif(23485, 28, 5)
def test_lenet_32(): #CIFAR10 / Shapeset
run_test_conv_nnet2_classif(23485, 32, 5)
def test_lenet_108(): # NORB
run_test_conv_nnet2_classif(23485, 108, 7)
def test_lenet_256(): # ImageNet
run_test_conv_nnet2_classif(23485, 256, 9)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论