提交 24858525 authored 作者: James Bergstra's avatar James Bergstra

adding cross-entropy but not done yet

上级 a317f101
...@@ -138,16 +138,24 @@ class GpuElemwise(Op): ...@@ -138,16 +138,24 @@ class GpuElemwise(Op):
def _logical_scalar(x): def _logical_scalar(x):
return all(x.type.broadcastable) return all(x.type.broadcastable)
print >> sio, "static __global__ void kernel_%s(unsigned int numEls," %nodename print >> sio, "// Elemwise kernel for ", str(self.scalar_op)
print >> sio, "\t ", ", ".join("unsigned int log2_dim%i" % i for i in xrange(nd)) for ipos, i in enumerate(node.inputs):
print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s(unsigned int numEls" %nodename
if (nd):
print >> sio, "\t,", ", ".join("unsigned int log2_dim%i" % i for i in xrange(nd))
#declare inputs #declare inputs
for ipos, i in enumerate(node.inputs): for ipos, i in enumerate(node.inputs):
print >> sio, "\t,", ", ".join("int i%i_str_%i" % (ipos, d) for d in xrange(nd)) s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", "const float * i%i_data" % ipos print >> sio, "\t,", s
#declare outputs #declare outputs
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t,", s
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos
print >> sio, "\t)\n{" print >> sio, "\t)\n{"
print >> sio, " const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;" print >> sio, " const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;"
print >> sio, " const unsigned int numThreads = blockDim.x * gridDim.x;" print >> sio, " const unsigned int numThreads = blockDim.x * gridDim.x;"
...@@ -183,17 +191,16 @@ class GpuElemwise(Op): ...@@ -183,17 +191,16 @@ class GpuElemwise(Op):
print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d) print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)
# perform the scalar operation on the input and output references # perform the scalar operation on the input and output references
if d == 0: #TODO: What if the scalar_op needs support_code??
#TODO: What if the scalar_op needs support_code?? task_code = self.scalar_op.c_code(
task_code = self.scalar_op.c_code( Apply(self.scalar_op,
Apply(self.scalar_op, [scalar.Scalar(dtype = input.type.dtype)() for input in node.inputs],
[scalar.Scalar(dtype = input.type.dtype)() for input in node.inputs], [scalar.Scalar(dtype = output.type.dtype)() for output in node.outputs])
[scalar.Scalar(dtype = output.type.dtype)() for output in node.outputs]) , nodename + '_scalar_'
, nodename + '_scalar_' , [('ii_i%i_value' if _logical_scalar(i) else 'ii_i%i_data[0]')%ipos for ipos, i in enumerate(node.inputs)]
, [('ii_i%i_value' if _logical_scalar(i) else 'ii_i%i_data[0]')%ipos for ipos, i in enumerate(node.inputs)] , ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)]
, ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)] , sub=dict(fail='return;')) #TODO: set a failure code somehow!!!
, sub=dict(fail='return;')) #TODO: set a failure code somehow!!! print >> sio, " ", task_code
print >> sio, " ", task_code
print >> sio, " }" print >> sio, " }"
#TODO: insert runtime stride checks that select the best loop order either here, or in #TODO: insert runtime stride checks that select the best loop order either here, or in
...@@ -230,11 +237,17 @@ class GpuElemwise(Op): ...@@ -230,11 +237,17 @@ class GpuElemwise(Op):
kernel_call_args = ["numEls"] kernel_call_args = ["numEls"]
kernel_call_args.extend("log2_dims[%i]"%di for di in xrange(nd)) kernel_call_args.extend("log2_dims[%i]"%di for di in xrange(nd))
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(nd)) kernel_call_args.append(
kernel_call_args.append( "%s, i%i_data" % (strides, ipos)) ", ".join(["i%i_data"%ipos] + list("i%i_str[%i]"%(ipos, di) for di in xrange(nd)))
)
#strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(nd))
#kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
for ipos in xrange(len(node.outputs)): for ipos in xrange(len(node.outputs)):
strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(nd)) kernel_call_args.append(
kernel_call_args.append( "%s, o%i_data" % (strides, ipos)) ", ".join(["o%i_data"%ipos] + list("o%i_str[%i]"%(ipos, di) for di in xrange(nd)))
)
#strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(nd))
#kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
kernel_call_args = ", ".join(kernel_call_args) kernel_call_args = ", ".join(kernel_call_args)
# the data_pointer_increments are inserted after each recursive call # the data_pointer_increments are inserted after each recursive call
...@@ -388,7 +401,7 @@ class GpuElemwise(Op): ...@@ -388,7 +401,7 @@ class GpuElemwise(Op):
); );
//std::cerr << "calling callkernel returned\\n"; //std::cerr << "calling callkernel returned\\n";
cudaThreadSynchronize(); CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
...@@ -568,7 +581,7 @@ class GpuDimShuffle(Op): ...@@ -568,7 +581,7 @@ class GpuDimShuffle(Op):
} }
""" %locals() """ %locals()
if 1: if 0: # print full code to stdout
print '--------------------------------------' print '--------------------------------------'
print 'C_CODE' print 'C_CODE'
print '' print ''
...@@ -724,7 +737,6 @@ class GpuSubtensor(tensor.Subtensor): ...@@ -724,7 +737,6 @@ class GpuSubtensor(tensor.Subtensor):
#sys.stdout.flush() #sys.stdout.flush()
#sys.exit() #sys.exit()
class GpuShape(tensor.Shape): class GpuShape(tensor.Shape):
def make_node(self, x): def make_node(self, x):
return Apply(self, [x], [tensor.lvector()]) return Apply(self, [x], [tensor.lvector()])
......
...@@ -3,6 +3,7 @@ from theano import tensor, scalar ...@@ -3,6 +3,7 @@ from theano import tensor, scalar
import StringIO import StringIO
import cuda_ndarray import cuda_ndarray
from .type import CudaNdarrayType
class GpuDot22(Op): class GpuDot22(Op):
def __str__(self): def __str__(self):
...@@ -184,3 +185,164 @@ class GpuConv(Op): ...@@ -184,3 +185,164 @@ class GpuConv(Op):
logical_kern_shape=self.logical_kern_hw, logical_kern_shape=self.logical_kern_hw,
kern_align=self.logical_kern_align_top) kern_align=self.logical_kern_align_top)
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
nin=3
nout=3
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, x, b, y_idx):
nll = y_idx.type() #N.B. won't work when we don't cast y_idx to float anymore
sm = x.type()
am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am])
def c_support_code(self):
return """
__global__ void k_xent_sm_1hot_bias(int M, int N,
const float * x_data, int xs0, int xs1,
const float * b, int bs0,
const float * y_idx_data, int y_idxs0,
float * nll_data, int nlls0,
float * sm_data, int sms0, int sms1,
float * am_data, int ams0)
{
const int row = blockIdx.x;
const float * x = x_data + xs0 * row;
const int y_idx = (int)y_idx_data[row * y_idxs0];
float * sm = sm_data + sms0 * row;
float sum = 0.0;
int row_max_j = 0;
float row_max = x[0] + b[0];
for (int j = 1; j < N; ++j)
{
float row_ij = x[j*xs1] + b[j*bs0];
//todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
}
//compute the exp
for (int j = 0; j < N; ++j)
{
float row_ij = x[j*xs1] + b[j*bs0];
float sm_ij = exp(row_ij - row_max);
sum += sm_ij;
sm[j * sms1] = sm_ij;
}
float sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j)
{
sm[j * sms1] *= sum_inv;
}
if ((y_idx >= N) || (y_idx < 0))
{
//TODO: set raise an error bit in a global var?
nll_data[row*nlls0] = 0.0; // raise some suspicion at least...
}
else
{
nll_data[row*nlls0] = - x[y_idx*xs1]
- b[y_idx*bs0]
+ row_max
+ log(sum);
}
am_data[row*ams0] = row_max_j;
}
"""
def c_code(self, node, nodename, (x, b, y_idx), (nll, sm, am), sub):
classname=self.__class__.__name__
fail = sub['fail']
sio = StringIO.StringIO()
print >> sio, """
if (cnda_%(y_idx)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
%(fail)s;
}
if (cnda_%(x)s->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
%(fail)s;
}
if (cnda_%(b)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(cnda_%(x)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,y_idx arguments");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(cnda_%(x)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(b)s)[0])
{
PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,b arguments");
%(fail)s;
}
if ((NULL == cnda_%(nll)s) //initial condition
|| (CudaNdarray_HOST_DIMS(cnda_%(nll)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0]))
{
Py_XDECREF(cnda_%(nll)s);
cnda_%(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(cnda_%(y_idx)s));
if(!cnda_%(nll)s)
{
%(fail)s;
}
}
if ((NULL == cnda_%(sm)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(sm)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]))
{
Py_XDECREF(cnda_%(sm)s);
cnda_%(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(cnda_%(x)s));
if(!cnda_%(sm)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output");
// no need to decref cnda_nll, the cleanup code should pick it up.
%(fail)s;
}
}
if ((NULL == cnda_%(am)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(am)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0]))
{
Py_XDECREF(cnda_%(am)s);
cnda_%(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(cnda_%(y_idx)s));
if(!cnda_%(am)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc am output");
// no need to decref nll amd sm, the cleanup code should pick it up.
%(fail)s;
}
}
{
int n_blocks = CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0];
int n_threads = 1; //TODO: launch more threads per row and do parallel sum and max reductions.
int n_shared_bytes = 0; //n_threads * sizeof(float);
k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(b)s), CudaNdarray_HOST_STRIDES(cnda_%(b)s)[0],
CudaNdarray_DEV_DATA(cnda_%(y_idx)s), CudaNdarray_HOST_STRIDES(cnda_%(y_idx)s)[0],
CudaNdarray_DEV_DATA(cnda_%(nll)s), CudaNdarray_HOST_STRIDES(cnda_%(nll)s)[0],
CudaNdarray_DEV_DATA(cnda_%(sm)s), CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[0], CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[1],
CudaNdarray_DEV_DATA(cnda_%(am)s), CudaNdarray_HOST_STRIDES(cnda_%(am)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %(classname)s %(nodename)s: %%s.\\n", cudaGetErrorString(err));
// no need to decref output vars the cleanup code should pick them up.
%(fail)s;
}
}
""" % locals()
return sio.getvalue()
...@@ -3,7 +3,7 @@ from theano import tensor, scalar, compile ...@@ -3,7 +3,7 @@ from theano import tensor, scalar, compile
from theano.gof import local_optimizer, EquilibriumDB, SequenceDB from theano.gof import local_optimizer, EquilibriumDB, SequenceDB
from .basic_ops import * from .basic_ops import *
from .blas import gpu_dot22, gpu_gemm, GpuConv from .blas import gpu_dot22, gpu_gemm, GpuConv, GpuCrossentropySoftmaxArgmax1HotWithBias
from theano.compile import optdb from theano.compile import optdb
#optdb.print_summary() # this shows what is currently registered (in a so-far crude way...) #optdb.print_summary() # this shows what is currently registered (in a so-far crude way...)
...@@ -229,3 +229,26 @@ def local_gpu_shape(node): ...@@ -229,3 +229,26 @@ def local_gpu_shape(node):
return [gpu_shape(gpu_x)] return [gpu_shape(gpu_x)]
return False return False
def cast(x, dtype):
stype = theano.scalar.Scalar(dtype)
cast_op = theano.tensor.Elemwise(scalar.Identity(scalar.specific_out(stype)))
return cast_op(x)
import theano.tensor.nnet
@register_opt()
@local_optimizer([])
def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias):
x,b,y = node.inputs
if x.owner and x.owner.op == host_from_gpu:
gpu_x, = x.owner.inputs
gpu_nll, gpu_sm, gpu_am = GpuCrossentropySoftmaxArgmax1HotWithBias()(
gpu_x,
gpu_from_host(b),
gpu_from_host(cast(y, 'float32')))
am_dtype = node.outputs[2].type.dtype
return [host_from_gpu(gpu_nll),
host_from_gpu(gpu_sm),
cast(host_from_gpu(gpu_am), am_dtype)]
return False
...@@ -3,6 +3,7 @@ import theano, theano.sandbox.conv ...@@ -3,6 +3,7 @@ import theano, theano.sandbox.conv
from theano.compile.sandbox.sharedvalue import shared from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc from theano.compile.sandbox.pfunc import pfunc
from theano import tensor from theano import tensor
import theano.tensor.nnet
import numpy import numpy
...@@ -120,8 +121,9 @@ def test_conv_nnet1(): ...@@ -120,8 +121,9 @@ def test_conv_nnet1():
rval_gpu = run_conv_nnet1(tcn.shared_constructor) rval_gpu = run_conv_nnet1(tcn.shared_constructor)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6) assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def run_conv_nnet2(shared_fn): def run_conv_nnet2(shared_fn): # pretend we are training LeNet for MNIST
n_batch = 16
n_batch = 60
shape_img = (n_batch, 1, 32, 32) shape_img = (n_batch, 1, 32, 32)
n_kern = 20 n_kern = 20
...@@ -168,12 +170,15 @@ def run_conv_nnet2(shared_fn): ...@@ -168,12 +170,15 @@ def run_conv_nnet2(shared_fn):
print i, n print i, n
xval = numpy.asarray(numpy.random.rand(*shape_img), dtype='float32') xval = numpy.asarray(numpy.random.rand(*shape_img), dtype='float32')
yval = numpy.asarray(numpy.random.rand(n_batch, n_out), dtype='float32') yval = numpy.asarray(numpy.random.rand(n_batch,n_out), dtype='int32')
lr = numpy.asarray(0.01, dtype='float32') lr = numpy.asarray(0.01, dtype='float32')
for i in xrange(10): for i in xrange(10):
rval = train(xval, yval, lr) rval = train(xval, yval, lr)
mode.print_summary() try:
mode.print_summary()
except:
pass
return rval return rval
def test_conv_nnet2(): def test_conv_nnet2():
...@@ -183,3 +188,70 @@ def test_conv_nnet2(): ...@@ -183,3 +188,70 @@ def test_conv_nnet2():
rval_gpu = run_conv_nnet2(tcn.shared_constructor) rval_gpu = run_conv_nnet2(tcn.shared_constructor)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6) assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def run_conv_nnet2_classif(shared_fn): # pretend we are training LeNet for MNIST
n_batch = 60
shape_img = (n_batch, 1, 32, 32)
n_kern = 20
shape_kern = (n_kern, 1, 5, 5)
n_kern1 = 30
shape_kern1 = (n_kern1, n_kern, 5, 5)
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((32, 32), (5, 5), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), (5, 5), 'valid')
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
w0 = shared_fn(numpy.asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w0')
b0 = shared_fn(numpy.asarray(numpy.zeros((n_kern,1,1)), dtype='float32'), 'b0')
w1 = shared_fn(numpy.asarray(0.01*(numpy.random.rand(*shape_kern1)-0.5), dtype='float32'), 'w1')
b1 = shared_fn(numpy.asarray(numpy.zeros((n_kern1,1,1)), dtype='float32'), 'b1')
v = shared_fn(numpy.asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
c = shared_fn(numpy.asarray(numpy.zeros(n_out), dtype='float32'), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,0,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = theano.sandbox.conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = theano.sandbox.conv.ConvOp((n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
hid = tensor.tanh(conv_op(x, w0)+b0)
hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1)
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
gparams = tensor.grad(loss, params)
mode = theano.compile.ProfileMode()
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = numpy.asarray(numpy.random.rand(*shape_img), dtype='float32')
yval = numpy.asarray(numpy.random.rand(n_batch,n_out), dtype='int32')
lr = numpy.asarray(0.01, dtype='float32')
for i in xrange(10):
rval = train(xval, yval, lr)
try:
mode.print_summary()
except:
pass
return rval
def test_conv_nnet2_classif():
numpy.random.seed(23456)
rval_cpu = run_conv_nnet2(shared)
numpy.random.seed(23456)
rval_gpu = run_conv_nnet2(tcn.shared_constructor)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论