提交 eb84fcb3 authored 作者: James Bergstra's avatar James Bergstra

added gpu implementation of Crossentropy gradient op

上级 85908603
...@@ -43,7 +43,8 @@ class GpuDot22(Op): ...@@ -43,7 +43,8 @@ class GpuDot22(Op):
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]) || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(y)s)[1])) || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(y)s)[1]))
{ {
if (cnda_%(z)s) Py_DECREF(cnda_%(z)s); //if (cnda_%(z)s) Py_DECREF(cnda_%(z)s);
Py_XDECREF(cnda_%(z)s);
npy_intp dims[2]; npy_intp dims[2];
dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]; dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
dims[1] = CudaNdarray_HOST_DIMS(cnda_%(y)s)[1]; dims[1] = CudaNdarray_HOST_DIMS(cnda_%(y)s)[1];
...@@ -183,168 +184,6 @@ class GpuConv(Op): ...@@ -183,168 +184,6 @@ class GpuConv(Op):
subsample=self.subsample, subsample=self.subsample,
logical_img_shape=self.logical_img_hw, logical_img_shape=self.logical_img_hw,
logical_kern_shape=self.logical_kern_hw, logical_kern_shape=self.logical_kern_hw,
kern_align=self.logical_kern_align_top) kern_align=self.logical_kern_align_top,
verbose=1)
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
nin=3
nout=3
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, x, b, y_idx):
nll = y_idx.type() #N.B. won't work when we don't cast y_idx to float anymore
sm = x.type()
am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am])
def c_support_code(self):
return """
__global__ void k_xent_sm_1hot_bias(int M, int N,
const float * x_data, int xs0, int xs1,
const float * b, int bs0,
const float * y_idx_data, int y_idxs0,
float * nll_data, int nlls0,
float * sm_data, int sms0, int sms1,
float * am_data, int ams0)
{
const int row = blockIdx.x;
const float * x = x_data + xs0 * row;
const int y_idx = (int)y_idx_data[row * y_idxs0];
float * sm = sm_data + sms0 * row;
float sum = 0.0;
int row_max_j = 0;
float row_max = x[0] + b[0];
for (int j = 1; j < N; ++j)
{
float row_ij = x[j*xs1] + b[j*bs0];
//todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
}
//compute the exp
for (int j = 0; j < N; ++j)
{
float row_ij = x[j*xs1] + b[j*bs0];
float sm_ij = exp(row_ij - row_max);
sum += sm_ij;
sm[j * sms1] = sm_ij;
}
float sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j)
{
sm[j * sms1] *= sum_inv;
}
if ((y_idx >= N) || (y_idx < 0))
{
//TODO: set raise an error bit in a global var?
nll_data[row*nlls0] = 0.0; // raise some suspicion at least...
}
else
{
nll_data[row*nlls0] = - x[y_idx*xs1]
- b[y_idx*bs0]
+ row_max
+ log(sum);
}
am_data[row*ams0] = row_max_j;
}
"""
def c_code(self, node, nodename, (x, b, y_idx), (nll, sm, am), sub):
classname=self.__class__.__name__
fail = sub['fail']
sio = StringIO.StringIO()
print >> sio, """
if (cnda_%(y_idx)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
%(fail)s;
}
if (cnda_%(x)s->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
%(fail)s;
}
if (cnda_%(b)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(cnda_%(x)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,y_idx arguments");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(cnda_%(x)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(b)s)[0])
{
PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,b arguments");
%(fail)s;
}
if ((NULL == cnda_%(nll)s) //initial condition
|| (CudaNdarray_HOST_DIMS(cnda_%(nll)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0]))
{
Py_XDECREF(cnda_%(nll)s);
cnda_%(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(cnda_%(y_idx)s));
if(!cnda_%(nll)s)
{
%(fail)s;
}
}
if ((NULL == cnda_%(sm)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(sm)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]))
{
Py_XDECREF(cnda_%(sm)s);
cnda_%(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(cnda_%(x)s));
if(!cnda_%(sm)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output");
// no need to decref cnda_nll, the cleanup code should pick it up.
%(fail)s;
}
}
if ((NULL == cnda_%(am)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(am)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0]))
{
Py_XDECREF(cnda_%(am)s);
cnda_%(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(cnda_%(y_idx)s));
if(!cnda_%(am)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc am output");
// no need to decref nll amd sm, the cleanup code should pick it up.
%(fail)s;
}
}
{
int n_blocks = CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0];
int n_threads = 1; //TODO: launch more threads per row and do parallel sum and max reductions.
int n_shared_bytes = 0; //n_threads * sizeof(float);
k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(b)s), CudaNdarray_HOST_STRIDES(cnda_%(b)s)[0],
CudaNdarray_DEV_DATA(cnda_%(y_idx)s), CudaNdarray_HOST_STRIDES(cnda_%(y_idx)s)[0],
CudaNdarray_DEV_DATA(cnda_%(nll)s), CudaNdarray_HOST_STRIDES(cnda_%(nll)s)[0],
CudaNdarray_DEV_DATA(cnda_%(sm)s), CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[0], CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[1],
CudaNdarray_DEV_DATA(cnda_%(am)s), CudaNdarray_HOST_STRIDES(cnda_%(am)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %(classname)s %(nodename)s: %%s.\\n", cudaGetErrorString(err));
// no need to decref output vars the cleanup code should pick them up.
%(fail)s;
}
}
""" % locals()
return sio.getvalue()
def c_code_cache_version(self):
return (1,0)
差异被折叠。
...@@ -3,8 +3,10 @@ from theano import tensor, scalar, compile ...@@ -3,8 +3,10 @@ from theano import tensor, scalar, compile
from theano.gof import local_optimizer, EquilibriumDB, SequenceDB from theano.gof import local_optimizer, EquilibriumDB, SequenceDB
from theano_cuda_ndarray.basic_ops import * from theano_cuda_ndarray.basic_ops import *
from theano_cuda_ndarray.blas import gpu_dot22, gpu_gemm, GpuConv, GpuCrossentropySoftmaxArgmax1HotWithBias from theano_cuda_ndarray.blas import gpu_dot22, gpu_gemm, GpuConv
from theano_cuda_ndarray.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx)
from theano.compile import optdb from theano.compile import optdb
#optdb.print_summary() # this shows what is currently registered (in a so-far crude way...) #optdb.print_summary() # this shows what is currently registered (in a so-far crude way...)
...@@ -249,3 +251,19 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node): ...@@ -249,3 +251,19 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
host_from_gpu(gpu_sm), host_from_gpu(gpu_sm),
cast(host_from_gpu(gpu_am), am_dtype)] cast(host_from_gpu(gpu_am), am_dtype)]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
print 'REPLACING ', node, '??'
if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx):
dnll,sm,yidx = node.inputs
if sm.owner and sm.owner.op == host_from_gpu:
gpu_sm, = sm.owner.inputs
gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()(
gpu_from_host(dnll),
gpu_sm,
gpu_from_host(cast(yidx, 'float32')))
print 'YEP ', node
return [host_from_gpu(gpu_dx)]
return False
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论