Commit 43d45788 authored by Frederic

Add GpuSqrSumAx0 to lower the memory usage on the GPU.

Parent 73d8d175
from theano import Op, Apply
from theano.compat.six import StringIO
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda import GpuOp, as_cuda_ndarray_variable
from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
inline_softmax,
......@@ -709,3 +709,130 @@ class GpuSoftmaxWithBias (GpuOp):
return ret1 + "\n" + ret2
gpu_softmax_with_bias = GpuSoftmaxWithBias()
class GpuSqrSumAx0(GpuOp):
    """Square every element of a 2d input, then sum over axis 0.

    Fused GPU equivalent of ``(x ** 2).sum(axis=0)`` for matrices: the
    squared matrix is never materialized, which lowers memory usage on
    the GPU.  Input must be a 2d float32 CudaNdarray; output is a 1d
    float32 CudaNdarray with one entry per input column.
    """
    def __eq__(self, other):
        # The op carries no parameters, so all instances are equivalent.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x):
        """Return an Apply mapping a 2d CudaNdarray to a 1d CudaNdarray."""
        x = as_cuda_ndarray_variable(x)
        assert x.ndim == 2
        out = x.type.__class__(dtype='float32', broadcastable=(False,))()
        return Apply(self, [x], [out])

    def c_code_cache_version(self):
        # (2,): the kernel now honors the stride arguments it receives,
        # and the CUDA error message names the right kernel.
        return (2,)

    def c_code(self, node, nodename, inp, out, sub):
        x, = inp
        z, = out
        fail = sub['fail']
        return """
        if (%(x)s->nd != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        // (Re)allocate the output when it is missing or has the wrong
        // length (one entry per column of the input).
        if ((NULL == %(z)s) ||
            (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
             CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(z)s)
                || CudaNdarray_alloc_contiguous(%(z)s, 1,
                       CudaNdarray_HOST_DIMS(%(x)s) + 1))
            {
                Py_XDECREF(%(z)s);
                %(z)s = NULL;
                %(fail)s;
            }
        }
        {
            // One block per column (capped), one thread per row (capped).
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
                                    32 * 1024);
            //TODO, detect the maximum number of thread per block.
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], 512);
            int n_shared_bytes = n_threads * sizeof(float);
            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0 &&
                CudaNdarray_HOST_DIMS(%(x)s)[1] > 0)
            {
                KSqrSumAx0
                    <<<
                        n_blocks,
                        n_threads,
                        n_shared_bytes
                    >>>(
                        CudaNdarray_HOST_DIMS(%(x)s)[0],
                        CudaNdarray_HOST_DIMS(%(x)s)[1],
                        CudaNdarray_DEV_DATA(%(x)s),
                        CudaNdarray_HOST_STRIDES(%(x)s)[0],
                        CudaNdarray_HOST_STRIDES(%(x)s)[1],
                        CudaNdarray_DEV_DATA(%(z)s),
                        CudaNdarray_HOST_STRIDES(%(z)s)[0]
                    );
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError,
                                 "Cuda error: %%s: %%s.\\n Used %%d blocks,"
                                 " %%d threads %%d bytes of shared memory",
                                 "KSqrSumAx0%(nodename)s",
                                 cudaGetErrorString(err),
                                 n_blocks, n_threads, n_shared_bytes);
                    %(fail)s;
                }
            }
            else if (CudaNdarray_HOST_DIMS(%(z)s)[0] > 0){
                // Zero rows: the sum of an empty column is 0.
                cudaMemset(%(z)s->devdata, 0,
                           CudaNdarray_SIZE(%(z)s) * sizeof(float));
            }
        }
        assert(%(z)s);
        """ % locals()

    def c_support_code(self):
        return """
        //Not well optimized: threads of a block do not read contiguous
        //global memory (each strides down a column).
        __global__ void KSqrSumAx0(int nb_row, int nb_col,
                const float* x, int x_str0, int x_str1,
                float* z, int z_str0) {
            extern __shared__ float buf[];
            // Each block handles columns i, i+gridDim.x, ...; inside a
            // block, threads split the rows of one column.
            for (int i = blockIdx.x; i < nb_col; i += gridDim.x) {
                float myresult = 0.0f;
                for (int j = threadIdx.x; j < nb_row; j += blockDim.x) {
                    // Index through the element strides so non-contiguous
                    // inputs are read correctly (the old code assumed a
                    // c-contiguous layout and ignored x_str0/x_str1).
                    const float val = x[j * x_str0 + i * x_str1];
                    myresult += val * val;
                }
                // Make sure thread 0 is done reading buf for the previous
                // column before we overwrite it.
                __syncthreads();
                buf[threadIdx.x] = myresult;
                __syncthreads();
                if (threadIdx.x == 0) {
                    // Serial reduction of the per-thread partial sums.
                    for (int t = 1; t < blockDim.x; t++)
                        myresult += buf[t];
                    z[i * z_str0] = myresult;
                }
                __syncthreads();
            }
        }"""
# Module-level singleton, mirroring gpu_softmax_with_bias above.
gpu_sqr_sum_ax0 = GpuSqrSumAx0()
......@@ -35,7 +35,7 @@ from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias)
GpuSoftmax, GpuSoftmaxWithBias, GpuSqrSumAx0)
from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.scalar.basic_scipy import Erfinv
from theano.sandbox.cuda.elemwise import erfinv_gpu
......@@ -685,6 +685,19 @@ def local_gpu_careduce(node):
return False
@register_opt()
@local_optimizer([GpuCAReduce])
def local_gpu_sqr_sum_ax0(node):
    """Fuse GpuCAReduce{add, mask=(1, 0)}(GpuElemwise{sqr}(x)) into
    GpuSqrSumAx0(x).

    The fused op never materializes the squared matrix, lowering GPU
    memory usage.  Returns the replacement list on a match, False
    otherwise (consistent with the other local optimizers in this file).
    """
    # The outer node must be a sum over axis 0 of a 2d input.
    if not (isinstance(node.op, GpuCAReduce) and
            isinstance(node.op.scalar_op, theano.scalar.basic.Add) and
            node.op.reduce_mask == (1, 0)):
        return False
    # Its input must itself be an elementwise square on the GPU.
    sqr_node = node.inputs[0].owner
    if (sqr_node and
            isinstance(sqr_node.op, GpuElemwise) and
            isinstance(sqr_node.op.scalar_op, theano.scalar.basic.Sqr)):
        return [GpuSqrSumAx0()(sqr_node.inputs[0])]
    return False
@register_opt()
@local_optimizer([gpu_from_host, tensor.Reshape])
def local_gpu_reshape(node):
......
......@@ -247,3 +247,31 @@ def test_softmax():
cmp(2, 10000)
cmp(128, 16 * 1024)
cmp(128, 64 * 1024)
def test_sqr_sum_ax0():
    """Check GpuSqrSumAx0 against the CPU graph for (x ** 2).sum(axis=0)."""
    x = T.fmatrix('x')
    z = (x ** 2).sum(axis=0)
    f = theano.function([x], z, mode=mode_without_gpu)
    f_gpu = theano.function([x], z, mode=mode_with_gpu)

    # The optimizer must have fused sqr + sum into GpuSqrSumAx0; the
    # last node of the GPU graph is the transfer back to the host, so
    # the fused op is second to last.
    # (The old code also debugprinted an undefined `f_gpu2`, which
    # raised a NameError before any assertion ran.)
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
                      cuda.nnet.GpuSqrSumAx0)

    def cmp(n, m):
        # Compare CPU and GPU results on an n x m input.
        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        assert numpy.allclose(out, gout), numpy.absolute(out - gout)

    cmp(10, 15)
    # Shapes that exceed the kernel's thread/block caps.
    cmp(120000, 15)
    cmp(15, 120000)
    cmp(4000, 4000)
    # Degenerate shapes: empty reduction axis and/or empty output.
    cmp(0, 15)
    cmp(10, 0)
    cmp(0, 0)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论