提交 961c15e0 authored 作者: Frederic's avatar Frederic

remove code not needed anymore

上级 750d7815
......@@ -710,131 +710,3 @@ class GpuSoftmaxWithBias(GpuOp):
return ret1 + "\n" + ret2
gpu_softmax_with_bias = GpuSoftmaxWithBias()
class GpuSqrSumAx0(GpuOp):
    """
    Square every element of the input, then sum along axis 0.

    Works only with a 2d float32 CUDA matrix; the output is a 1d
    float32 vector of length ``x.shape[1]``.
    """
    def __eq__(self, other):
        # All instances are interchangeable: equality is by type only.
        return type(self) == type(other)

    def __hash__(self):
        # Consistent with __eq__: hash by type.
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x):
        x = as_cuda_ndarray_variable(x)
        assert x.ndim == 2
        # Summing over axis 0 leaves a 1d, non-broadcastable vector.
        out = x.type.__class__(dtype='float32', broadcastable=(False,))()
        return Apply(self, [x], [out])

    def c_code_cache_version(self):
        # (2,): the kernel now honours the strides passed to it and the
        # CUDA error message reports the correct kernel name.
        return (2,)

    def c_code(self, node, nodename, inp, out, sub):
        x, = inp
        z, = out
        fail = sub['fail']
        return """
        if (%(x)s->nd != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        // (Re)allocate the output when missing or of the wrong length:
        // a contiguous 1d vector of length x.shape[1].
        if ((NULL == %(z)s) ||
            (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
             CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(z)s)
                || CudaNdarray_alloc_contiguous(%(z)s, 1,
                       CudaNdarray_HOST_DIMS(%(x)s) + 1))
            {
                Py_XDECREF(%(z)s);
                %(z)s = NULL;
                %(fail)s;
            }
        }
        {
            // One block per output column (capped at 32k blocks); each
            // block reduces one column with one thread per row (capped
            // at 512) using shared memory.
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
                                    32 * 1024);
            //TODO: detect the maximum number of threads per block.
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], 512);
            int n_shared_bytes = n_threads * sizeof(float);
            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0 &&
                CudaNdarray_HOST_DIMS(%(x)s)[1] > 0)
            {
                KSqrSumAx0
                    <<<
                        n_blocks,
                        n_threads,
                        n_shared_bytes
                    >>>(
                        CudaNdarray_HOST_DIMS(%(x)s)[0],
                        CudaNdarray_HOST_DIMS(%(x)s)[1],
                        CudaNdarray_DEV_DATA(%(x)s),
                        CudaNdarray_HOST_STRIDES(%(x)s)[0],
                        CudaNdarray_HOST_STRIDES(%(x)s)[1],
                        CudaNdarray_DEV_DATA(%(z)s),
                        CudaNdarray_HOST_STRIDES(%(z)s)[0]
                    );
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if (cudaSuccess != err)
                {
                    // Report the actual kernel name (the old code
                    // reported "kSoftmax[_fixed_shared]" by mistake).
                    PyErr_Format(PyExc_RuntimeError,
                                 "Cuda error: %%s: %%s.\\n Used %%d blocks,"
                                 " %%d threads %%d bytes of shared memory",
                                 "KSqrSumAx0%(nodename)s",
                                 cudaGetErrorString(err),
                                 n_blocks, n_threads, n_shared_bytes);
                    %(fail)s;
                }
            }
            else if (CudaNdarray_HOST_DIMS(%(z)s)[0] > 0){
                // Empty input (0 rows): the sum of squares is all zeros.
                cudaMemset(%(z)s->devdata, 0,
                           CudaNdarray_SIZE(%(z)s) * sizeof(float));
            }
        }
        assert(%(z)s);
        """ % locals()

    def c_support_code(self):
        return """
        //Not well optimized: reads are not coalesced and the final
        //reduction is serial in thread 0.
        __global__ void KSqrSumAx0(int nb_row, int nb_col,
                const float* x, int x_str0, int x_str1,
                float* z, int z_str0) {
            extern __shared__ float buf[];
            // Each block handles columns i, i + gridDim.x, ...
            for (int i = blockIdx.x; i < nb_col; i += gridDim.x) {
                float myresult = 0.0f;
                // Each thread accumulates a strided subset of the rows.
                for (int j = threadIdx.x; j < nb_row; j += blockDim.x) {
                    // Honour the element strides instead of assuming a
                    // C-contiguous layout (the old code indexed
                    // x[i + j*nb_col] and ignored the strides, giving
                    // wrong results on non-contiguous input).
                    float val = x[j * x_str0 + i * x_str1];
                    myresult += val * val;
                }
                // Ensure thread 0 finished reading buf for the previous
                // column before it is overwritten.
                __syncthreads();
                buf[threadIdx.x] = myresult;
                __syncthreads();
                if (threadIdx.x == 0) {
                    // Serial reduction of the per-thread partial sums.
                    for (int j = 1; j < blockDim.x; j++)
                        myresult += buf[j];
                    z[i * z_str0] = myresult;
                }
                __syncthreads();
            }
        }"""
gpu_sqr_sum_ax0 = GpuSqrSumAx0()
......@@ -247,41 +247,3 @@ def test_softmax():
cmp(2, 10000)
cmp(128, 16 * 1024)
cmp(128, 64 * 1024)
def test_sqr_sum_ax0():
x = T.fmatrix('x')
z = (x**2).sum(axis=0)
f = theano.function([x], z, mode=mode_without_gpu)
f_gpu = theano.function([x], z, mode=mode_with_gpu)
assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
cuda.nnet.GpuSqrSumAx0)
def cmp(n, m):
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
#cmp(10, 15)
#cmp(120000, 15)
#cmp(15, 120000)
#cmp(4000, 4000)
#cmp(0, 15)
#cmp(10, 0)
#cmp(0, 0)
m = mode_with_gpu.excluding("local_gpu_sqr_sum_ax0")
f_gpu2 = theano.function([x], z, mode=m)
n, m = 4000, 4000
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
import time
t0 = time.time()
for i in range(1000):
f_gpu(data)
t1 = time.time()
for i in range(1000):
f_gpu2(data)
t2 = time.time()
print t1 - t0, t2 - t1
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论