提交 f39777c1 作者: Arnaud Bergeron

Fix errors in SparseBlockOuterSS c_code.

上级 3f1364db
...@@ -395,7 +395,7 @@ const npy_intp *yIdx, int yI_str_0 ...@@ -395,7 +395,7 @@ const npy_intp *yIdx, int yI_str_0
if (i >= maxi || j >= maxj) return; if (i >= maxi || j >= maxj) return;
int p = i + j * maxi + b * maxi * maxj; int p = i + j * maxi + b * maxi * maxj;
x_list[p] = &x[b * x_str_0 + i * x_str_1]; x_list[p] = &x[b * x_str_0 + i * x_str_1];
y_list[p] = &y[b * x_str_0 + j * y_str_1]; y_list[p] = &y[b * y_str_0 + j * y_str_1];
out_list[p] = &out[xIdx[b * xI_str_0 + i] * o_str_0 + out_list[p] = &out[xIdx[b * xI_str_0 + i] * o_str_0 +
yIdx[b * yI_str_0 + j] * o_str_1]; yIdx[b * yI_str_0 + j] * o_str_1];
} }
...@@ -408,7 +408,7 @@ __global__ void _sgerBH_gen_small(const float *x[], int incx, ...@@ -408,7 +408,7 @@ __global__ void _sgerBH_gen_small(const float *x[], int incx,
int b, int m, int n) { int b, int m, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y; int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i > m || j > n) return; if (i >= m || j >= n) return;
for (int p = blockIdx.z; p < b; p += gridDim.z) { for (int p = blockIdx.z; p < b; p += gridDim.z) {
atomicAdd(&A[p][j * lda + i], atomicAdd(&A[p][j * lda + i],
alpha * x[p][i * incx] * y[p][j * incy]); alpha * x[p][i * incx] * y[p][j * incy]);
...@@ -608,7 +608,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -608,7 +608,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
alpha=alpha, fail=sub['fail']) alpha=alpha, fail=sub['fail'])
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) return (9,)
sparse_block_outer_ss = SparseBlockOuterSS(False) sparse_block_outer_ss = SparseBlockOuterSS(False)
......
...@@ -38,7 +38,7 @@ def blocksparse_data(): ...@@ -38,7 +38,7 @@ def blocksparse_data():
outputSize = 30 outputSize = 30
inputWindowSize = 7 inputWindowSize = 7
outputWindowSize = 9 outputWindowSize = 9
batchSize = 4 batchSize = 2
input = randn(batchSize, inputWindowSize, inputSize).astype('float32') input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
inputIndice = numpy.vstack(numpy.random.permutation(nInputBlock)[:inputWindowSize] for _ in range(batchSize)) inputIndice = numpy.vstack(numpy.random.permutation(nInputBlock)[:inputWindowSize] for _ in range(batchSize))
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论