提交 98a15fa1 作者: Arnaud Bergeron

Small forgotten speedup in SparseBlockDotGemvSS.

上级 9841c0db
...@@ -57,6 +57,7 @@ def gemv(alpha, A, x, beta, y): ...@@ -57,6 +57,7 @@ def gemv(alpha, A, x, beta, y):
x.gpudata, x.strides[0], x.gpudata, x.strides[0],
beta, y.gpudata, y.strides[0]) beta, y.gpudata, y.strides[0])
def ger(alpha, x, y, A): def ger(alpha, x, y, A):
assert A.shape[1] == x.shape[0] assert A.shape[1] == x.shape[0]
assert A.shape[0] == y.shape[0] assert A.shape[0] == y.shape[0]
...@@ -168,10 +169,12 @@ const npy_intp *oIdx ...@@ -168,10 +169,12 @@ const npy_intp *oIdx
if (%(n)s_iIdx_len < b*i) { if (%(n)s_iIdx_len < b*i) {
cudaFree(%(n)s_iIdx); cudaFree(%(n)s_iIdx);
if (cudaMalloc(&%(n)s_iIdx, b*i*sizeof(npy_intp)) != cudaSuccess) return -1; if (cudaMalloc(&%(n)s_iIdx, b*i*sizeof(npy_intp)) != cudaSuccess) return -1;
%(n)s_iIdx_len = b*i;
} }
if (%(n)s_oIdx_len < b*j) { if (%(n)s_oIdx_len < b*j) {
cudaFree(%(n)s_oIdx); cudaFree(%(n)s_oIdx);
if (cudaMalloc(&%(n)s_oIdx, b*j*sizeof(npy_intp)) != cudaSuccess) return -1; if (cudaMalloc(&%(n)s_oIdx, b*j*sizeof(npy_intp)) != cudaSuccess) return -1;
%(n)s_oIdx_len = b*j;
} }
return 0; return 0;
} }
...@@ -319,7 +322,7 @@ CudaNdarray_HOST_DIMS(%(o)s)[1], ...@@ -319,7 +322,7 @@ CudaNdarray_HOST_DIMS(%(o)s)[1],
W=W, fail=sub['fail'], name=nodename) W=W, fail=sub['fail'], name=nodename)
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (3,)
def grad(self, inputs, grads): def grad(self, inputs, grads):
o, W, h, inputIdx, outputIdx = inputs o, W, h, inputIdx, outputIdx = inputs
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论