提交 2955b33c authored 作者: Frederic's avatar Frederic

Reuse pre-allocated memory.

上级 998b9bc4
...@@ -577,8 +577,9 @@ class GpuConvMM(GpuOp): ...@@ -577,8 +577,9 @@ class GpuConvMM(GpuOp):
return ['cuda_ndarray.cuh', '<stdio.h>'] return ['cuda_ndarray.cuh', '<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 22) return (0, 21)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -591,8 +592,8 @@ class GpuConvMM(GpuOp): ...@@ -591,8 +592,8 @@ class GpuConvMM(GpuOp):
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp img, kern = inp
out, = out_ out, = out_
dx = self.subsample dx = self.subsample[0]
dy = self.subsample dy = self.subsample[1]
border_mode = self.border_mode border_mode = self.border_mode
sub = sub.copy() sub = sub.copy()
pad = self.pad pad = self.pad
...@@ -606,7 +607,9 @@ class GpuConvMM(GpuOp): ...@@ -606,7 +607,9 @@ class GpuConvMM(GpuOp):
//Optional args //Optional args
int dx = %(dx)s; int dx = %(dx)s;
int dy = %(dy)s; int dy = %(dy)s;
CudaNdarray * img = %(img)s;
CudaNdarray * kern = %(kern)s;
CudaNdarray * out2 = NULL;
int mode; int mode;
if (strcmp(mode_str, "full") == 0) if (strcmp(mode_str, "full") == 0)
{ {
...@@ -620,17 +623,45 @@ class GpuConvMM(GpuOp): ...@@ -620,17 +623,45 @@ class GpuConvMM(GpuOp):
{ {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"mode must be one of 'full' or 'valid'"); "mode must be one of 'full' or 'valid'");
return NULL; %(fail)s;
} }
//TODO: Send self.pad, stride, etc //TODO: Send self.pad, stride, etc
CudaNdarray * out2 = validMM(%(img)s, %(kern)s, %(out)s);
// TODO, make out be decref before we alloc out2!
Py_XDECREF(%(out)s);
%(out)s = out2;
if (%(out)s==NULL){ int out_dim[4];
%(fail)s out_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(kern)[0];
int logical_rows, logical_cols;
if (mode == 1)
{
logical_rows = CudaNdarray_HOST_DIMS(img)[2] - CudaNdarray_HOST_DIMS(kern)[2] + 1;
logical_cols = CudaNdarray_HOST_DIMS(img)[3] - CudaNdarray_HOST_DIMS(kern)[3] + 1;
}
else
{
logical_rows = CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1;
logical_cols = CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1;
}
out_dim[2] = ceil_intdiv(logical_rows, dx);
out_dim[3] = ceil_intdiv(logical_cols, dy);
if ( !(%(out)s
&& %(out)s->nd==4
&& CudaNdarray_is_c_contiguous(%(out)s)
&& CudaNdarray_HOST_DIMS(%(out)s)[0]==out_dim[0]
&& CudaNdarray_HOST_DIMS(%(out)s)[1]==out_dim[1]
&& CudaNdarray_HOST_DIMS(%(out)s)[2]==out_dim[2]
&& CudaNdarray_HOST_DIMS(%(out)s)[3]==out_dim[3]))
{
Py_XDECREF(%(out)s);
%(out)s = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
}
out2 = validMM(%(img)s, %(kern)s, %(out)s);
if (out2==NULL){
%(fail)s
} }
assert (out2 == %(out)s);
""" % sub """ % sub
......
...@@ -119,15 +119,18 @@ CudaNdarray* validMM(const CudaNdarray *input, ...@@ -119,15 +119,18 @@ CudaNdarray* validMM(const CudaNdarray *input,
long inputWidth = CudaNdarray_HOST_DIMS(input)[3]; long inputWidth = CudaNdarray_HOST_DIMS(input)[3];
long outputWidth = (inputWidth + 2*padding - kW) / dW + 1; long outputWidth = (inputWidth + 2*padding - kW) / dW + 1;
long outputHeight = (inputHeight + 2*padding - kH) / dH + 1; long outputHeight = (inputHeight + 2*padding - kH) / dH + 1;
// Allocate output, size (batchSize, nOutputPlane, // check output, size (batchSize, nOutputPlane,
// outputHeight, outputWidth); // outputHeight, outputWidth);
int out_dim[4];
out_dim[0] = batchSize;
out_dim[1] = nOutputPlane;
out_dim[2] = outputHeight;
out_dim[3] = outputWidth;
output = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim); if (batchSize != CudaNdarray_HOST_DIMS(output)[0] ||
nOutputPlane != CudaNdarray_HOST_DIMS(output)[1] ||
outputHeight != CudaNdarray_HOST_DIMS(output)[2] ||
outputWidth != CudaNdarray_HOST_DIMS(output)[3]){
PyErr_SetString(PyExc_ValueError,
"GpuConvMM outputs parameter don't have the good shape\n"
);
return NULL;
}
// Create temporary columns // Create temporary columns
int col_dim[2]; int col_dim[2];
col_dim[0] = nInputPlane*kW*kH; col_dim[0] = nInputPlane*kW*kH;
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论