提交 4d2769d2 authored 作者: Frederic Bastien's avatar Frederic Bastien

fix white space.

上级 22583950
......@@ -44,59 +44,59 @@ class GpuConv3D(theano.Op):
H = outputs[0]
codeSource = """
///////////// < code generated by GpuConv3D >
///////////// < code generated by GpuConv3D >
//printf("\t\t\t\tConv3DGPU c code\\n");
//printf("\t\t\t\tConv3DGPU c code\\n");
//Check dimensionality of inputs
if (%(W)s->nd != 5)
{
//Check dimensionality of inputs
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
%(fail)s
}
}
if (%(V)s->nd != 5)
{
if (%(V)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
%(fail)s
}
}
if (%(b)s->nd != 1)
{
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
%(fail)s
}
}
if (%(d)s->nd != 1)
{
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
%(fail)s
}
if (%(d)s->dimensions[0] != 3)
{
}
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s
}
}
{ //extra scope so fail doesn't jump over declarations
//Read and check sizes of inputs
const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
{
//Read and check sizes of inputs
const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
%(fail)s
}
}
{ //extra scope so error handler jumps don't cause errors
const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
......@@ -116,58 +116,58 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
}
{ // extra scope so fail works
//Read and check stride arguments
//Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
%(fail)s
}
}
{ // extra scope so fail works
//Make correctly sized output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = outputChannels;
dims[1] = outputHeight;
dims[2] = outputWidth;
dims[3] = outputDur;
if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
Py_XDECREF(%(H)s);
%(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(H)s)) {
//Make correctly sized output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = outputChannels;
dims[1] = outputHeight;
dims[2] = outputWidth;
dims[3] = outputDur;
if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
Py_XDECREF(%(H)s);
%(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(H)s)) {
PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
%(fail)s
}
}
}
}
{ // extra scope so fail will not cross declarations
//#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )####################
const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
// Compute H
//H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z]
//#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )####################
const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
// Compute H
//H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z]
bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s);
int version = -1;
......@@ -196,7 +196,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
}
else
......@@ -217,7 +217,7 @@ if(!work_complete){
}
}}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by GpuConv3D >
///////////// < /code generated by GpuConv3D >
"""
return strutil.renderString(codeSource,locals())
......@@ -231,14 +231,14 @@ __global__ void
//grid block size =(out_len*out_wid, nb kern *nb batch)
//
conv_rows_stack( float* img, float* kern, float* bias, float* out,
int img_len, int img_wid, int img_dur,
int img_len, int img_wid, int img_dur,
int kern_height, int kern_wid, int kern_dur,
int nkern, int input_channels,
int nkern, int input_channels,
int dr, int dc, int dt,
int img_stride_frame, int img_stride_col, int img_stride_row,
int img_stride_ochannel, int img_stride_batch,
int img_stride_ochannel, int img_stride_batch,
int kern_stride_frame, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_okern)
int kern_stride_stack, int kern_stride_okern)
{
int __shared__ out_len, out_wid, out_dur, batch_id, kern_id;
float __shared__ *d_img, *d_kern;
......@@ -257,18 +257,18 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,
kern += kern_id*kern_stride_okern;
float sum = 0.0f;
for (int z = 0; z < input_channels; z++) {//1 for first layer
for (int k =0; k < kern_height; k++) {
for (int l = 0; l < kern_wid; l++) {
for (int m = 0; m < kern_dur; m++) {
sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] *
kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m];
for (int k =0; k < kern_height; k++) {
for (int l = 0; l < kern_wid; l++) {
for (int m = 0; m < kern_dur; m++) {
sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] *
kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m];
}
}
}
out[batch_id*nkern*out_len*out_wid*out_dur+//the good batch
out_frame*nkern+//the output frame
out_row*out_wid*out_dur*nkern+//the output row
out_frame*nkern+//the output frame
out_row*out_wid*out_dur*nkern+//the output row
out_col*out_dur*nkern + //the output_col
kern_id //the output image (channel)
] = sum + bias[kern_id];
......
......@@ -243,7 +243,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
}
else
......@@ -276,16 +276,16 @@ __global__ void
//grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])
//
convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
int img_len, int img_wid, int img_dur,
int img_len, int img_wid, int img_dur,
int dCdW_len, int dCdW_wid, int dCdW_dur,
int wsh0, int wsh1, int wsh2, int wsh3, int wsh4,
int out_len, int out_wid, int out_dur,
int batchSize, int nkern, int nstack,
int batchSize, int nkern, int nstack,
int dr, int dc, int dt,
int img_stride_frame, int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch,
int img_stride_stack, int img_stride_batch,
int dCdW_stride_frame, int dCdW_stride_col, int dCdW_stride_row,
int dCdW_stride_stack, int dCdW_stride_nkern)
int dCdW_stride_stack, int dCdW_stride_nkern)
{
int __shared__ kern_id, stack_id;
float __shared__ *d_img, *d_kern;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论