提交 4d2769d2 authored 作者: Frederic Bastien's avatar Frederic Bastien

fix white space.

上级 22583950
...@@ -44,59 +44,59 @@ class GpuConv3D(theano.Op): ...@@ -44,59 +44,59 @@ class GpuConv3D(theano.Op):
H = outputs[0] H = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConv3D > ///////////// < code generated by GpuConv3D >
//printf("\t\t\t\tConv3DGPU c code\\n"); //printf("\t\t\t\tConv3DGPU c code\\n");
//Check dimensionality of inputs //Check dimensionality of inputs
if (%(W)s->nd != 5) if (%(W)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
%(fail)s %(fail)s
} }
if (%(V)s->nd != 5) if (%(V)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
%(fail)s %(fail)s
} }
if (%(b)s->nd != 1) if (%(b)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
%(fail)s %(fail)s
} }
if (%(d)s->nd != 1) if (%(d)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
%(fail)s %(fail)s
} }
if (%(d)s->dimensions[0] != 3) if (%(d)s->dimensions[0] != 3)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]); PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s %(fail)s
} }
{ //extra scope so fail doesn't jump over declarations { //extra scope so fail doesn't jump over declarations
//Read and check sizes of inputs //Read and check sizes of inputs
const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0]; const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0]; const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4]; const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels) if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
{ {
PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels); PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
%(fail)s %(fail)s
} }
{ //extra scope so error handler jumps don't cause errors { //extra scope so error handler jumps don't cause errors
const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1]; const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2]; const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3]; const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1]; const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2]; const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3]; const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
if (vidHeight < filterHeight) if (vidHeight < filterHeight)
{ {
PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight); PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
...@@ -116,58 +116,58 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); ...@@ -116,58 +116,58 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
} }
{ // extra scope so fail works { // extra scope so fail works
//Read and check stride arguments //Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0); const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1); const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2); const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0) if (dr <= 0 || dc <= 0 || dt <= 0)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt); PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
%(fail)s %(fail)s
} }
{ // extra scope so fail works { // extra scope so fail works
//Make correctly sized output //Make correctly sized output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1; const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1; const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1; const int outputDur = int( (vidDur - filterDur) / dt ) +1;
npy_intp dims[5]; npy_intp dims[5];
dims[0] = batchSize; dims[0] = batchSize;
dims[4] = outputChannels; dims[4] = outputChannels;
dims[1] = outputHeight; dims[1] = outputHeight;
dims[2] = outputWidth; dims[2] = outputWidth;
dims[3] = outputDur; dims[3] = outputDur;
if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] || if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] || CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] || CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] || CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){ CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
Py_XDECREF(%(H)s); Py_XDECREF(%(H)s);
%(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims); %(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(H)s)) { if (!(%(H)s)) {
PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output"); PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
%(fail)s %(fail)s
} }
} }
{ // extra scope so fail will not cross declarations { // extra scope so fail will not cross declarations
//#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )#################### //#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )####################
const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4]; const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4]; const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3]; const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3]; const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2]; const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2]; const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1]; const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1]; const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0]; const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0]; const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
// Compute H // Compute H
//H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z] //H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z]
bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s); bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s);
int version = -1; int version = -1;
...@@ -196,7 +196,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 & ...@@ -196,7 +196,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\\n"); if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
} }
else else
...@@ -217,7 +217,7 @@ if(!work_complete){ ...@@ -217,7 +217,7 @@ if(!work_complete){
} }
}}}}}}} //extra scope so error handler jumps don't cross declarations }}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by GpuConv3D > ///////////// < /code generated by GpuConv3D >
""" """
return strutil.renderString(codeSource,locals()) return strutil.renderString(codeSource,locals())
...@@ -231,14 +231,14 @@ __global__ void ...@@ -231,14 +231,14 @@ __global__ void
//grid block size =(out_len*out_wid, nb kern *nb batch) //grid block size =(out_len*out_wid, nb kern *nb batch)
// //
conv_rows_stack( float* img, float* kern, float* bias, float* out, conv_rows_stack( float* img, float* kern, float* bias, float* out,
int img_len, int img_wid, int img_dur, int img_len, int img_wid, int img_dur,
int kern_height, int kern_wid, int kern_dur, int kern_height, int kern_wid, int kern_dur,
int nkern, int input_channels, int nkern, int input_channels,
int dr, int dc, int dt, int dr, int dc, int dt,
int img_stride_frame, int img_stride_col, int img_stride_row, int img_stride_frame, int img_stride_col, int img_stride_row,
int img_stride_ochannel, int img_stride_batch, int img_stride_ochannel, int img_stride_batch,
int kern_stride_frame, int kern_stride_col, int kern_stride_row, int kern_stride_frame, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_okern) int kern_stride_stack, int kern_stride_okern)
{ {
int __shared__ out_len, out_wid, out_dur, batch_id, kern_id; int __shared__ out_len, out_wid, out_dur, batch_id, kern_id;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
...@@ -257,18 +257,18 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out, ...@@ -257,18 +257,18 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,
kern += kern_id*kern_stride_okern; kern += kern_id*kern_stride_okern;
float sum = 0.0f; float sum = 0.0f;
for (int z = 0; z < input_channels; z++) {//1 for first layer for (int z = 0; z < input_channels; z++) {//1 for first layer
for (int k =0; k < kern_height; k++) { for (int k =0; k < kern_height; k++) {
for (int l = 0; l < kern_wid; l++) { for (int l = 0; l < kern_wid; l++) {
for (int m = 0; m < kern_dur; m++) { for (int m = 0; m < kern_dur; m++) {
sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] * sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] *
kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m]; kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m];
} }
} }
} }
out[batch_id*nkern*out_len*out_wid*out_dur+//the good batch out[batch_id*nkern*out_len*out_wid*out_dur+//the good batch
out_frame*nkern+//the output frame out_frame*nkern+//the output frame
out_row*out_wid*out_dur*nkern+//the output row out_row*out_wid*out_dur*nkern+//the output row
out_col*out_dur*nkern + //the output_col out_col*out_dur*nkern + //the output_col
kern_id //the output image (channel) kern_id //the output image (channel)
] = sum + bias[kern_id]; ] = sum + bias[kern_id];
......
...@@ -243,7 +243,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple ...@@ -243,7 +243,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\\n"); if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
} }
else else
...@@ -276,16 +276,16 @@ __global__ void ...@@ -276,16 +276,16 @@ __global__ void
//grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3]) //grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])
// //
convgrad_rows_stack( float* img, float* dCdH, float* dCdW, convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
int img_len, int img_wid, int img_dur, int img_len, int img_wid, int img_dur,
int dCdW_len, int dCdW_wid, int dCdW_dur, int dCdW_len, int dCdW_wid, int dCdW_dur,
int wsh0, int wsh1, int wsh2, int wsh3, int wsh4, int wsh0, int wsh1, int wsh2, int wsh3, int wsh4,
int out_len, int out_wid, int out_dur, int out_len, int out_wid, int out_dur,
int batchSize, int nkern, int nstack, int batchSize, int nkern, int nstack,
int dr, int dc, int dt, int dr, int dc, int dt,
int img_stride_frame, int img_stride_col, int img_stride_row, int img_stride_frame, int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int dCdW_stride_frame, int dCdW_stride_col, int dCdW_stride_row, int dCdW_stride_frame, int dCdW_stride_col, int dCdW_stride_row,
int dCdW_stride_stack, int dCdW_stride_nkern) int dCdW_stride_stack, int dCdW_stride_nkern)
{ {
int __shared__ kern_id, stack_id; int __shared__ kern_id, stack_id;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论