提交 a8ce2352 authored 作者: James Bergstra's avatar James Bergstra

whitespace

上级 45b32281
...@@ -5,7 +5,7 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * ...@@ -5,7 +5,7 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray *
* version: -1, autodetect, >=0 a specific version to use. * version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation * If it can't be executed, we revert to the reference implementation
*/ */
int int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols, CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0) int version = -1, int verbose=0)
...@@ -38,8 +38,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -38,8 +38,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
assert (CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1]); assert (CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1]);
// we now search through a few implementations until one applies to our arguments. // we now search through a few implementations until one applies to our arguments.
//TODO: make separate version as if all fill this is slower. //TODO: make separate version as if all fill this is slower.
//TODO: Make a switch with power of 2 max size as template //TODO: Make a switch with power of 2 max size as template
//TODO: make a parameter the number of division //TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead? //TODO: Should we make them in separate grid block instead?
...@@ -149,7 +149,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -149,7 +149,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
img_len, img_wid, kern_len, kern_wid, nkern, nstack); img_len, img_wid, kern_len, kern_wid, nkern, nstack);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) printf("INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split); if (verbose) printf("INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split);
work_complete = true; work_complete = true;
...@@ -159,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -159,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split); if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split);
if (verbose) printf("INFO: impl 'conv_patch' failed (%s), trying next implementation\n", if (verbose) printf("INFO: impl 'conv_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
...@@ -218,7 +218,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -218,7 +218,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) if (verbose>1)
printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
...@@ -242,7 +242,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -242,7 +242,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
nb_split, preload_full_kernel); nb_split, preload_full_kernel);
if (verbose) printf("INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n", if (verbose) printf("INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
...@@ -277,7 +277,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -277,7 +277,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) printf("INFO: used 'conv_rows' version\n"); if (verbose) printf("INFO: used 'conv_rows' version\n");
...@@ -287,7 +287,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -287,7 +287,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_rows' failed (%s), trying next implementation\n", if (verbose) printf("INFO: impl 'conv_rows' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==5||version==-1) && (version==5||version==-1) &&
...@@ -306,7 +306,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -306,7 +306,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
dim3 threads(out_wid,nb_row); dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern); dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float); int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
...@@ -332,7 +332,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -332,7 +332,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
...@@ -343,7 +343,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -343,7 +343,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n", if (verbose) printf("INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
...@@ -420,7 +420,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -420,7 +420,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3)); threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n", if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
//version 8 is the same but we force the split. The split is need in case we have too much threads. This happen frequently if the kernel length is big. Big kernel is frequent in the gradient. //version 8 is the same but we force the split. The split is need in case we have too much threads. This happen frequently if the kernel length is big. Big kernel is frequent in the gradient.
...@@ -633,7 +633,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -633,7 +633,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) printf("INFO: used 'conv_reference_valid' version\n"); if (verbose) printf("INFO: used 'conv_reference_valid' version\n");
...@@ -651,7 +651,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -651,7 +651,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//return -1; //return -1;
} }
int int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0) CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
{ {
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file. const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
...@@ -833,7 +833,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -833,7 +833,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version); if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
if (verbose) printf("INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false")); if (verbose) printf("INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false"));
...@@ -1035,7 +1035,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -1035,7 +1035,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
return 0; return 0;
} }
PyObject * PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
CudaNdarray * out, const int mode, CudaNdarray * out, const int mode,
const int subsample_rows, const int subsample_cols, const int subsample_rows, const int subsample_cols,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论