提交 b16cebc1 authored 作者: Frederic's avatar Frederic

indent for readability.

上级 9be069e2
...@@ -32,14 +32,29 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -32,14 +32,29 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose>1) if (verbose>1)
{ {
fprintf(stderr, "INFO: Running conv_valid version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID); fprintf(stderr,
fprintf(stderr, "INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n", "INFO: Running conv_valid version=%d,"
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3], " MACRO kern_width=%d with inputs:\n",
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]); version, THEANO_KERN_WID);
fprintf(stderr, "INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n", fprintf(stderr,
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3], "INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]); CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
fprintf(stderr, "INFO: subsample_rows=%d, subsample_cols=%d\n", subsample_rows, subsample_cols); CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
fprintf(stderr,
"INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
fprintf(stderr,
"INFO: subsample_rows=%d, subsample_cols=%d\n",
subsample_rows, subsample_cols);
} }
//Check the output size is valid //Check the output size is valid
...@@ -98,9 +113,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -98,9 +113,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid); bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid); bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy //if the lower 2 dims are c_contiguous but flipped, unflipping the
// stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster) //allow to use a version that use less registers(so is faster)
//the unflipped version of variable haev the original value when we don't need to unflip it, but have the new value when we unflip it. //the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true; bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d; bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata; float * kern_data_unflipped = kern->devdata;
...@@ -115,8 +132,12 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -115,8 +132,12 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]); kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
} }
//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter in condition where we will lower the occupency due to shared memory and/or registers. //if we remove the restriction
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){ //img_size_byte+kern_size_byte>8*1024, we can enter in condition where
//we will lower the occupency due to shared memory and/or registers.
if ((version == -1) &&
(out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
out_size<=256){
//condition for exec //condition for exec
if(!subsample && if(!subsample &&
out_contiguous && out_contiguous &&
...@@ -158,14 +179,25 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -158,14 +179,25 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split); if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, nb_split);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
if (out_contiguous && if (out_contiguous &&
...@@ -246,31 +278,48 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -246,31 +278,48 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i," " shared_size=%i, nb_threads=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i,", " kern_flipped=true, accumulate=false, kern_width=%i,"
" subsample_rows=%i, subsample_cols=%i\n", " img_c_contiguous_2d=%i,"
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, " kern_c_contiguous_2d=%i, nb_split=%i,"
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, " preload_full_kernel=%i,",
nb_split, preload_full_kernel, subsample_rows, subsample_cols); " subsample_rows=%i, subsample_cols=%i\n",
if (verbose) fprintf(stderr, threads.x, threads.y, grid.x, grid.y,
"INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i," shared_size, threads.x * threads.y,
" subsample_rows=%i, subsample_cols=%i\n", THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split,preload_full_kernel, subsample_rows, subsample_cols); nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i"
" and preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n",
nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," fprintf(stderr,
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i,", " shared_size=%i, nb_threads=%i,"
" subsample_rows=%i, subsample_cols=%i\n", " kern_flipped=true, accumulate=false,"
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, " kern_width=%i, img_c_contiguous_2d=%i,"
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, " kern_c_contiguous_2d=%i, nb_split=%i,"
nb_split, preload_full_kernel, subsample_rows, subsample_cols); " preload_full_kernel=%i,"
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n", " subsample_rows=%i, subsample_cols=%i\n",
cudaGetErrorString(sts)); threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
...@@ -309,13 +358,22 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -309,13 +358,22 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) fprintf(stderr, "INFO: used 'conv_rows' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_rows' version\n");
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
...@@ -327,7 +385,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -327,7 +385,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{ {
int nb_row=1; int nb_row=1;
int max_threads=512; int max_threads=512;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP. //TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail) if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
...@@ -345,10 +404,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -345,10 +404,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int, int); int, int);
if (0) if (0)
fprintf(stderr, "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n", fprintf(stderr,
img_contiguous_2d, kern_contiguous_2d, "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
threads.x, threads.y, threads.z, img_contiguous_2d, kern_contiguous_2d,
grid.x, grid.y, grid.z); threads.x, threads.y, threads.z,
grid.x, grid.y, grid.z);
if(!img_contiguous_2d || !kern_contiguous_2d) { if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n"); //fprintf(stderr, "using false version\n");
...@@ -373,14 +433,28 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -373,14 +433,28 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1)
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack' version\n"); fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
...@@ -448,16 +522,32 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -448,16 +522,32 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", if (verbose>1)
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); fprintf(stderr,
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
" %d row(s).\n",
(version==9?"'load full kernel'":
"'load 1 kern row at a time'"),nb_row);
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n", if (verbose)
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3)); fprintf(stderr,
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n", "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
cudaGetErrorString(sts)); " shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
...@@ -556,11 +646,18 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -556,11 +646,18 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>; else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID); CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) fprintf(stderr, "INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n", if (verbose)
kern_flipped,ccontig,nb_split,full_kern); fprintf(stderr,
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", "INFO: using 'conv_patch_stack_reduce' version"
threads.x, threads.y, threads.z, grid.x, grid.y, " kern_flipped=%i ccontig=%i nb_split=%d,"
shared_size, threads.x * threads.y * threads.z); " preload_full_kern=%d\n",
kern_flipped, ccontig, nb_split, full_kern);
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i,"
" grid.y=%i, shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata, f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid, img_len, img_wid, kern_len, kern_wid,
nkern, nstack, nkern, nstack,
...@@ -575,9 +672,19 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -575,9 +672,19 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} // else no good nb_splits was found } // else no good nb_splits was found
} }
...@@ -651,13 +758,22 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -651,13 +758,22 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, shared_size=%i, nb_threads=%i\n", n_threads.x, n_threads.y, n_blocks, n_reduce_buf, n_threads.x * n_threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_valid_row_reduce' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks,
n_reduce_buf, n_threads.x * n_threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
...@@ -665,32 +781,61 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -665,32 +781,61 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{ {
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK); int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (1) if (1)
{ {
if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n"); if (verbose)
if (verbose>1) fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, "INFO: launching conv_reference_valid\n");
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid, if (verbose>1)
img->devdata, fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]); nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
if (verbose>1) fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n", img->devdata,
nkern, nstack, kern_len, kern_wid, CudaNdarray_HOST_STRIDES(img)[0],
kern->devdata, CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3] CudaNdarray_HOST_STRIDES(img)[2],
); CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose>1) fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n", if (verbose>1)
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid, fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
out->devdata, nkern, nstack, kern_len, kern_wid,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]); kern->devdata,
if (verbose>1) fprintf(stderr, " launch params: %i %i %i\n", outsize, n_blocks, n_threads); CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
if (verbose>1)
fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0],
CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
} }
conv_reference_valid<<<n_blocks, n_threads>>>( nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1], conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid, img_len, img_wid,
kern_len, kern_wid, kern_len, kern_wid,
out_len, out_wid, out_len, out_wid,
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3], img->devdata,
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3], CudaNdarray_HOST_STRIDES(img)[0],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3], CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
...@@ -698,26 +843,29 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -698,26 +843,29 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) fprintf(stderr, "INFO: used 'conv_reference_valid' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
} }
else else
{ {
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_valid! (%s)", PyErr_Format(PyExc_RuntimeError,
cudaGetErrorString(sts)); "ERROR: all implementations failed for"
" CudaNdarray_conv_valid! (%s)",
cudaGetErrorString(sts));
return -1; return -1;
} }
} }
assert (work_complete); assert (work_complete);
return 0; return 0;
//PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
//return -1;
} }
int int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0) CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows,
int subsample_cols, int version = -1, int verbose=0)
{ {
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file. //144 is the biggest static shared size used with compiling this file.
const int shared_avail = SHARED_SIZE - 150;
int work_complete = 0; int work_complete = 0;
if (img->nd != 4) if (img->nd != 4)
...@@ -775,9 +923,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -775,9 +923,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
//const int out_size_byte = out_size*sizeof(float); // unused //const int out_size_byte = out_size*sizeof(float); // unused
if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){ if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) ||
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for" (THEANO_KERN_WID == 0))){
" %d kernel columns, but the kernel we received had %d columns!", PyErr_Format(PyExc_ValueError,
"ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received"
" had %d columns!",
THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]); THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
return -1; return -1;
} }
...@@ -793,9 +944,11 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -793,9 +944,11 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy //if the lower 2 dims are c_contiguous but flipped, unflipping the
//stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster) //allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it. //the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true; bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d; bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata; float * kern_data_unflipped = kern->devdata;
...@@ -812,13 +965,22 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -812,13 +965,22 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
if (verbose>1) if (verbose>1)
{ {
printf("INFO: Running conv_full version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID); printf("INFO: Running conv_full version=%d,"
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n", printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n", printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]); CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
} }
if (!subsample && if (!subsample &&
...@@ -840,13 +1002,16 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -840,13 +1002,16 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
} }
if(img_size_padded_byte+kern_size_byte>shared_avail) version=5; if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple
//of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory //Max of 16k of shared memory
if(version==5) if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++; while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register //327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per threads, but their is some other stuff that for the limit to bu lower then 512. //version 5 will have only 1 block running at a time, so we
//can use 32 registers per threads, but their is some other stuff that
//for the limit to bu lower then 512.
int max_thread = (version!=5?327:450); int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4; if(version==-1 && out_size>512)version=4;
...@@ -855,7 +1020,8 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -855,7 +1020,8 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
if(version==-1 && nb_split>1) version=4; if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3; else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more than 1 split as to be always execute. //force version 4 when more than 1 split are needed to always execute.
else if(version==3 && nb_split!=1) version=4;
assert(version!=3 || nb_split==1); assert(version!=3 || nb_split==1);
assert(version!=5 || kern_len>1); assert(version!=5 || kern_len>1);
...@@ -901,16 +1067,40 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -901,16 +1067,40 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version); if (verbose>1)
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false")); fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_full_patch_stack_padded'"
" nb_split=%d low_mem=%s\n",
nb_split, (version==5?"true":"false"));
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n", fprintf(stderr,
version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"), "threads.x=%i, threads.y=%i, threads.z=%i,"
cudaGetErrorString(sts)); " grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch_stack_padded' %s %s"
" failed (%s), trying next implementation\n",
version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts));
} }
} }
...@@ -943,9 +1133,17 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -943,9 +1133,17 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
if (false && !subsample && //disabled as test fail for this kernel if (false && !subsample && //disabled as test fail for this kernel
...@@ -993,10 +1191,17 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -993,10 +1191,17 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_load_everything' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
} " shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
} }
if (!subsample && if (!subsample &&
...@@ -1034,14 +1239,21 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -1034,14 +1239,21 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
if (1 && !work_complete) //conv_reference_full if (1 && !work_complete) //conv_reference_full
...@@ -1050,52 +1262,100 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -1050,52 +1262,100 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK); int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (0) if (0)
{ {
if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n"); if (verbose)
if (verbose) fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, "INFO: launching conv_reference_valid\n");
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3], if (verbose)
img->devdata, fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_DIMS(img)[0],
if (verbose) fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n", CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(img)[2],
kern->devdata, CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3] img->devdata,
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose)
fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0],
CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2],
CudaNdarray_HOST_DIMS(kern)[3],
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]
); );
if (verbose) fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n", if (verbose)
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3], fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
out->devdata, CudaNdarray_HOST_DIMS(out)[0],
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]); CudaNdarray_HOST_DIMS(out)[1],
if (verbose) fprintf(stderr, " launch params: %i %i %i\n", outsize, n_blocks, n_threads); CudaNdarray_HOST_DIMS(out)[2],
if (verbose) fprintf(stderr, " subsample params: %i %i\n", subsample_rows, subsample_cols); CudaNdarray_HOST_DIMS(out)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
if (verbose)
fprintf(stderr, " subsample params: %i %i\n",
subsample_rows, subsample_cols);
} }
conv_reference_full<<<n_blocks, n_threads>>>(CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(img)[1], conv_reference_full<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3], img->devdata, CudaNdarray_HOST_STRIDES(img)[0],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3], CudaNdarray_HOST_STRIDES(img)[1],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_reference_full' version ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d nkern=%d nstack=%d subsample=%d\n", if (verbose)
img_len,img_wid, kern_len, kern_wid, fprintf(stderr, "INFO: used 'conv_reference_full' version"
out_len, out_wid, nbatch, nkern, nstack, subsample); " ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
" nkern=%d nstack=%d subsample=%d\n",
img_len,img_wid, kern_len, kern_wid,
out_len, out_wid, nbatch, nkern, nstack, subsample);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n", fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
cudaGetErrorString(sts)); " shared_size=%i, nb_threads=%i\n",
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_full! (%s)", n_threads, 1, n_blocks, 1, 0, n_threads);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
return -1; return -1;
} }
} }
return 0; return 0;
...@@ -1110,8 +1370,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1110,8 +1370,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified. // Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1. // If the out object is re-used then it is returned, and its refcount is incremented by 1.
// //
if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;} if (img->nd != 4)
if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;} {
PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required");
return NULL;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required");
return NULL;
}
int out_dim[4]; int out_dim[4];
out_dim[0] = CudaNdarray_HOST_DIMS(img)[0]; out_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
...@@ -1145,7 +1413,10 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1145,7 +1413,10 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
} }
else else
{ {
if (out && verbose) fprintf(stderr, "INFO: Conv is ignoring 'out' argument with wrong structure.\n"); if (out && verbose)
fprintf(stderr,
"INFO: Conv is ignoring 'out' argument with wrong"
" structure.\n");
rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim); rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
//rval might be null //rval might be null
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论