// REMEMBER TO RAISE c_code_cache_version when changing this file
//
// Convolution border-mode selector passed as the `mode` argument to
// CudaNdarray_Conv (FULL = 0, VALID = 1 by enum default numbering).
// NOTE(review): the original chunk declared this enum twice, which is a
// redefinition error in C++; keep a single definition.
enum { ConvMode_FULL, ConvMode_VALID };
// Forward declaration of the convolution entry point.
// img/kern/out: device ndarrays (out receives the result).
// mode: one of the ConvMode_* enum values.
// subsample_rows/subsample_cols: output subsampling strides.
// version: which kernel implementation to force (-1 = auto-select — TODO confirm against definition).
// verbose: when non-zero, prints implementation-selection diagnostics.
// Returns a PyObject* (presumably a borrowed/new reference per the Python C
// API — verify against the definition, which is not visible in this chunk).
// NOTE(review): the original chunk repeated this declaration twice; one copy suffices.
PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
if (verbose) printf("INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
}
}
else
else
{
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
cudaGetErrorString(sts));
}
}
}
}
//version 8 is the same but we force the split. The split is needed in case we have too many threads. This happens frequently if the kernel length is big. Big kernels are frequent in the gradient.
//version 8 is the same but we force the split.
// The split is needed in case we have too many threads.
// This happens frequently if the kernel length is big.
// Big kernels are frequent in the gradient.
//version 8 needs a minimum kernel length as we force the split.
//version 8 needs a minimum kernel length as we force the split.
//version 8 is needed to test this kernel template parameter more easily.
//version 8 is needed to test this kernel template parameter more easily.
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 need a minimal kernel length as big as the split.
//version 13 needs a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
{
{
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.