if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
cudaGetErrorString(sts));
}
}
}
}
//version 8 is the same but we force the split. The split is needed in case we have too many threads. This happens frequently if the kernel length is big. Big kernels are frequent in the gradient.
//version 8 is the same but we force the split. The split is needed in case we have too many threads. This happens frequently if the kernel length is big. Big kernels are frequent in the gradient.
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
{
{
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.