提交 553b256e authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merge pull request #540 from nouiz/gpuconv

Gpuconv
...@@ -10,7 +10,7 @@ Documentation ...@@ -10,7 +10,7 @@ Documentation
Interface changes Interface changes
* In 0.5, we removed the deprecated sharedvar.value property. * In 0.5, we removed the deprecated sharedvar.value property.
Now we raise an error if you access it. Now we raise an error if you access it. (Frederic B.)
* theano.function does not accept duplicate inputs, so function([x, x], ...) * theano.function does not accept duplicate inputs, so function([x, x], ...)
does not work anymore. (Pascal L.) does not work anymore. (Pascal L.)
* theano.function now raises an error if some of the provided inputs are * theano.function now raises an error if some of the provided inputs are
...@@ -23,15 +23,16 @@ New Features ...@@ -23,15 +23,16 @@ New Features
* debugprint new param ids=["CHAR", "id", "int", ""] * debugprint new param ids=["CHAR", "id", "int", ""]
This makes the identifier printed to be the python id, a unique char, a This makes the identifier printed to be the python id, a unique char, a
unique int, or not have it printed. We changed the default to be "CHAR" unique int, or not have it printed. We changed the default to be "CHAR"
as this is more readable. as this is more readable. (Frederic B.)
* debugprint new param stop_on_name=[False, True]. If True, we don't print * debugprint new param stop_on_name=[False, True]. If True, we don't print
anything below an intermediate variable that has a name. Defaults to False. anything below an intermediate variable that has a name. Defaults to False.
* debugprint does not print anymore the "|" symbol in a column after the last input. (Frederic B.)
* debugprint does not print anymore the "|" symbol in a column after the last input. (Frederic B.)
* If you use Enthought Python Distribution (EPD) now we use its blas * If you use Enthought Python Distribution (EPD) now we use its blas
implementation by default. implementation by default. (Frederic B.)
Sparse Sandbox graduate Sparse Sandbox graduate
* Remove0 op: it remove store element with value 0. * Remove0 op: it remove store element with value 0. (Frederic B.)
Sparse Sandbox Addition (Not reviewed/documented/tested, but used by some people) Sparse Sandbox Addition (Not reviewed/documented/tested, but used by some people)
* They are all in the theano.sparse.sandbox.sp2 module * They are all in the theano.sparse.sandbox.sp2 module
...@@ -50,7 +51,9 @@ Crash Fix ...@@ -50,7 +51,9 @@ Crash Fix
empty string (Frederic B.) empty string (Frederic B.)
* When importing theano on a computer without GPU with the Theano * When importing theano on a computer without GPU with the Theano
flags 'device' or 'init_gpu_device' set to gpu* (Frederic B., reported by Luo Heng) flags 'device' or 'init_gpu_device' set to gpu* (Frederic B., reported by Luo Heng)
* Optimization print useless error when scipy is not available. (Frederic B.)
* Gpu conv crash/slowdown on newer hardware? (James B.)
* Better error handling in gpu conv (Frederic B.)
============= =============
Release Notes Release Notes
......
...@@ -704,7 +704,7 @@ class GpuConv(GpuOp): ...@@ -704,7 +704,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 17) return (0, 18)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
......
...@@ -32,14 +32,29 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -32,14 +32,29 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose>1) if (verbose>1)
{ {
fprintf(stderr, "INFO: Running conv_valid version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID); fprintf(stderr,
fprintf(stderr, "INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n", "INFO: Running conv_valid version=%d,"
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3], " MACRO kern_width=%d with inputs:\n",
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]); version, THEANO_KERN_WID);
fprintf(stderr, "INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n", fprintf(stderr,
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3], "INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]); CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
fprintf(stderr, "INFO: subsample_rows=%d, subsample_cols=%d\n", subsample_rows, subsample_cols); CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
fprintf(stderr,
"INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
fprintf(stderr,
"INFO: subsample_rows=%d, subsample_cols=%d\n",
subsample_rows, subsample_cols);
} }
//Check the output size is valid //Check the output size is valid
...@@ -98,9 +113,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -98,9 +113,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid); bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid); bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy //if the lower 2 dims are c_contiguous but flipped, unflipping the
// stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster) //allow to use a version that use less registers(so is faster)
//the unflipped version of variable haev the original value when we don't need to unflip it, but have the new value when we unflip it. //the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true; bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d; bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata; float * kern_data_unflipped = kern->devdata;
...@@ -115,8 +132,12 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -115,8 +132,12 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]); kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
} }
//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter in condition where we will lower the occupency due to shared memory and/or registers. //if we remove the restriction
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){ //img_size_byte+kern_size_byte>8*1024, we can enter in condition where
//we will lower the occupency due to shared memory and/or registers.
if ((version == -1) &&
(out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
out_size<=256){
//condition for exec //condition for exec
if(!subsample && if(!subsample &&
out_contiguous && out_contiguous &&
...@@ -158,13 +179,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -158,13 +179,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split); if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch' failed (%s), trying next implementation\n", fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, nb_split);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -246,30 +278,47 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -246,30 +278,47 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i," " shared_size=%i, nb_threads=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i,", " kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,",
" subsample_rows=%i, subsample_cols=%i\n", " subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel, subsample_rows, subsample_cols); nb_split, preload_full_kernel,
if (verbose) fprintf(stderr, subsample_rows, subsample_cols);
"INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i," if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i"
" and preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n", " subsample_rows=%i, subsample_cols=%i\n",
nb_split,preload_full_kernel, subsample_rows, subsample_cols); nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," fprintf(stderr,
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i,", " shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n", " subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel, subsample_rows, subsample_cols); nb_split, preload_full_kernel,
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n", subsample_rows, subsample_cols);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -309,12 +358,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -309,12 +358,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) fprintf(stderr, "INFO: used 'conv_rows' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_rows' version\n");
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows' failed (%s), trying next implementation\n", fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -327,7 +385,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -327,7 +385,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{ {
int nb_row=1; int nb_row=1;
int max_threads=512; int max_threads=512;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP. //TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail) if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
...@@ -345,7 +404,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -345,7 +404,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int, int); int, int);
if (0) if (0)
fprintf(stderr, "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n", fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d, img_contiguous_2d, kern_contiguous_2d,
threads.x, threads.y, threads.z, threads.x, threads.y, threads.z,
grid.x, grid.y, grid.z); grid.x, grid.y, grid.z);
...@@ -373,13 +433,27 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -373,13 +433,27 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1)
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack' version\n"); fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n", fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -448,15 +522,31 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -448,15 +522,31 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", if (verbose>1)
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); fprintf(stderr,
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
" %d row(s).\n",
(version==9?"'load full kernel'":
"'load 1 kern row at a time'"),nb_row);
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n", if (verbose)
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3)); fprintf(stderr,
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n", "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -556,9 +646,16 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -556,9 +646,16 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>; else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID); CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) fprintf(stderr, "INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n", if (verbose)
kern_flipped,ccontig,nb_split,full_kern); fprintf(stderr,
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", "INFO: using 'conv_patch_stack_reduce' version"
" kern_flipped=%i ccontig=%i nb_split=%d,"
" preload_full_kern=%d\n",
kern_flipped, ccontig, nb_split, full_kern);
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i,"
" grid.y=%i, shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y, threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z); shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata, f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
...@@ -575,8 +672,18 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -575,8 +672,18 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n", fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} // else no good nb_splits was found } // else no good nb_splits was found
...@@ -651,12 +758,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -651,12 +758,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, shared_size=%i, nb_threads=%i\n", n_threads.x, n_threads.y, n_blocks, n_reduce_buf, n_threads.x * n_threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_valid_row_reduce' failed (%s), trying next implementation\n", fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks,
n_reduce_buf, n_threads.x * n_threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -665,32 +781,61 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -665,32 +781,61 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{ {
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK); int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (1) if (1)
{ {
if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n"); if (verbose)
if (verbose>1) fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose>1)
fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid, nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
img->devdata, img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_STRIDES(img)[0],
if (verbose>1) fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n", CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
nkern, nstack, kern_len, kern_wid, nkern, nstack, kern_len, kern_wid,
kern->devdata, kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3] CudaNdarray_HOST_STRIDES(kern)[0],
); CudaNdarray_HOST_STRIDES(kern)[1],
if (verbose>1) fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n", CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid, CudaNdarray_HOST_STRIDES(kern)[3]);
if (verbose>1)
fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0],
CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
out->devdata, out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]); CudaNdarray_HOST_STRIDES(out)[0],
if (verbose>1) fprintf(stderr, " launch params: %i %i %i\n", outsize, n_blocks, n_threads); CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
} }
conv_reference_valid<<<n_blocks, n_threads>>>( nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1], conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid, img_len, img_wid,
kern_len, kern_wid, kern_len, kern_wid,
out_len, out_wid, out_len, out_wid,
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3], img->devdata,
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3], CudaNdarray_HOST_STRIDES(img)[0],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3], CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
...@@ -698,26 +843,37 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -698,26 +843,37 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) fprintf(stderr, "INFO: used 'conv_reference_valid' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
} }
else else
{ {
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_valid! (%s)", PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_valid! (%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
return -1; return -1;
} }
} }
assert (work_complete); if (!work_complete)
{
PyErr_Format(PyExc_RuntimeError,
"ERROR: no implementation(s) worked for"
" CudaNdarray_conv_valid!"
" Version asked(%d) (-1 mean use an heuristic)",
version);
return -1;
}
return 0; return 0;
//PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
//return -1;
} }
int int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0) CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows,
int subsample_cols, int version = -1, int verbose=0)
{ {
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file. //144 is the biggest static shared size used with compiling this file.
const int shared_avail = SHARED_SIZE - 150;
int work_complete = 0; int work_complete = 0;
if (img->nd != 4) if (img->nd != 4)
...@@ -775,9 +931,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -775,9 +931,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
//const int out_size_byte = out_size*sizeof(float); // unused //const int out_size_byte = out_size*sizeof(float); // unused
if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){ if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) ||
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for" (THEANO_KERN_WID == 0))){
" %d kernel columns, but the kernel we received had %d columns!", PyErr_Format(PyExc_ValueError,
"ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received"
" had %d columns!",
THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]); THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
return -1; return -1;
} }
...@@ -793,9 +952,11 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -793,9 +952,11 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy //if the lower 2 dims are c_contiguous but flipped, unflipping the
//stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster) //allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it. //the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true; bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d; bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata; float * kern_data_unflipped = kern->devdata;
...@@ -812,13 +973,22 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -812,13 +973,22 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
if (verbose>1) if (verbose>1)
{ {
printf("INFO: Running conv_full version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID); printf("INFO: Running conv_full version=%d,"
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n", printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n", printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]); CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
} }
if (!subsample && if (!subsample &&
...@@ -840,13 +1010,16 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -840,13 +1010,16 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
} }
if(img_size_padded_byte+kern_size_byte>shared_avail) version=5; if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple
//of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory //Max of 16k of shared memory
if(version==5) if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++; while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register //327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per threads, but their is some other stuff that for the limit to bu lower then 512. //version 5 will have only 1 block running at a time, so we
//can use 32 registers per threads, but their is some other stuff that
//for the limit to bu lower then 512.
int max_thread = (version!=5?327:450); int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4; if(version==-1 && out_size>512)version=4;
...@@ -855,7 +1028,8 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -855,7 +1028,8 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
if(version==-1 && nb_split>1) version=4; if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3; else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more than 1 split as to be always execute. //force version 4 when more than 1 split are needed to always execute.
else if(version==3 && nb_split!=1) version=4;
assert(version!=3 || nb_split==1); assert(version!=3 || nb_split==1);
assert(version!=5 || kern_len>1); assert(version!=5 || kern_len>1);
...@@ -901,15 +1075,39 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -901,15 +1075,39 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version); if (verbose>1)
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false")); fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_full_patch_stack_padded'"
" nb_split=%d low_mem=%s\n",
nb_split, (version==5?"true":"false"));
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n", fprintf(stderr,
version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"), "threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch_stack_padded' %s %s"
" failed (%s), trying next implementation\n",
version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -943,8 +1141,16 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -943,8 +1141,16 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n", fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -993,8 +1199,15 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -993,8 +1199,15 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_load_everything' failed (%s), trying next implementation\n", fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -1034,13 +1247,20 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -1034,13 +1247,20 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n", fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -1050,50 +1270,98 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -1050,50 +1270,98 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK); int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (0) if (0)
{ {
if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n"); if (verbose)
if (verbose) fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, "INFO: launching conv_reference_valid\n");
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3], if (verbose)
fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0],
CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2],
CudaNdarray_HOST_DIMS(img)[3],
img->devdata, img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_STRIDES(img)[0],
if (verbose) fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n", CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose)
fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0],
CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2],
CudaNdarray_HOST_DIMS(kern)[3],
kern->devdata, kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3] CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]
); );
if (verbose) fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n", if (verbose)
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3], fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0],
CudaNdarray_HOST_DIMS(out)[1],
CudaNdarray_HOST_DIMS(out)[2],
CudaNdarray_HOST_DIMS(out)[3],
out->devdata, out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]); CudaNdarray_HOST_STRIDES(out)[0],
if (verbose) fprintf(stderr, " launch params: %i %i %i\n", outsize, n_blocks, n_threads); CudaNdarray_HOST_STRIDES(out)[1],
if (verbose) fprintf(stderr, " subsample params: %i %i\n", subsample_rows, subsample_cols); CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
if (verbose)
fprintf(stderr, " subsample params: %i %i\n",
subsample_rows, subsample_cols);
} }
conv_reference_full<<<n_blocks, n_threads>>>(CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(img)[1], conv_reference_full<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3], img->devdata, CudaNdarray_HOST_STRIDES(img)[0],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3], CudaNdarray_HOST_STRIDES(img)[1],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_reference_full' version ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d nkern=%d nstack=%d subsample=%d\n", if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_full' version"
" ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
" nkern=%d nstack=%d subsample=%d\n",
img_len,img_wid, kern_len, kern_wid, img_len,img_wid, kern_len, kern_wid,
out_len, out_wid, nbatch, nkern, nstack, subsample); out_len, out_wid, nbatch, nkern, nstack, subsample);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n", fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads, 1, n_blocks, 1, 0, n_threads);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_full! (%s)", PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
return -1; return -1;
} }
...@@ -1110,8 +1378,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1110,8 +1378,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified. // Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1. // If the out object is re-used then it is returned, and its refcount is incremented by 1.
// //
if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;} if (img->nd != 4)
if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;} {
PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required");
return NULL;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required");
return NULL;
}
int out_dim[4]; int out_dim[4];
out_dim[0] = CudaNdarray_HOST_DIMS(img)[0]; out_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
...@@ -1145,7 +1421,10 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1145,7 +1421,10 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
} }
else else
{ {
if (out && verbose) fprintf(stderr, "INFO: Conv is ignoring 'out' argument with wrong structure.\n"); if (out && verbose)
fprintf(stderr,
"INFO: Conv is ignoring 'out' argument with wrong"
" structure.\n");
rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim); rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
//rval might be null //rval might be null
} }
...@@ -1162,3 +1441,13 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1162,3 +1441,13 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
return (PyObject*)rval; return (PyObject*)rval;
} }
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
...@@ -442,3 +442,13 @@ conv_full_load_everything( float* img, float* kern, float* out, ...@@ -442,3 +442,13 @@ conv_full_load_everything( float* img, float* kern, float* out,
__syncthreads(); //don't start loading another kernel until we're done here __syncthreads(); //don't start loading another kernel until we're done here
} }
} }
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
...@@ -1030,3 +1030,13 @@ conv_reference_full(int nB, int nK, int stacklen, ...@@ -1030,3 +1030,13 @@ conv_reference_full(int nB, int nK, int stacklen,
} }
#endif // #ifndef CONV_KERNEL_CU #endif // #ifndef CONV_KERNEL_CU
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
...@@ -4132,7 +4132,6 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) ...@@ -4132,7 +4132,6 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
mode:c++ mode:c++
c-basic-offset:4 c-basic-offset:4
c-file-style:"stroustrup" c-file-style:"stroustrup"
c-file-offsets:((innamespace . 0)(inline-open . 0))
indent-tabs-mode:nil indent-tabs-mode:nil
fill-column:79 fill-column:79
End: End:
......
...@@ -347,7 +347,6 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self); ...@@ -347,7 +347,6 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
mode:c++ mode:c++
c-basic-offset:4 c-basic-offset:4
c-file-style:"stroustrup" c-file-style:"stroustrup"
c-file-offsets:((innamespace . 0)(inline-open . 0))
indent-tabs-mode:nil indent-tabs-mode:nil
fill-column:79 fill-column:79
End: End:
......
...@@ -24,12 +24,13 @@ if cuda_ndarray.cuda_available == False: ...@@ -24,12 +24,13 @@ if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
#needed as the gpu conv don't have a perform implementation. #needed as the gpu conv don't have a perform implementation.
if theano.config.mode=='FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu') theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else: else:
theano_mode = theano.compile.mode.get_default_mode().including('gpu') theano_mode = theano.compile.mode.get_default_mode().including('gpu')
cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False]*4) cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
def py_conv_valid_numpy(img, kern): def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1] assert img.shape[1] == kern.shape[1]
...@@ -42,19 +43,27 @@ def py_conv_valid_numpy(img, kern): ...@@ -42,19 +43,27 @@ def py_conv_valid_numpy(img, kern):
for rr in xrange(out.shape[2]): for rr in xrange(out.shape[2]):
for cc in xrange(out.shape[3]): for cc in xrange(out.shape[3]):
#rr, cc is the upper-left corner of img patches #rr, cc is the upper-left corner of img patches
imgpatch = img[b,:,rr:rr+kern.shape[2], cc:cc+kern.shape[3]] imgpatch = img[b, :, rr:rr + kern.shape[2],
cc:cc + kern.shape[3]]
#print img.shape, kern.shape, imgpatch.shape, rr+kern.shape[2]-1, rr-1, -1 #print img.shape, kern.shape, imgpatch.shape, rr+kern.shape[2]-1, rr-1, -1
innerprod = (imgpatch[:,::-1,::-1] * kern[k,:,:,:]).sum() innerprod = (imgpatch[:, ::-1, ::-1] *
kern[k, :, :, :]).sum()
out[b, k, rr, cc] = innerprod out[b, k, rr, cc] = innerprod
return out return out
def py_conv_full_numpy(img, kern): def py_conv_full_numpy(img, kern):
# manually pad the img with zeros all around, and then run it through py_conv_valid # manually pad the img with zeros all around, and then run it
pad_rows = 2*(kern.shape[2]-1) + img.shape[2] # through py_conv_valid
pad_cols = 2*(kern.shape[3]-1) + img.shape[3] pad_rows = 2 * (kern.shape[2] - 1) + img.shape[2]
padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols), dtype=img.dtype) pad_cols = 2 * (kern.shape[3] - 1) + img.shape[3]
padded_img[:,:,kern.shape[2]-1:kern.shape[2]-1+img.shape[2],kern.shape[3]-1:kern.shape[3]-1+img.shape[3]] = img padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols),
dtype=img.dtype)
padded_img[:, :, kern.shape[2] - 1: kern.shape[2] - 1 + img.shape[2],
kern.shape[3] - 1: kern.shape[3] - 1 + img.shape[3]] = img
return py_conv_valid_numpy(padded_img, kern) return py_conv_valid_numpy(padded_img, kern)
def py_conv(img, kern, mode, subsample): def py_conv(img, kern, mode, subsample):
""" """
use a scipy or numpy implementation depending is scipy is available. use a scipy or numpy implementation depending is scipy is available.
...@@ -62,13 +71,16 @@ def py_conv(img, kern, mode, subsample): ...@@ -62,13 +71,16 @@ def py_conv(img, kern, mode, subsample):
""" """
if imported_scipy_convolve2d: if imported_scipy_convolve2d:
return py_conv_scipy(img, kern, mode, subsample) return py_conv_scipy(img, kern, mode, subsample)
elif mode=='valid': elif mode == 'valid':
return py_conv_valid_numpy(img,kern)[:,:,::subsample[0],::subsample[1]] return py_conv_valid_numpy(img, kern)[:, :, ::subsample[0],
elif mode=='full': ::subsample[1]]
return py_conv_full_numpy(img,kern)[:,:,::subsample[0],::subsample[1]] elif mode == 'full':
return py_conv_full_numpy(img, kern)[:, :, ::subsample[0],
::subsample[1]]
else: else:
raise Exception("Can't execute this kernel.") raise Exception("Can't execute this kernel.")
def py_conv_scipy(img, kern, mode, subsample): def py_conv_scipy(img, kern, mode, subsample):
assert img.shape[1] == kern.shape[1] assert img.shape[1] == kern.shape[1]
if mode == 'valid': if mode == 'valid':
...@@ -83,17 +95,20 @@ def py_conv_scipy(img, kern, mode, subsample): ...@@ -83,17 +95,20 @@ def py_conv_scipy(img, kern, mode, subsample):
for b in xrange(out.shape[0]): for b in xrange(out.shape[0]):
for k in xrange(out.shape[1]): for k in xrange(out.shape[1]):
for s in xrange(img.shape[1]): for s in xrange(img.shape[1]):
out[b,k,:,:] += convolve2d(img[b,s,:,:] out[b, k, :, :] += convolve2d(img[b, s, :, :],
, kern[k,s,:,:] kern[k, s, :, :],
, mode) mode)
return out[:,:,::subsample[0], ::subsample[1]] return out[:, :, ::subsample[0], ::subsample[1]]
def _params_allgood_header(): def _params_allgood_header():
print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup" print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
kern_stride=(1,1), version=-1, verbose=0, random=True, print_=None, def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
id=None, rtol=1e-5, atol = 1e-8, nb_iter=0, ones=False, compile_kshp=None): kern_stride=(1, 1), version=-1, verbose=0, random=True,
print_=None, id=None, rtol=1e-5, atol=1e-8,
nb_iter=0, ones=False, compile_kshp=None):
# #
# This function is the core of several of the big unit-test drivers, # This function is the core of several of the big unit-test drivers,
# but it can also be used very directly on its own to test a specific # but it can also be used very directly on its own to test a specific
...@@ -111,22 +126,27 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ...@@ -111,22 +126,27 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
npy_img = theano._asarray(numpy.ones(ishape), dtype='float32') npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32') npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
elif random: elif random:
npy_img = theano._asarray(numpy.random.rand(*ishape)+1, dtype='float32') npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
npy_kern = theano._asarray(numpy.random.rand(*kshape)-2, dtype='float32') dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
dtype='float32')
else: else:
npy_img = theano._asarray(numpy.arange(numpy.prod(ishape)).reshape(ishape), dtype='float32')+1 npy_img = theano._asarray(numpy.arange(
npy_kern = -(theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape), dtype='float32')+1) numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
npy_kern = -(theano._asarray(numpy.arange(
numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
img = cuda_ndarray.CudaNdarray(npy_img) img = cuda_ndarray.CudaNdarray(npy_img)
kern = cuda_ndarray.CudaNdarray(npy_kern) kern = cuda_ndarray.CudaNdarray(npy_kern)
#we take the stride after the transfert as we make c_contiguous data on the GPU. #we take the stride after the transfert as we make c_contiguous
if img_stride!=(1,1): #data on the GPU.
img=img[:,:,::img_stride[0],::img_stride[1]] if img_stride != (1, 1):
npy_img = npy_img[:,:,::img_stride[0],::img_stride[1]] img = img[:, :, ::img_stride[0], ::img_stride[1]]
if kern_stride!=(1,1): npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
kern=kern[:,:,::kern_stride[0],::kern_stride[1]] if kern_stride != (1, 1):
npy_kern = npy_kern[:,:,::kern_stride[0],::kern_stride[1]] kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]
t2 = None t2 = None
rval = True rval = True
...@@ -139,20 +159,23 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ...@@ -139,20 +159,23 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode, op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
subsample=subsample, subsample=subsample,
version=version, version=version,
verbose=verbose, kshp=compile_kshp)(i,k) verbose=verbose,
f=theano.function([i,k],op, mode=theano_mode) kshp=compile_kshp)(i, k)
gpuval = f(img,kern) f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(img, kern)
t2 = time.time() t2 = time.time()
for i in range(nb_iter): for i in range(nb_iter):
gpuval2 = f(img,kern) gpuval2 = f(img, kern)
assert numpy.allclose(numpy.asarray(gpuval),numpy.asarray(gpuval2)) assert numpy.allclose(numpy.asarray(gpuval),
assert (numpy.asarray(gpuval)==numpy.asarray(gpuval2)).all() numpy.asarray(gpuval2))
assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
gpuval = numpy.asarray(gpuval) gpuval = numpy.asarray(gpuval)
if gpuval.shape != cpuval.shape: if gpuval.shape != cpuval.shape:
print >> sys.stdout, "ERROR: shape mismatch", gpuval.shape, cpuval.shape print >> sys.stdout, "ERROR: shape mismatch",
print >> sys.stdout, gpuval.shape, cpuval.shape
rval = False rval = False
if rval: if rval:
rval = numpy.allclose(cpuval, gpuval, rtol = rtol) rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
assert numpy.all(numpy.isfinite(gpuval)) assert numpy.all(numpy.isfinite(gpuval))
except NotImplementedError, e: except NotImplementedError, e:
print >> sys.stdout, '_params_allgood Failed allclose', e print >> sys.stdout, '_params_allgood Failed allclose', e
...@@ -164,49 +187,52 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ...@@ -164,49 +187,52 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
else: else:
approx_fp = ishape[0] * kshape[0] * kshape[1] * kshape[2] * kshape[3] * ishape[2] * ishape[3] * 2 approx_fp = ishape[0] * kshape[0] * kshape[1] * kshape[2] * kshape[3] * ishape[2] * ishape[3] * 2
approx_fp /= 1e6 approx_fp /= 1e6
cpu_mflops = approx_fp / (t1-t0) cpu_mflops = approx_fp / (t1 - t0)
gpu_mflops = approx_fp / (t2-t1) gpu_mflops = approx_fp / (t2 - t1)
if verbose>0: if verbose > 0:
print >> sys.stdout, '%15s'% str(ishape), '%15s'% str(kshape), print >> sys.stdout, '%15s' % str(ishape), '%15s' % str(kshape),
print >> sys.stdout, '%12.5f %7.2f %7.2f %7.1f' % (approx_fp, print >> sys.stdout, '%12.5f %7.2f %7.2f %7.1f' % (approx_fp,
cpu_mflops, gpu_mflops,(t1-t0)/(t2-t1)) cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1))
if not rval: if not rval:
print >> sys.stdout, 'test_'+mode+' id='+str(id)+' FAILED for ishape, kshape, mode, subsample, img_stride, kern_stride, version', ishape, kshape, mode, subsample, img_stride, kern_stride, version print >> sys.stdout, 'test_'+mode+' id='+str(id)+' FAILED for ishape, kshape, mode, subsample, img_stride, kern_stride, version', ishape, kshape, mode, subsample, img_stride, kern_stride, version
diff=cpuval-gpuval diff = cpuval - gpuval
diffabs=numpy.absolute(diff) diffabs = numpy.absolute(diff)
pr_diff=diffabs/numpy.absolute(cpuval) pr_diff = diffabs / numpy.absolute(cpuval)
nb_close=(diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum() nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs) print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs)
print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size
print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff) print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff)
if not rval and print_!=False: if not rval and print_ != False:
if npy_img.shape[0]>5: if npy_img.shape[0] > 5:
print "img",npy_img[0] print "img", npy_img[0]
print "kern",npy_kern[0] print "kern", npy_kern[0]
print "gpu",gpuval[0][0] print "gpu", gpuval[0][0]
print "cpu",cpuval[0][0] print "cpu", cpuval[0][0]
print "diff",diff[0][0] print "diff", diff[0][0]
else: else:
print "img",npy_img print "img", npy_img
print "kern",npy_kern print "kern", npy_kern
print "gpu",gpuval print "gpu", gpuval
print "cpu",cpuval print "cpu", cpuval
print "diff",diff print "diff", diff
return rval return rval
def exec_conv(version, shapes, verbose, random, mode, def exec_conv(version, shapes, verbose, random, mode,
print_=None, rtol=1e-5, ones=False): print_=None, rtol=1e-5, ones=False):
if verbose>0: if verbose > 0:
_params_allgood_header() _params_allgood_header()
nb_failed = 0 nb_failed = 0
nb_tests = 0 nb_tests = 0
failed_version=set() failed_version = set()
failed_id=[] failed_id = []
for ver in version:# I put -1 in case we forget to add version in the test to. # I put -1 in case we forget to add version in the test to.
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for ver in version:
ret=False for id, (ishape, kshape, subshape,
istride, kstride) in enumerate(shapes):
ret = False
try: try:
ret = _params_allgood(ishape, ret = _params_allgood(ishape,
kshape, kshape,
...@@ -222,19 +248,21 @@ def exec_conv(version, shapes, verbose, random, mode, ...@@ -222,19 +248,21 @@ def exec_conv(version, shapes, verbose, random, mode,
rtol=rtol, rtol=rtol,
ones=ones) ones=ones)
except Exception, e: except Exception, e:
print ver, id,(ishape, kshape, subshape, istride, kstride) print ver, id, (ishape, kshape, subshape, istride, kstride)
print e print e
pass pass
if not ret: if not ret:
failed_version.add(ver) failed_version.add(ver)
failed_id.append(id) failed_id.append(id)
nb_failed+=1 nb_failed += 1
nb_tests+=1 nb_tests += 1
if nb_failed>0: if nb_failed > 0:
print "nb_failed",nb_failed,"on",nb_tests, "failed_version",failed_version, "failed_id",failed_id print "nb_failed", nb_failed, "on", nb_tests,
assert nb_failed==0, nb_failed print "failed_version", failed_version, "failed_id", failed_id
assert nb_failed == 0, nb_failed
else: else:
print 'Executed',nb_tests,'different shapes' print 'Executed', nb_tests, 'different shapes'
def get_basic_shapes(): def get_basic_shapes():
return [ return [
...@@ -249,8 +277,12 @@ def get_basic_shapes(): ...@@ -249,8 +277,12 @@ def get_basic_shapes():
, ((1, 1, 4, 4), (1, 1, 3, 2), (1,1), (1,1), (1,1)) , ((1, 1, 4, 4), (1, 1, 3, 2), (1,1), (1,1), (1,1))
, ((1, 1, 4, 4), (1, 1, 2, 3), (1,1), (1,1), (1,1))] , ((1, 1, 4, 4), (1, 1, 2, 3), (1,1), (1,1), (1,1))]
def get_shapes(imshp=(1,1), kshp=(1,1), subsample=(1,1), img_stride=(1,1), kern_stride=(1,1)):
""" all possible case if we one or more of stack size, batch size, nkern. We use the gived image shape, kernel shape and subsmaple shape.""" def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
img_stride=(1, 1), kern_stride=(1, 1)):
""" all possible case if we one or more of stack size, batch size,
nkern. We use the gived image shape, kernel shape and subsmaple
shape."""
return [ ((1, 2)+imshp, (1, 2)+kshp,subsample, img_stride, kern_stride)#stack only return [ ((1, 2)+imshp, (1, 2)+kshp,subsample, img_stride, kern_stride)#stack only
, ((3, 1)+imshp, (1, 1)+kshp,subsample, img_stride, kern_stride)#batch only , ((3, 1)+imshp, (1, 1)+kshp,subsample, img_stride, kern_stride)#batch only
, ((1, 1)+imshp, (2, 1)+kshp,subsample, img_stride, kern_stride)#nkern only , ((1, 1)+imshp, (2, 1)+kshp,subsample, img_stride, kern_stride)#nkern only
...@@ -260,7 +292,10 @@ def get_shapes(imshp=(1,1), kshp=(1,1), subsample=(1,1), img_stride=(1,1), kern_ ...@@ -260,7 +292,10 @@ def get_shapes(imshp=(1,1), kshp=(1,1), subsample=(1,1), img_stride=(1,1), kern_
, ((2, 2)+imshp, (2, 2)+kshp,subsample, img_stride, kern_stride)#batch, nkern and stack , ((2, 2)+imshp, (2, 2)+kshp,subsample, img_stride, kern_stride)#batch, nkern and stack
, ((3, 2)+imshp, (4, 2)+kshp,subsample, img_stride, kern_stride)#batch, nkern and stack , ((3, 2)+imshp, (4, 2)+kshp,subsample, img_stride, kern_stride)#batch, nkern and stack
] ]
def get_shapes2(scales_img=(1,1), scales_kern=(1,1), subsample=(1,1), img_stride=(1,1), kern_stride=(1,1)):
def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
img_stride=(1, 1), kern_stride=(1, 1)):
#basic test of stack, batch and nkern paramter #basic test of stack, batch and nkern paramter
shapes =get_shapes((1*scales_img[0],1*scales_img[1]), shapes =get_shapes((1*scales_img[0],1*scales_img[1]),
(1*scales_kern[0],1*scales_kern[1]),subsample, img_stride, kern_stride) (1*scales_kern[0],1*scales_kern[1]),subsample, img_stride, kern_stride)
...@@ -284,19 +319,20 @@ def get_shapes2(scales_img=(1,1), scales_kern=(1,1), subsample=(1,1), img_stride ...@@ -284,19 +319,20 @@ def get_shapes2(scales_img=(1,1), scales_kern=(1,1), subsample=(1,1), img_stride
(2*scales_kern[0],3*scales_kern[1]),subsample, img_stride, kern_stride) (2*scales_kern[0],3*scales_kern[1]),subsample, img_stride, kern_stride)
return shapes return shapes
def get_valid_shapes(): def get_valid_shapes():
# img shape, kern shape, subsample shape # img shape, kern shape, subsample shape
shapes = get_basic_shapes() shapes = get_basic_shapes()
shapes +=get_shapes2() shapes += get_shapes2()
#test image stride #test image stride
shapes += get_shapes2(scales_img=(2,2),img_stride=(1,2)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(1, 2))
shapes += get_shapes2(scales_img=(2,2),img_stride=(2,1)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 1))
shapes += get_shapes2(scales_img=(2,2),img_stride=(2,2)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(-1, -1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
#test subsample done in a separate fct #test subsample done in a separate fct
...@@ -333,161 +369,192 @@ def get_valid_shapes(): ...@@ -333,161 +369,192 @@ def get_valid_shapes():
] ]
return shapes return shapes
def test_valid_0_2():
    """Test 'valid'-mode convolution with GPU kernel versions 0 and 2.

    Keeps only the shapes those versions can handle: small output width,
    a single input channel, data small enough for the per-block buffer
    (presumably the 16KB GPU shared-memory limit minus overhead -- TODO
    confirm against the kernel source), and no subsampling.
    """
    shapes = get_valid_shapes()
    version = [0, 2]
    verbose = 0
    random = True
    print_ = False
    ones = False
    if ones:
        random = False

    shapes2 = []
    # Do not shadow the builtin `id`: the enumerate index was unused.
    for ishape, kshape, subshape, istride, kstride in shapes:
        # Output shape of a 'valid' convolution: batch, nkern, then
        # image spatial dims minus kernel spatial dims plus one.
        oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
                                                  numpy.asarray(kshape[2:]) +
                                                  numpy.asarray([1, 1]))
        if oshape[3] > 512:
            continue
        if ishape[1] > 1:
            continue
        if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
            (16 * 1024 - 150)):
            continue
        if subshape == (1, 1):
            shapes2.append((ishape, kshape, subshape, istride, kstride))
    shapes = shapes2

    exec_conv(version, shapes, verbose, random, 'valid',
              print_=print_, ones=ones, rtol=1.1e-5)
def test_valid_1_3_11_12():
    """Test 'valid'-mode convolution with GPU kernel versions 1, 3, 11, 12.

    Filters to shapes these versions support: small output width, data
    fitting the per-block buffer (presumably 16KB shared memory minus
    overhead -- TODO confirm), and no subsampling.  Unlike versions 0/2,
    multiple input channels are allowed.
    """
    shapes = get_valid_shapes()
    version = [1, 3, 11, 12]
    verbose = 0
    random = True
    print_ = False
    ones = False
    if ones:
        random = False

    shapes2 = []
    # Do not shadow the builtin `id`: the enumerate index was unused.
    for ishape, kshape, subshape, istride, kstride in shapes:
        # 'valid' output shape: image spatial dims - kernel dims + 1.
        oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
                                                  numpy.asarray(kshape[2:]) +
                                                  numpy.asarray([1, 1]))
        if oshape[3] > 512:
            continue
        if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
            (16 * 1024 - 150)):
            continue
        if subshape == (1, 1):
            shapes2.append((ishape, kshape, subshape, istride, kstride))
    shapes = shapes2

    exec_conv(version, shapes, verbose, random, 'valid',
              print_=print_, ones=ones, rtol=1.1e-5)
def test_valid_4():
    """Test 'valid'-mode convolution with GPU kernel version 4.

    Keeps shapes with small output width, a single input channel,
    and one image row plus one kernel fitting the per-block buffer
    (presumably 16KB shared memory minus overhead -- TODO confirm),
    with no subsampling.
    """
    shapes = get_valid_shapes()
    version = [4]
    verbose = 0
    random = True
    print_ = False
    ones = False
    if ones:
        random = False

    shapes2 = []
    # Do not shadow the builtin `id`: the enumerate index was unused.
    for ishape, kshape, subshape, istride, kstride in shapes:
        # 'valid' output shape: image spatial dims - kernel dims + 1.
        oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
                                                  numpy.asarray(kshape[2:]) +
                                                  numpy.asarray([1, 1]))
        if oshape[3] > 512:
            continue
        if ishape[1] > 1:
            continue
        # kernel-rows * image-width floats + the whole kernel, 4 bytes each.
        if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
            (16 * 1024 - 150)):
            continue
        if subshape == (1, 1):
            shapes2.append((ishape, kshape, subshape, istride, kstride))
    shapes = shapes2

    exec_conv(version, shapes, verbose, random, 'valid',
              print_=print_, ones=ones, rtol=1.1e-5)
def test_valid_5(): def test_valid_5():
shapes = get_valid_shapes() shapes = get_valid_shapes()
version=[5] version = [5]
verbose=0 verbose = 0
random = True random = True
print_ = False print_ = False
ones = False ones = False
if ones: if ones:
random = False random = False
shapes2=[] shapes2 = []
print len(shapes) print len(shapes)
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1])) oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
if oshape[3]> 512: numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
continue continue
if (kshape[2]*ishape[3]*4+numpy.prod(kshape[2:])*4)>(16*1024-150): if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
(16 * 1024 - 150)):
continue continue
if subshape==(1,1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
print len(shapes2) print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid_7_8_13(): def test_valid_7_8_13():
shapes = get_valid_shapes() shapes = get_valid_shapes()
# This is to test the "new" lower shared memory usage. # This is to test the "new" lower shared memory usage.
shapes.append(((10,30,60,60),(20,30,40,40), (1,1), (1,1), (1,1))) shapes.append(((10, 30, 60, 60), (20, 30, 40, 40),
version=[7,8,13] (1, 1), (1, 1), (1, 1)))
verbose=0 version = [7, 8, 13]
verbose = 0
random = True random = True
print_ = False print_ = False
ones = False ones = False
if ones: if ones:
random = False random = False
shapes2=[] shapes2 = []
print len(shapes) print len(shapes)
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1])) oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
if oshape[2]*oshape[3]>512: numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[2] * oshape[3] > 512:
continue continue
if max(numpy.prod(ishape[2:])*4+2*kshape[3]*4, oshape[2]*oshape[3]*4*2)>(16*1024-150): if max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4,
oshape[2] * oshape[3] * 4 * 2) > (16 * 1024 - 150):
continue continue
if subshape==(1,1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
print len(shapes2) print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid_9_10(): def test_valid_9_10():
shapes = get_valid_shapes() shapes = get_valid_shapes()
version=[9,10] version = [9, 10]
verbose=0 verbose = 0
random = True random = True
print_ = False print_ = False
ones = False ones = False
if ones: if ones:
random = False random = False
shapes2=[] shapes2 = []
print len(shapes) print len(shapes)
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1])) oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
if oshape[3]> 512: numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
continue continue
if (kshape[3]*4+ishape[3])>(16*1024-150): if (kshape[3] * 4 + ishape[3]) > (16 * 1024 - 150):
continue continue
if subshape==(1,1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
print len(shapes2) print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid(): def test_valid():
shapes = get_valid_shapes() shapes = get_valid_shapes()
...@@ -495,8 +562,8 @@ def test_valid(): ...@@ -495,8 +562,8 @@ def test_valid():
#shapes=shapes[400:426] #shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to. # I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version. # I put -2 to test the reference version.
version=[-2,-1,6] version = [-2, -1, 6]
verbose=0 verbose = 0
# version=[1] # version=[1]
random = True random = True
...@@ -505,17 +572,19 @@ def test_valid(): ...@@ -505,17 +572,19 @@ def test_valid():
if ones: if ones:
random = False random = False
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_full(): def test_full():
shapes = get_basic_shapes() shapes = get_basic_shapes()
shapes +=get_shapes2() shapes += get_shapes2()
#test image stride #test image stride
shapes += get_shapes2(scales_img=(2,2),img_stride=(1,2)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(1, 2))
shapes += get_shapes2(scales_img=(2,2),img_stride=(2,1)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 1))
shapes += get_shapes2(scales_img=(2,2),img_stride=(2,2)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(-1, -1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
#test subsample done in a separate fct #test subsample done in a separate fct
...@@ -557,13 +626,14 @@ def test_full(): ...@@ -557,13 +626,14 @@ def test_full():
] ]
# shapes=shapes[:277] # shapes=shapes[:277]
version=[-2,-1,0,1,2,3,4,5] version = [-2, -1, 0, 1, 2, 3, 4, 5]
verbose=0 verbose = 0
# version=[4] # version=[4]
random=True random = True
exec_conv(version, shapes, verbose, random, 'full') exec_conv(version, shapes, verbose, random, 'full')
def test_subsample(): def test_subsample():
# implement when # implement when
shapes = [ shapes = [
...@@ -573,14 +643,14 @@ def test_subsample(): ...@@ -573,14 +643,14 @@ def test_subsample():
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1,1), (1,1)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1,1), (1,1)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1,1), (1,1))
] ]
shapes += get_shapes2(scales_img=(2,2),subsample=(1,1)) shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 1))
shapes += get_shapes2(scales_img=(2,2),subsample=(1,2)) shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 2))
shapes += get_shapes2(scales_img=(2,2),subsample=(2,1)) shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 1))
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2)) shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 2))
#We put only the version that implement the subsample to make the test faster. #We put only the version that implement the subsample to make the test faster.
version_valid = [-2,-1,1,3,11,12] version_valid = [-2, -1, 1, 3, 11, 12]
version_full = [-2,-1] version_full = [-2, -1]
verbose = 0 verbose = 0
random = True random = True
print_ = False print_ = False
...@@ -588,8 +658,10 @@ def test_subsample(): ...@@ -588,8 +658,10 @@ def test_subsample():
if ones: if ones:
random = False random = False
exec_conv(version_valid, shapes, verbose, random, 'valid', print_=print_, ones=ones) exec_conv(version_valid, shapes, verbose, random, 'valid',
exec_conv(version_full, shapes, verbose, random, 'full', print_=print_, ones=ones) print_=print_, ones=ones)
exec_conv(version_full, shapes, verbose, random, 'full',
print_=print_, ones=ones)
## See #616 ## See #616
#def test_logical_shapes(): #def test_logical_shapes():
...@@ -614,7 +686,8 @@ class TestConv2DGPU(unittest.TestCase): ...@@ -614,7 +686,8 @@ class TestConv2DGPU(unittest.TestCase):
theano_mode_orig = theano_mode theano_mode_orig = theano_mode
try: try:
if theano.config.mode in ['DebugMode', 'DEBUG_MODE']: if theano.config.mode in ['DebugMode', 'DEBUG_MODE']:
theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu') theano_mode = theano.compile.mode.get_mode(
'FAST_RUN').including('gpu')
for mode in ['valid', 'full']: for mode in ['valid', 'full']:
for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)), for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)), ((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
...@@ -622,16 +695,21 @@ class TestConv2DGPU(unittest.TestCase): ...@@ -622,16 +695,21 @@ class TestConv2DGPU(unittest.TestCase):
# We use only the number of columns. # We use only the number of columns.
]: ]:
self.assertRaises(ValueError, _params_allgood, shapes[0], shapes[1], self.assertRaises(ValueError, _params_allgood,
verbose=verbose, random=random, mode=mode, shapes[0], shapes[1],
print_=print_, ones=ones, compile_kshp=shapes[2]) verbose=verbose, random=random,
mode=mode,
print_=print_, ones=ones,
compile_kshp=shapes[2])
finally: finally:
theano_mode = theano_mode_orig theano_mode = theano_mode_orig
def _test_dummy(): def _test_dummy():
ishape = (1, 1, 5, 5) ishape = (1, 1, 5, 5)
kshape = (1, 1, 3, 3) kshape = (1, 1, 3, 3)
mode = 'valid' mode = 'valid'
subsample = (1,1) subsample = (1, 1)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
...@@ -696,14 +774,14 @@ def benchmark(): ...@@ -696,14 +774,14 @@ def benchmark():
,((2, 30,116,116), (20, 30, 9,9), (1,1), (1,1), (1,1))#full conv_reference_full ,((2, 30,116,116), (20, 30, 9,9), (1,1), (1,1), (1,1))#full conv_reference_full
] ]
# shapes_valid=shapes_valid[-1:] # shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:] # shapes_full=shapes_full[-1:]
version=[-1] version = [-1]
verbose=1 verbose = 1
random=True random = True
exec_conv(version, shapes_valid, verbose, random, 'valid', print_=None, rtol=1e-3) exec_conv(version, shapes_valid, verbose, random, 'valid',
print_=None, rtol=1e-3)
exec_conv(version, shapes_full, verbose, random, 'full') exec_conv(version, shapes_full, verbose, random, 'full')
...@@ -719,5 +797,3 @@ def test_stack_rows_segfault_070312(): ...@@ -719,5 +797,3 @@ def test_stack_rows_segfault_070312():
nkern=1, bsize=1) nkern=1, bsize=1)
f = theano.function([], [], updates={out: op(img, kern)}) f = theano.function([], [], updates={out: op(img, kern)})
f() f()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论