提交 42927b2b authored 作者: James Bergstra's avatar James Bergstra

merge

...@@ -453,7 +453,6 @@ class GpuSum(Op): ...@@ -453,7 +453,6 @@ class GpuSum(Op):
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output"); PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
%(fail)s; %(fail)s;
} }
} }
""" %locals() """ %locals()
...@@ -472,12 +471,10 @@ class GpuSum(Op): ...@@ -472,12 +471,10 @@ class GpuSum(Op):
#TODO: check if we are ccontiguous when we un-dimshuffle #TODO: check if we are ccontiguous when we un-dimshuffle
#TODO: if only some dims are ccontiguous, call version with less dims. #TODO: if only some dims are ccontiguous, call version with less dims.
print >> sio, 'if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals() print >> sio, 'if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals()
self.c_code_reduce_ccontig(sio, node, name, x, z, fail) self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print >> sio, "}else{" print >> sio, "}else{"
getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
print >> sio, "}" print >> sio, "}"
else: else:
getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
...@@ -826,8 +823,16 @@ class GpuSum(Op): ...@@ -826,8 +823,16 @@ class GpuSum(Op):
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1,CudaNdarray_HOST_DIMS(%(x)s)[1]); dim3 n_blocks(1,
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n"); std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_BLOCKS));
if (verbose) {
fprintf(stderr,
"running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x,
n_blocks.y);
}
assert( CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]);
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1, 1,
...@@ -1175,9 +1180,7 @@ class GpuSum(Op): ...@@ -1175,9 +1180,7 @@ class GpuSum(Op):
""" %locals() """ %locals()
def c_code_cache_version(self): def c_code_cache_version(self):
#return () return (20,)
return (19,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
sio = StringIO.StringIO() sio = StringIO.StringIO()
......
...@@ -363,9 +363,10 @@ class GpuConv(Op): ...@@ -363,9 +363,10 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,8) return (0,9) # raise this whenever modifying any of the support_code_files
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\ return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\ open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read() open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read()
...@@ -405,8 +406,7 @@ class GpuConv(Op): ...@@ -405,8 +406,7 @@ class GpuConv(Op):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s, CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s,
mode, dx, dy, version, verbose); mode, dx, dy, version, verbose);
if(%(out)s && %(out)s==out2) Py_XDECREF(%(out)s);
Py_DECREF(out2);//CudaNdarray_Conv incremented the count to out
%(out)s = out2; %(out)s = out2;
"""%sub """%sub
......
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
enum { ConvMode_FULL, ConvMode_VALID }; enum { ConvMode_FULL, ConvMode_VALID };
PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose); PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
...@@ -99,11 +101,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -99,11 +101,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose>1) if (verbose>1)
{ {
printf("INFO: Running conv_valid version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID); fprintf(stderr, "INFO: Running conv_valid version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID);
printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n", fprintf(stderr, "INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]);
printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n", fprintf(stderr, "INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]); CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]);
} }
...@@ -151,13 +153,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -151,13 +153,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) printf("INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split); if (verbose) fprintf(stderr, "INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split); if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split);
if (verbose) printf("INFO: impl 'conv_patch' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -221,26 +223,28 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -221,26 +223,28 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) if (verbose>1)
printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i," " kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i\n", " kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel); nb_split, preload_full_kernel);
if (verbose) printf("INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i\n", if (verbose) fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i\n",
nb_split,preload_full_kernel); nb_split,preload_full_kernel);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i," " kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i\n", " kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel); nb_split, preload_full_kernel);
if (verbose) printf("INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -280,12 +284,12 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -280,12 +284,12 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) printf("INFO: used 'conv_rows' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_rows' version\n");
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_rows' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_rows' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -335,13 +339,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -335,13 +339,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -410,20 +414,23 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -410,20 +414,23 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row); if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n", if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3)); threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
//version 8 is the same but we force the split. The split is need in case we have too much threads. This happen frequently if the kernel length is big. Big kernel is frequent in the gradient. //version 8 is the same but we force the split.
// The split is need in case we have too much threads.
// This happen frequently if the kernel length is big.
// Big kernel is frequent in the gradient.
//version 8 need a minimum of kernel length as we force the split. //version 8 need a minimum of kernel length as we force the split.
//version 8 is needed to test more easily this kernel template parameter. //version 8 is needed to test more easily this kernel template parameter.
//version 13 load only 1 kernel row at a time. //version 13 load only 1 kernel row at a time.
...@@ -432,8 +439,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -432,8 +439,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
(version==7||version==8||version==13||version==-1) && (version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split. (version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 need a minimal kernel length as big as the split. //version 13 need a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce! (version!=13||kern_len>1) &&
!work_complete) //conv_patch_stack_reduce !work_complete) //conv_patch_stack_reduce
{ {
int nb_split=1; int nb_split=1;
...@@ -441,25 +448,40 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -441,25 +448,40 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if(version==8||version==13) nb_split++;//force the split. if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false; if(version==13)full_kern=false;
while(ceil_intdiv(kern_len,nb_split)>64)nb_split++;//device 1.3 have a max of 64 thread in z
while(out_size*ceil_intdiv(kern_len,nb_split)>512)nb_split++;
int shared_size=(img_size + kern_size + out_size*kern_len)*sizeof(float);
if(shared_size>=shared_avail){
//if we can't fit the kernel in shared memory, we can split it more.
full_kern=false;
assert((img_size+kern_wid*2+out_size*2)*sizeof(float)<=shared_avail);
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
while(shared_size>=shared_avail || ceil_intdiv(kern_len,nb_split)>64){
nb_split++;
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
}
}
//thread_z is going to be ceil_intdiv(kern_len, nb_split)
// we need enough splits so that
// a) thread_z fits in the 'z' threadIdx (i.e. is less than 64)
// b) thread_z * out_len * out_wid fits in the thread count
// c) the kernel doesn't need too much shared memory
// constraint (a)
// device 1.3 have a max of 64 thread in z
while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
// constraint (b)
// (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++;
// tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split); int thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split);
shared_size=sizeof(float)*std::max(
img_size + kern_wid*thread_z,
out_size*thread_z);
}
if (nb_split <= kern_len)
{
assert(thread_z>0);//should not happen, but in case... assert(thread_z>0);//should not happen, but in case...
assert(shared_size<=shared_avail); if(!full_kern) assert(thread_z!=kern_len);
if(!full_kern)
assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z); dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern); dim3 grid(nbatch,nkern);
...@@ -495,9 +517,9 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -495,9 +517,9 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>; else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID); CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) printf("INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n", if (verbose) fprintf(stderr, "INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n",
kern_flipped,ccontig,nb_split,full_kern); kern_flipped,ccontig,nb_split,full_kern);
if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y, threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z); shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata, f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
...@@ -514,10 +536,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -514,10 +536,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z); if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z);
if (verbose) printf("INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} // else no good nb_splits was found
} }
if (1 && (version==6||version==-1) && if (1 && (version==6||version==-1) &&
...@@ -589,12 +612,12 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -589,12 +612,12 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) printf("INFO: used 'conv_valid_row_reduce' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, shared_size=%i, nb_threads=%i\n", n_threads.x, n_threads.y, n_blocks, n_reduce_buf, n_threads.x * n_threads.y); if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, shared_size=%i, nb_threads=%i\n", n_threads.x, n_threads.y, n_blocks, n_reduce_buf, n_threads.x * n_threads.y);
if (verbose) printf("INFO: impl 'conv_valid_row_reduce' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_valid_row_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -604,23 +627,23 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -604,23 +627,23 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK); int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (0) if (1)
{ {
if (verbose) printf("INFO: launching conv_reference_valid\n"); if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose) printf(" img : %i %i %i %i %p %i %i %i %i\n", if (verbose) fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid, nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
img->devdata, img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose) printf(" kern: %i %i %i %i %p %i %i %i %i\n", if (verbose) fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
nkern, nstack, kern_len, kern_wid, nkern, nstack, kern_len, kern_wid,
kern->devdata, kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3] CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3]
); );
if (verbose) printf(" out : %i %i %i %i %p %i %i %i %i\n", if (verbose) fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid, CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
out->devdata, out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]); CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose) printf(" launch params: %i %i %i\n", outsize, n_blocks, n_threads); if (verbose) fprintf(stderr, " launch params: %i %i %i\n", outsize, n_blocks, n_threads);
} }
conv_reference_valid<<<n_blocks, n_threads>>>( nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1], conv_reference_valid<<<n_blocks, n_threads>>>( nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid, img_len, img_wid,
...@@ -636,7 +659,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -636,7 +659,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) printf("INFO: used 'conv_reference_valid' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
} }
else else
{ {
...@@ -645,6 +668,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -645,6 +668,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
return -1; return -1;
} }
} }
assert (work_complete);
return 0; return 0;
//PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err)); //PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
...@@ -770,7 +794,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -770,7 +794,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
if(kern_len==1 && version==5){ if(kern_len==1 && version==5){
//version 5 don't support kern_len==1 as 1%0 return -1. //version 5 don't support kern_len==1 as 1%0 return -1.
version=-1; version=-1;
if(verbose)printf("WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n"); if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
} }
if(img_size_padded_byte+kern_size_byte>shared_avail) version=5; if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
...@@ -835,14 +859,14 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -835,14 +859,14 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version); if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
if (verbose) printf("INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false")); if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false"));
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version); if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
if (verbose) printf("INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n",
version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"), version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
...@@ -872,13 +896,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -872,13 +896,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) printf("INFO: used 'conv_full_patch' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -922,13 +946,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -922,13 +946,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) printf("INFO: used 'conv_full_load_everything' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_load_everything' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_full_load_everything' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -968,41 +992,41 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -968,41 +992,41 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) printf("INFO: used 'conv_full_patch_stack' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (1 && !work_complete) //conv_reference_full if (1 && !work_complete) //conv_reference_full
{ {
if(verbose>1)printf("INFO: will start conv_reference_full\n"); if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK); int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (0) if (0)
{ {
if (verbose) printf("INFO: launching conv_reference_valid\n"); if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose) printf(" img : %i %i %i %i %p %i %i %i %i\n", if (verbose) fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
img->devdata, img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose) printf(" kern: %i %i %i %i %p %i %i %i %i\n", if (verbose) fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
kern->devdata, kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3] CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3]
); );
if (verbose) printf(" out : %i %i %i %i %p %i %i %i %i\n", if (verbose) fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3], CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
out->devdata, out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]); CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose) printf(" launch params: %i %i %i\n", outsize, n_blocks, n_threads); if (verbose) fprintf(stderr, " launch params: %i %i %i\n", outsize, n_blocks, n_threads);
if (verbose) printf(" subsample params: %i %i\n", subsample_rows, subsample_cols); if (verbose) fprintf(stderr, " subsample params: %i %i\n", subsample_rows, subsample_cols);
} }
conv_reference_full<<<n_blocks, n_threads>>>(CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(img)[1], conv_reference_full<<<n_blocks, n_threads>>>(CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
...@@ -1017,15 +1041,15 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -1017,15 +1041,15 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) printf("INFO: used 'conv_reference_full' version ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d nkern=%d nstack=%d subsample=%d\n", if (verbose) fprintf(stderr, "INFO: used 'conv_reference_full' version ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d nkern=%d nstack=%d subsample=%d\n",
img_len,img_wid, kern_len, kern_wid, img_len,img_wid, kern_len, kern_wid,
out_len, out_wid, nbatch, nkern, nstack, subsample); out_len, out_wid, nbatch, nkern, nstack, subsample);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads); if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads);
if (verbose) printf("INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_full! (%s)", PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
...@@ -1041,6 +1065,9 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1041,6 +1065,9 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
const int subsample_rows, const int subsample_cols, const int subsample_rows, const int subsample_cols,
const int version, const int verbose) const int version, const int verbose)
{ {
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1.
//
if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;} if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;} if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
...@@ -1061,18 +1088,24 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1061,18 +1088,24 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
out_dim[2] = ceil_intdiv(logical_rows, subsample_rows); out_dim[2] = ceil_intdiv(logical_rows, subsample_rows);
out_dim[3] = ceil_intdiv(logical_cols, subsample_cols); out_dim[3] = ceil_intdiv(logical_cols, subsample_cols);
CudaNdarray * rval = out; CudaNdarray * rval = NULL;
if(!(out && out->nd==4 && CudaNdarray_is_c_contiguous(out)
if ( out
&& out->nd==4
&& CudaNdarray_is_c_contiguous(out)
&& CudaNdarray_HOST_DIMS(out)[0]==out_dim[0] && CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
&& CudaNdarray_HOST_DIMS(out)[1]==out_dim[1] && CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
&& CudaNdarray_HOST_DIMS(out)[2]==out_dim[2] && CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
&& CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])){ && CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])
if (out)
{ {
fprintf(stderr, "Warning: Conv is ignoring 'out' argument with wrong structure.\n"); rval = out;
Py_DECREF(out); Py_INCREF(rval);
} }
else
{
if (verbose) fprintf(stderr, "INFO: Conv is ignoring 'out' argument with wrong structure.\n");
rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim); rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
//rval might be null
} }
if ((rval==NULL) if ((rval==NULL)
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose)) || ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
...@@ -1081,12 +1114,9 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1081,12 +1114,9 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
{ {
// if rval is something we just allocated, // if rval is something we just allocated,
// and there was a problem, then we have to free it. // and there was a problem, then we have to free it.
if (rval != out) Py_XDECREF(rval); Py_XDECREF(rval);
return NULL; return NULL;
} }
//TODO: Get refcount story clearer!
// This function does a weird thing as work-around with Conv_VARARGS
if (rval == out) Py_INCREF(rval);
return (PyObject*)rval; return (PyObject*)rval;
} }
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
//implement the valid convolution only //implement the valid convolution only
/* /*
...@@ -38,6 +40,8 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -38,6 +40,8 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
#define BS(i, j) Bs[i][j] #define BS(i, j) Bs[i][j]
#endif #endif
*/ */
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)<(b)?(a):(b))
const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
#define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float)) #define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float))
...@@ -46,7 +50,8 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -46,7 +50,8 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
if (nb_thread < 64) if (nb_thread < 64)
{ {
if(flipped) if(flipped)
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped. //TODO very slow on device before 1.3.
// make access to kern sequential and access to d_kern flipped.
for(int i=thread_id;i<N;i+=nb_thread) for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[N - 1 - i]; dst[i]=src[N - 1 - i];
//dst[N-1-i]=src[i]; //dst[N-1-i]=src[i];
...@@ -88,10 +93,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -88,10 +93,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
const bool flipped=false, const bool c_contiguous=true){ const bool flipped=false, const bool c_contiguous=true){
if(flipped && ! c_contiguous){ if(flipped && ! c_contiguous){
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread) for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
dst[nb_row*nb_col-1-i]=src[i/nb_col*stride_row+i%nb_col*stride_col]; dst[nb_row*nb_col-1-i]=src[(i/nb_col)*stride_row+(i%nb_col)*stride_col];
}else if(c_contiguous){ }else if(c_contiguous){
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped); load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
}else if(flipped){//c_contiguous==true }else if(flipped){//c_contiguous==true
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped. //TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
int N=nb_col*nb_row; int N=nb_col*nb_row;
...@@ -440,10 +444,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -440,10 +444,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; //int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len - kern_len + 1; //out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; //out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; const int out_wid = blockDim.x;
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[]; extern __shared__ float s_data[];
...@@ -458,9 +464,16 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -458,9 +464,16 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int out_row = ty;//output row int out_row = ty;//output row
const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx; const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID]; //d_img size [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]; float * d_img=&s_data[0];
float * d_reduce=&s_data[img_len*img_wid+(preload_full_kern?kern_len:blockDim.z)*kern_wid];
//d_kern size[(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]
float * d_kern=&s_data[img_len * img_wid];
//d_reduce size [n_threads]
//N.B. this overlaps with d_img and d_kern!
float * d_reduce=&s_data[0];
float sum = 0.0f; float sum = 0.0f;
kern+=kern_stride_nkern*blockIdx.y;//the good nkern kern+=kern_stride_nkern*blockIdx.y;//the good nkern
...@@ -471,30 +484,31 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -471,30 +484,31 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
__syncthreads(); __syncthreads();
load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len, load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
img_stride_col, img_stride_row, false, c_contiguous); img_stride_col, img_stride_row, false, c_contiguous);
if(!(split && ! preload_full_kern))
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
if(split && ! preload_full_kern){ if(split && ! preload_full_kern){
for(int first_row=0, row=tz;first_row<kern_len;row+=blockDim.z, first_row+=blockDim.z){ for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
int idx3; //N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
//TODO: test/check for flipped_kern // (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
if(flipped_kern) // details, but that seemed really weird. tricky bug to find too.
idx3=(kern_len-(first_row)-blockDim.z);//the current last row flipped int idx3 = flipped_kern
else ? max((kern_len - (int)blockDim.z - first_row),0)
idx3=first_row; : first_row;
int len3 = min(blockDim.z, kern_len - first_row);
__syncthreads(); __syncthreads();
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, blockDim.z, load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
const float* idx_kern=&d_kern[tz*kern_stride_row]; const float* idx_kern=&d_kern[tz*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0; float sum2 = 0;
if(row<kern_len) if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2; sum+=sum2;
} }
}else if(split){ }else if(split){
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for(int row=tz;row<kern_len;row+=blockDim.z){ for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
...@@ -504,18 +518,21 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -504,18 +518,21 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int row = tz;//The row of the kernel. int row = tz;//The row of the kernel.
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
//reduce //reduce no sync because previous loop ends with sync
d_reduce[thread_id]=sum; d_reduce[thread_id]=sum;
__syncthreads(); __syncthreads();
if(thread_id<out_len*out_wid){ if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
sum=0; //sum=0;
for(int i=0;i<blockDim.z;i++){ for(int i=1;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*blockDim.x*blockDim.y]; sum+=d_reduce[thread_id+i*out_wid*out_len];
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image out_wid*out_len*blockIdx.y+//the output image
......
...@@ -134,7 +134,9 @@ CudaNdarray_uninit(CudaNdarray*self) ...@@ -134,7 +134,9 @@ CudaNdarray_uninit(CudaNdarray*self)
assert(self->devdata); assert(self->devdata);
if (device_free(self->devdata)) if (device_free(self->devdata))
{ {
std::cerr << "!!!! error freeing device memory\n"; fprintf(stderr,
"!!!! error freeing device memory %p (self=%p)\n",
self->devdata, self);
rval = -1; rval = -1;
} }
self->devdata = NULL; self->devdata = NULL;
...@@ -144,7 +146,9 @@ CudaNdarray_uninit(CudaNdarray*self) ...@@ -144,7 +146,9 @@ CudaNdarray_uninit(CudaNdarray*self)
{ {
if (device_free(self->dev_structure)) if (device_free(self->dev_structure))
{ {
std::cerr << "!!!! error freeing device memory\n"; fprintf(stderr,
"!!!! error freeing dev_structure memory %p (self=%p)\n",
self->dev_structure, self);
rval = -1; rval = -1;
} }
self->dev_structure = NULL; self->dev_structure = NULL;
...@@ -1848,6 +1852,8 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args) ...@@ -1848,6 +1852,8 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
} }
get_gpu_ptr_size<<<1,1>>>(gpu_data); get_gpu_ptr_size<<<1,1>>>(gpu_data);
if (cudaSuccess != cublasGetError()){ if (cudaSuccess != cublasGetError()){
device_free(gpu_data);
return PyErr_Format(PyExc_RuntimeError, return PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_ptr_int_size: error when calling the gpu code."); "CudaNdarray_ptr_int_size: error when calling the gpu code.");
} }
......
...@@ -403,6 +403,11 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype ...@@ -403,6 +403,11 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
self->devdata = 0; self->devdata = 0;
return -1; return -1;
} }
if (0)
fprintf(stderr,
"Allocated devdata %p (self=%p)\n",
self->devdata,
self);
self->data_allocated = size; self->data_allocated = size;
} }
return 0; return 0;
......
...@@ -84,14 +84,36 @@ def py_conv_scipy(img, kern, mode, subsample): ...@@ -84,14 +84,36 @@ def py_conv_scipy(img, kern, mode, subsample):
def _params_allgood_header(): def _params_allgood_header():
print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup" print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), kern_stride=(1,1), version=-1, verbose=0, random=True, print_=None, id=None, rtol=1e-5, atol = 1e-8, nb_iter=0, ones=False): def test_example():
# Test a specific configuration that was failing in one of the big unit-tests
# This configuration information was read from one of the 'FAIL' lines printed by
# _params_allgood during a nosetest run
#
# now it can be tested directly by nosetests test_conv_cuda_ndarray.py:test_example
assert _params_allgood(
(1,1,4,4),
(1,1,3,2),
'valid',
version=13,
random=False)
def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
kern_stride=(1,1), version=-1, verbose=0, random=True, print_=None,
id=None, rtol=1e-5, atol = 1e-8, nb_iter=0, ones=False):
#
# This function is the core of several of the big unit-test drivers,
# but it can also be used very directly on its own to test a specific
# kind of convolution.
#
# See `test_example` (above) for an example of how to use this directly.
#
if ones: if ones:
assert not random assert not random
npy_img = theano._asarray(numpy.ones(ishape), dtype='float32') npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32') npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
elif random: elif random:
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') npy_img = theano._asarray(numpy.random.rand(*ishape)+1, dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') npy_kern = theano._asarray(numpy.random.rand(*kshape)-2, dtype='float32')
else: else:
npy_img = theano._asarray(numpy.arange(numpy.prod(ishape)).reshape(ishape), dtype='float32')+1 npy_img = theano._asarray(numpy.arange(numpy.prod(ishape)).reshape(ishape), dtype='float32')+1
npy_kern = -(theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape), dtype='float32')+1) npy_kern = -(theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape), dtype='float32')+1)
...@@ -155,8 +177,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker ...@@ -155,8 +177,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker
print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs) print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs)
print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size
print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff) print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff)
print rval
if not rval and print_!=False: if not rval and print_!=False:
if npy_img.shape[0]>5: if npy_img.shape[0]>5:
print "img",npy_img[0] print "img",npy_img[0]
...@@ -185,9 +205,19 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on ...@@ -185,9 +205,19 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
ret=False ret=False
try: try:
ret = _params_allgood(ishape, kshape, mode, ret = _params_allgood(ishape,
subsample=subshape, img_stride=istride, kern_stride=kstride, kshape,
version=ver, verbose=verbose, random=random, id=id,print_=print_,rtol=rtol,ones=ones) mode,
subsample=subshape,
img_stride=istride,
kern_stride=kstride,
version=ver,
verbose=verbose,
random=random,
id=id,
print_=print_,
rtol=rtol,
ones=ones)
except Exception, e: except Exception, e:
print ver, id,(ishape, kshape, subshape, istride, kstride) print ver, id,(ishape, kshape, subshape, istride, kstride)
print e print e
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论