Commit 6d4633be, authored by Frédéric Bastien

Merge pull request #3737 from lamblin/gpuarray_abstractconv

Gpuarray abstractconv
......@@ -2406,14 +2406,14 @@ if True:
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(node.op, (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(inp1.type, CudaNdarrayType) or
not isinstance(inp2.type, CudaNdarrayType)):
return None
......
......@@ -237,124 +237,3 @@ class TestConv2d(unittest.TestCase):
verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b,
filter_flip=flip)
def test_cormm_conv(self):
    """Exercise forward, grad-weight and grad-input convolutions on the CPU.

    Iterates over the full cartesian product of the test fixtures
    (input/filter shape pairs, subsampling, border modes, filter
    flipping, and whether explicit shapes are provided) and runs the
    three convolution directions with gradient verification enabled.

    NOTE(review): the method skips when cuDNN is unavailable even though
    every run below uses device='cpu' — presumably the skip guard was
    copied from a GPU test; confirm whether it is really needed here.
    """
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    mode = mode_without_gpu
    for (i, f), s, b, flip, provide_shape in itertools.product(
            zip(self.inputs_shapes, self.filters_shapes),
            self.subsamples,
            self.border_modes,
            self.filter_flip,
            [False, True]):
        # Expected output shape, needed by the two gradient runs below.
        o = self.get_output_shape(i, f, s, b)
        self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                     verify_grad=True, mode=mode, device='cpu',
                     provide_shape=provide_shape, border_mode=b,
                     filter_flip=flip)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode, device='cpu',
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode, device='cpu',
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip)
def test_cpu_conv(self):
    """Check which parameter combinations the plain CPU conv supports.

    For every combination of the test fixtures, decide (via the *_OK
    flags) whether the legacy CPU implementation — with the 'conv_gemm'
    optimization excluded from the mode — is expected to support the
    case.  Supported cases must run; unsupported ones must raise
    NotImplementedError.

    NOTE(review): the skip guard requires cuDNN although all runs use
    device='cpu'; confirm whether it is intentional.
    """
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    # Exclude the gemm-based optimizer so the legacy CPU conv is tested.
    mode = mode_without_gpu.excluding('conv_gemm')
    for (i, f), s, b, flip, provide_shape in itertools.product(
            zip(self.inputs_shapes, self.filters_shapes),
            self.subsamples,
            self.border_modes,
            self.filter_flip,
            [False, True]):
        o = self.get_output_shape(i, f, s, b)
        # Start by assuming the case is supported, then rule out the
        # combinations the legacy CPU implementation cannot handle.
        fwd_OK = True
        gradweight_OK = True
        gradinput_OK = True
        if not flip:
            # Cross-correlation (no filter flip) is unsupported in all
            # three directions.
            fwd_OK = False
            gradweight_OK = False
            gradinput_OK = False
        if b not in ('valid', 'full'):
            # Only the two classic border modes are implemented.
            fwd_OK = False
            gradweight_OK = False
            gradinput_OK = False
        if (not provide_shape) and (s != (1, 1)) and (b == 'full'):
            # Gradients need explicit shapes when subsampling with
            # 'full' border mode.
            gradweight_OK = False
            gradinput_OK = False
        if ((s[0] not in (1, 2)) or (s[1] not in (1, 2))) and (b == 'full'):
            # 'full' mode gradients only support subsampling of 1 or 2.
            gradweight_OK = False
            gradinput_OK = False
        if fwd_OK:
            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                         verify_grad=True, mode=mode, device='cpu',
                         provide_shape=provide_shape, border_mode=b,
                         filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_fwd,
                              inputs_shape=i,
                              filters_shape=f,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
        if gradweight_OK:
            # NOTE(review): verify_grad=False even on the supported
            # path — possibly to keep runtime down; confirm.
            self.run_gradweight(inputs_shape=i, filters_shape=f,
                                output_shape=o, subsample=s,
                                verify_grad=False, mode=mode, device='cpu',
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradweight,
                              inputs_shape=i,
                              filters_shape=f,
                              output_shape=o,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
        if gradinput_OK:
            self.run_gradinput(inputs_shape=i, filters_shape=f,
                               output_shape=o, subsample=s,
                               verify_grad=False, mode=mode, device='cpu',
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradinput,
                              inputs_shape=i,
                              filters_shape=f,
                              output_shape=o,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
// REMEMBER TO INCREASE c_code_cache_version when changing this file
//
//TODO detect SHARED_SIZE dynamically
// Per-block shared-memory budget, in bytes, assumed by the kernel
// selection logic below (16 KiB).
#define SHARED_SIZE (16*1024)
// Border-mode selector passed as 'mode' to PyGpuArray_Conv.
// NOTE(review): semantics inferred from the names ('full' vs 'valid'
// convolution); confirm against the implementation.
enum { ConvMode_FULL, ConvMode_VALID };
// Convolve img by kern into out, subsampling rows/cols as requested.
PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
                           PyGpuArrayObject * out, const int mode,
                           const size_t subsample_rows,
                           const size_t subsample_cols,
                           const int version, const int verbose);
/*
 * version: -1, autodetect, >=0 a specific version to use.
 * If it can't be executed, we revert to the reference implementation
 */
int
PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const PyGpuArrayObject * kern,
PyGpuArrayObject * out, size_t subsample_rows,
size_t subsample_cols,
int version = -1, int verbose=0,
int max_threads_dim0 = 512)
{
int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
if (PyGpuArray_NDIM(img) != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (PyGpuArray_NDIM(kern) != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (PyGpuArray_NDIM(out) != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (verbose>1)
{
fprintf(stderr,
"INFO: Running conv_valid version=%d,"
" MACRO kern_width=%d with inputs:\n",
version, THEANO_KERN_WID);
fprintf(stderr,
"INFO: img dim: %llu %llu %llu %llu "
"img stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0],
(unsigned long long)PyGpuArray_DIMS(img)[1],
(unsigned long long)PyGpuArray_DIMS(img)[2],
(unsigned long long)PyGpuArray_DIMS(img)[3],
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
fprintf(stderr,
"INFO: kern dim: %llu %llu %llu %llu "
"kern stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0],
(unsigned long long)PyGpuArray_DIMS(kern)[1],
(unsigned long long)PyGpuArray_DIMS(kern)[2],
(unsigned long long)PyGpuArray_DIMS(kern)[3],
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4);
fprintf(stderr,
"INFO: out dim: %llu %llu %llu %llu "
"out stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2],
(unsigned long long)PyGpuArray_DIMS(out)[3],
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
fprintf(stderr,
"INFO: subsample_rows=%llu, subsample_cols=%llu\n",
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
}
//Check the output size is valid
if (!(PyGpuArray_DIMS(out)[2] == ceil_intdiv(PyGpuArray_DIMS(img)[2]- PyGpuArray_DIMS(kern)[2] + 1, subsample_rows) ||
PyGpuArray_DIMS(out)[3] == ceil_intdiv(PyGpuArray_DIMS(img)[3]- PyGpuArray_DIMS(kern)[3] + 1, subsample_cols) ||
PyGpuArray_DIMS(out)[0] == PyGpuArray_DIMS(img)[0] ||
PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0] ||
PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1])) {
PyErr_SetString(PyExc_ValueError, "GpuConv: sizes don't match");
return -1;
}
// we now search through a few implementations until one applies to our arguments.
//TODO: make separate version as if all fill this is slower.
//TODO: Make a switch with power of 2 max size as template
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const int stack_len = PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0];
const int img_wid=PyGpuArray_DIMS(img)[3];
const int img_len=PyGpuArray_DIMS(img)[2];
const int kern_wid=PyGpuArray_DIMS(kern)[3];
const int kern_len=PyGpuArray_DIMS(kern)[2];
const int out_wid=PyGpuArray_DIMS(out)[3];
const int out_len=PyGpuArray_DIMS(out)[2];
const int img_stride_col= PyGpuArray_STRIDES(img)[3]/4;
const int img_stride_row=PyGpuArray_STRIDES(img)[2]/4;
const int img_stride_stack= PyGpuArray_STRIDES(img)[1]/4;
const int img_stride_batch=PyGpuArray_STRIDES(img)[0]/4;
const int kern_stride_col= PyGpuArray_STRIDES(kern)[3]/4;
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
const int out_size_byte = out_size*sizeof(float);
if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received had %llu columns!",
THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]);
return -1;
}
bool subsample = subsample_rows!=1 || subsample_cols!=1;
bool img_contiguous = img->ga.flags & GA_C_CONTIGUOUS;
bool kern_contiguous = kern->ga.flags & GA_C_CONTIGUOUS;
bool out_contiguous = out->ga.flags & GA_C_CONTIGUOUS;
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
//if the lower 2 dims are c_contiguous but flipped, unflipping the
// stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
}
//if we remove the restriction
//img_size_byte+kern_size_byte>8*1024, we can enter in condition where
//we will lower the occupency due to shared memory and/or registers.
if ((version == -1) &&
(out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
out_size<=256){
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads for block.x
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
{
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++;
size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(threads_per_block[1]==out_len) k=&conv_patch_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_patch_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n",
threads_per_block[1]==out_len ? "no split": "split", nb_split);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<=max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack
{
//version 1 is without split and preload the full kernel
//version 3 is with split and preload the full kernel
//version 11 is without split and load only 1 kernel row at a time.
//version 12 is with split and load only 1 kernel row at a time.
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)ceil_intdiv(out_len,nb_split),
(size_t)1};
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false;
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
GpuKernel *k = NULL;
if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_64_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_65_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_66_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_67_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_68_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_69_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_70_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_71_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_72_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_73_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_74_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_75_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_76_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_77_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_78_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_79_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_80_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_81_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_82_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_83_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_84_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_85_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_86_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_87_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_88_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_89_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_90_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_91_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_92_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_93_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_94_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_95_node_<<<<HASH_PLACEHOLDER>>>>_0;}
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i"
" and preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows
{
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)1, (size_t)1};
size_t n_blocks[3] = {(size_t)out_len, (size_t)nbatch*nkern, (size_t)1};
size_t shmem_sz = (kern_len*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_rows_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
if (!subsample && out_contiguous &&
(version==5||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack
{
int nb_row=1;
//TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){
if((i)*out_wid<=max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i;
}
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) {
k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
} else {
k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
}
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2
{
// version 9:we preload the full kernel
// version 10: load only a few row at a time.
int nb_row=1;
int version_back = version;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
version = 9;
else if(version==-1)version = 10;
int k_size = kern_size;
if(version==10)
k_size=kern_wid;
for(int i=2;i<=out_len;i++){
if(i*out_wid<=max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i;
}
//to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--;
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) k=&conv_rows_stack2_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==9) k=&conv_rows_stack2_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_stack2_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_rows_stack2_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
" %d row(s).\n",
(version==9?"'load full kernel'":
"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
//version 8 is the same but we force the split.
// The split is need in case we have too much threads.
// This happen frequently if the kernel length is big.
// Big kernel is frequent in the gradient.
//version 8 need a minimum of kernel length as we force the split.
//version 8 is needed to test more easily this kernel template parameter.
//version 13 load only 1 kernel row at a time.
if (!subsample &&
out_contiguous &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) &&
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
int full_kern=true;
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
//check if we can fit the full kernel in the shared memory
if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){
full_kern = false;
}
//thread_z is going to be ceil_intdiv(kern_len, nb_split)
// we need enough splits so that
// a) thread_z fits in the 'z' threadIdx (i.e. is less than 64)
// b) thread_z * out_len * out_wid fits in the thread count
// c) the kernel doesn't need too much shared memory
// constraint (a)
// device 1.3 have a max of 64 thread in z
while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
// constraint (b)
// (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>max_threads_dim0)
nb_split++;
// tentative estimates (prior to contraint c)
size_t thread_z=ceil_intdiv(kern_len,nb_split);
size_t shmem_sz = sizeof(float)*(full_kern
? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max((size_t)img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c)
while ((shmem_sz >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split);
shmem_sz = sizeof(float)*(full_kern
? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
}
if (nb_split <= kern_len)
{
assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len);
size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)out_len,
(size_t)thread_z};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
GpuKernel *k = NULL;
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
/* if(!kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
/*else*/ if(!kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(!kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_4_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(!kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_7_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_8_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_11_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_12_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_15_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col,
(void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version"
" kern_flipped=%i ccontig=%i nb_split=%d,"
" preload_full_kern=%d\n",
kern_flipped, ccontig, nb_split, full_kern);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
} // else no good nb_splits was found
}
if ((version==6||version==-1) &&
kern_len<=320 &&
!work_complete) //conv_valid_row_reduce
{
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
int block_nstack=nstack;
//Max of 512 threads per blocks.
//On old hardware, we have a max of 356 threads as we have only
//8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--;
size_t threads_per_block[3] = {(size_t)block_nstack, (size_t)kern_len, (size_t)1};
size_t n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
*
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
* */
int initial_reduce_boundary = (1 << (int)(log2((double)(n_reduce_buf/sizeof(float)))));
if (initial_reduce_boundary == (n_reduce_buf / sizeof(float)))
initial_reduce_boundary >>= 1;
if (n_reduce_buf == sizeof(float))
assert (initial_reduce_boundary == 0);
else
{
assert (initial_reduce_boundary * 2 >= n_reduce_buf/sizeof(float));
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
}
GpuKernel *k = NULL;
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack)
k=&conv_valid_row_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else
k=&conv_valid_row_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols,
(void *)&initial_reduce_boundary};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, n_reduce_buf, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
if (1 && !work_complete) //conv_reference_valid
{
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (verbose)
fprintf(stderr, "INFO: launching conv_reference_valid\n");
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, 0, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
}
else
{
if (verbose)
fprintf(stderr, "INFO: 'conv_reference_valid' failed\n");
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" PyGpuArray_conv_valid! (%s)",
GpuKernel_error(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1;
}
}
if (!work_complete)
{
PyErr_Format(PyExc_RuntimeError,
"ERROR: no implementation(s) worked for"
" PyGpuArray_conv_valid!"
" Version asked(%d) (-1 mean use an heuristic)",
version);
return -1;
}
return 0;
}
// "Full"-mode 2d convolution driver.  Tries a sequence of specialized GPU
// kernels, from the most restrictive/fastest candidate down to the generic
// reference kernel, until one launch succeeds (work_complete).
// Returns 0 on success, -1 (with a Python exception set) on failure.
//
//   img:  (nbatch, nstack, img_len, img_wid) float32 input images
//   kern: (nkern, nstack, kern_len, kern_wid) float32 filters
//   out:  (nbatch, nkern, out_len, out_wid) pre-allocated float32 output
//   version: -1 selects an implementation heuristically; >=0 forces one
//            (used by the tests).
//   max_threads_dim0: device limit for blockDim.x.
int
PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
PyGpuArrayObject * out, size_t subsample_rows,
size_t subsample_cols, int version = -1, int verbose=0,
int max_threads_dim0=512)
{
//144 is the biggest static shared size used when compiling this file.
const int shared_avail = SHARED_SIZE - 150;
int work_complete = 0;
if (PyGpuArray_NDIM(img) != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (PyGpuArray_NDIM(kern) != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (PyGpuArray_NDIM(out) != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
// check the size of the output matrix
// Full mode: out spatial dims are (img + kern - 1), then subsampled.
assert (PyGpuArray_DIMS(out)[2] == ceil_intdiv(PyGpuArray_DIMS(img)[2] + PyGpuArray_DIMS(kern)[2] - 1, subsample_rows));
assert (PyGpuArray_DIMS(out)[3] == ceil_intdiv(PyGpuArray_DIMS(img)[3] + PyGpuArray_DIMS(kern)[3] - 1, subsample_cols));
assert (PyGpuArray_DIMS(out)[0] == PyGpuArray_DIMS(img)[0]);
assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);
const int stack_len=PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0];
const int img_wid=PyGpuArray_DIMS(img)[3];
const int img_len=PyGpuArray_DIMS(img)[2];
const int kern_wid=PyGpuArray_DIMS(kern)[3];
const int kern_len=PyGpuArray_DIMS(kern)[2];
const int out_wid=PyGpuArray_DIMS(out)[3];
const int out_len=PyGpuArray_DIMS(out)[2];
// Strides below are in bytes; /4 converts them to float32 element counts.
const int img_stride_col= PyGpuArray_STRIDES(img)[3]/4;
const int img_stride_row=PyGpuArray_STRIDES(img)[2]/4;
const int img_stride_stack=PyGpuArray_STRIDES(img)[1]/4;
const int img_stride_batch=PyGpuArray_STRIDES(img)[0]/4;
const int kern_stride_col= PyGpuArray_STRIDES(kern)[3]/4;
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
//padded image sizes (image surrounded by a (kern-1)-wide zero border,
//as needed by the "full" convolution kernels)
const int img_wid_padded=img_wid+2*kern_wid-2;
const int img_len_padded=img_len+2*kern_len-2;
const int img_size_padded=img_len_padded * img_wid_padded;
const int img_size_padded_byte = img_size_padded*sizeof(float);
//const int out_size_byte = out_size*sizeof(float); // unused
// The kernels may have been specialized at compile time for a fixed kernel
// width (THEANO_KERN_WID); 0 means the generic build.
if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) ||
(THEANO_KERN_WID == 0))){
PyErr_Format(PyExc_ValueError,
"ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received"
" had %llud columns!",
THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]);
return -1;
}
bool subsample = subsample_rows!=1 || subsample_cols!=1;
bool img_contiguous = img->ga.flags & GA_C_CONTIGUOUS;
bool kern_contiguous = kern->ga.flags & GA_C_CONTIGUOUS;
bool out_contiguous = out->ga.flags & GA_C_CONTIGUOUS;
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the
//stride and not flipping the kernel in shared memory
//allows using a version that uses fewer registers (so is faster)
//the unflipped version of a variable has the original value when
//we don't need to unflip it, but has the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
}
if (verbose>1)
{
printf("INFO: Running conv_full version=%d,"
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %llu %llu %llu %llu "
"img stride: %lld %lld %lld %lld\n",
(unsigned long long)nbatch,
(unsigned long long)stack_len,
(unsigned long long)img_len,
(unsigned long long)img_wid,
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
printf("INFO: kern dim: %llu %llu %llu %llu "
"kern stride: %lld %lld %lld %lld\n",
(unsigned long long)nkern,
(unsigned long long)nstack,
(unsigned long long)kern_len,
(unsigned long long)kern_wid,
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
printf("INFO: out dim: %llu %llu %llu %llu "
"out stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)out_len,
(unsigned long long)out_wid,
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
}
// --- Candidate 1: conv_full_patch_stack_padded (versions 3/4/5) ---
if (!subsample &&
out_contiguous &&
(version==3||version==4||version==5||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded
{
//version 3 without split
//version 4 with split (more registers)
//version 5 with split (more registers) low mem version(some restriction and still more register)
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
if(kern_len==1 && version==5){
//version 5 doesn't support kern_len==1 as 1%0 returns -1.
version=-1;
if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
}
if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
//we pass by ceil_intdiv in case the out_len is not a multiple
//of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we
//can use 32 registers per threads, but there is some other stuff that
//forces the limit to be lower than 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>max_threads_dim0)version=4;
if(version==-1)version=3;
// NOTE(review): the two branches below are dead code -- the line above
// already replaced every remaining version==-1 with 3, so neither
// condition can be true here.  The later "version==3 && nb_split!=1"
// branch compensates by forcing version 4 when a split is needed.
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
//force version 4 when more than 1 split are needed to always execute.
else if(version==3 && nb_split!=1) version=4;
assert(version!=3 || nb_split==1);
assert(version!=5 || kern_len>1);
assert(version!=-1);
size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz=img_size_padded_byte + kern_size_byte;
if(version==5)
shmem_sz=((kern_len+threads_per_block[1]-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
GpuKernel *k = NULL;
// NOTE(review): because version is guaranteed to be 3, 4 or 5 here, the
// first three branches always match and the specialized variants below
// (kernels 4/5/6, 8/9/10 and 12/13/14) are unreachable -- confirm whether
// the contiguity/flip tests were meant to come first.
if(version==3) k=&conv_full_patch_stack_padded_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5) k=&conv_full_patch_stack_padded_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4) k=&conv_full_patch_stack_padded_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) k=&conv_full_patch_stack_padded_4_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) k=&conv_full_patch_stack_padded_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) k=&conv_full_patch_stack_padded_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==3 && kern_flipped) k=&conv_full_patch_stack_padded_8_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5 && kern_flipped)k=&conv_full_patch_stack_padded_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4 && kern_flipped)k=&conv_full_patch_stack_padded_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) k=&conv_full_patch_stack_padded_12_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) k=&conv_full_patch_stack_padded_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) k=&conv_full_patch_stack_padded_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else assert(false);
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_full_patch_stack_padded'"
" nb_split=%d low_mem=%s\n",
nb_split, (version==5?"true":"false"));
work_complete = true;
}
else
{
// Launch failure is not fatal: fall through to the next candidate.
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch_stack_padded' %s %s"
" failed (%s), trying next implementation\n",
version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"),
GpuKernel_error(k, err));
}
}
// --- Candidate 2: conv_full_patch (version 0, nstack==1 only) ---
if (!subsample && c_contiguous &&
(version==0||version==-1) &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_patch
{
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n",
GpuKernel_error(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
}
}
// --- Candidate 3: conv_full_load_everything (version 1) ---
// Intentionally disabled with `false &&` -- the kernel fails its tests.
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)1, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n",
GpuKernel_error(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
}
}
// --- Candidate 4: conv_full_patch_stack (version 2) ---
if (!subsample &&
img_batch_stack_contiguous &&
out_contiguous &&
(version==2||version==-1) &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_patch_stack
{
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
GpuKernel *k = NULL;
// Four variants specialized on 2d contiguity of img and kern.
if(!img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
GpuKernel_error(k, err));
}
}
// --- Last resort: generic reference kernel, always applicable ---
if (1 && !work_complete) //conv_reference_full
{
if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, 0, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_full' version"
" ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
" nkern=%d nstack=%d subsample=%d\n",
img_len,img_wid, kern_len, kern_wid,
out_len, out_wid, nbatch, nkern, nstack, subsample);
work_complete = true;
}
else
{
// The reference kernel was the last option: give up with an error.
// NOTE(review): the message still says "CudaNdarray_conv_full" from the
// pre-gpuarray backend; runtime strings are left untouched here.
if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n",
GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)",
GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1;
}
}
return 0;
}
// Top-level entry for the GPU convolution.
// Dispatches to PyGpuArray_conv_valid or PyGpuArray_conv_full depending on
// `mode`, allocating (or re-using) the output array.
// Returns a new reference to the output PyGpuArrayObject, or NULL with a
// Python exception set on failure.
PyObject *
PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
PyGpuArrayObject * out, const int mode,
const size_t subsample_rows, const size_t subsample_cols,
const int version, const int verbose,
const int max_threads_dim0 = 512
)
{
// Re-use the out object if possible. If the out object is not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1.
//
if (PyGpuArray_NDIM(img) != 4)
{
PyErr_SetString(PyExc_ValueError, "PyGpuArray 4-D tensor required");
return NULL;
}
if (PyGpuArray_NDIM(kern) != 4)
{
PyErr_SetString(PyExc_ValueError, "PyGpuArray 4-D tensor required");
return NULL;
}
// Expected output shape: (batch, nkern, rows, cols), where the spatial
// dims depend on the convolution mode and are then subsampled.
size_t out_dim[4];
out_dim[0] = PyGpuArray_DIMS(img)[0];
out_dim[1] = PyGpuArray_DIMS(kern)[0];
size_t logical_rows, logical_cols;
if (mode == ConvMode_VALID)
{
logical_rows = PyGpuArray_DIMS(img)[2] - PyGpuArray_DIMS(kern)[2] + 1;
logical_cols = PyGpuArray_DIMS(img)[3] - PyGpuArray_DIMS(kern)[3] + 1;
}
else
{
logical_rows = PyGpuArray_DIMS(img)[2] + PyGpuArray_DIMS(kern)[2] - 1;
logical_cols = PyGpuArray_DIMS(img)[3] + PyGpuArray_DIMS(kern)[3] - 1;
}
out_dim[2] = ceil_intdiv(logical_rows, subsample_rows);
out_dim[3] = ceil_intdiv(logical_cols, subsample_cols);
PyGpuArrayObject * rval = NULL;
// Re-use `out` only when it is C-contiguous and has exactly the expected
// shape; otherwise allocate a fresh zeroed array.
if ( out
&& PyGpuArray_NDIM(out)==4
&& out->ga.flags & GA_C_CONTIGUOUS
&& PyGpuArray_DIMS(out)[0]==out_dim[0]
&& PyGpuArray_DIMS(out)[1]==out_dim[1]
&& PyGpuArray_DIMS(out)[2]==out_dim[2]
&& PyGpuArray_DIMS(out)[3]==out_dim[3])
{
rval = out;
Py_INCREF(rval);
if (verbose)
fprintf(stderr,
"INFO: Conv is reusing the 'out' argument"
" structure.\n");
}
else
{
if (out && verbose)
fprintf(stderr,
"INFO: Conv is ignoring 'out' argument with wrong"
" structure.\n");
else if(verbose)
fprintf(stderr,
"INFO: Conv don't have an 'out' argument"
" structure.\n");
rval = pygpu_zeros(4, out_dim,
img->ga.typecode, GA_C_ORDER,
img->context, Py_None);
//rval might be null (allocation failure); handled below.
}
// The conv_* helpers return non-zero on failure with an exception set.
if ((rval==NULL)
|| ((mode==ConvMode_VALID) && PyGpuArray_conv_valid(img, kern, rval,
subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
|| ((mode==ConvMode_FULL) && PyGpuArray_conv_full(img, kern, rval,
subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
)
{
// if rval is something we just allocated,
// and there was a problem, then we have to free it.
// (if rval is the re-used `out`, this just undoes the INCREF above)
Py_XDECREF(rval);
return NULL;
}
return (PyObject*)rval;
}
import copy
import os
from theano import gof
try:
from pygpu import gpuarray
except ImportError:
pass
from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from theano.gof import utils
class GpuConv(GpuKernelBase, gof.Op):
"""
Implement the batched and stacked 2d convolution on the gpu.
Parameters
----------
version
Each version of c_code implements many kernels for the convolution.
By default we try to guess the best one. You can force one version with
this parameter. This parameter is used by the tests.
direction_hint
'forward', 'bprop weights' or 'bprop inputs'. Serves as a hint for graph
optimizers replacing GpuConv by other implementations. If the GpuConv is
inserted automatically, we take its value from ConvOp.
verbose
For value of 1,2 and 3. Print more information during the execution of
the convolution. Mostly used for optimization or debugging.
kshp
The size of the kernel. If provided, can generate faster code. If the
GpuConv op is automatically inserted, we take its value automatically
from the Conv op.
imshp
The size of the image. Not used for code generation but allows to select
an experimental new version in another repo.
max_threads_dim0
The maximum number of threads for the block size dimensions 0
(blockDim.x) used by the GPU function.
nkern
The number of kernels. Not used for this op, but can be used by graph
optimizers to select a more optimal convolution implementation. If the
GpuConv op is inserted automatically, we take its value from the Conv
op.
bsize
The batch size. Not used for this op, but can be used by graph
optimizers to select a more optimal convolution implementation. If the
GpuConv op is inserted automatically, we take its value from the Conv
op.
fft_opt
Deactivate fft_opt optimization at the op level when set to False. Note
that by default fft optimization aren't enabled.
See :ref:`convolution documentation <libdoc_tensor_nnet_conv>` to enable
them.
"""
__props__ = ('border_mode', 'subsample', 'logical_img_hw',
'logical_kern_hw', 'logical_kern_align_top', 'version',
'verbose', 'kshp', 'imshp', 'max_threads_dim0')
@staticmethod
def logical_output_shape_2d(imshp, kshp, mode):
if mode == 'valid':
return imshp[0] - kshp[0] + 1, imshp[1] - kshp[1] + 1
if mode == 'full':
return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
raise ValueError(mode)
def __init__(self, border_mode, subsample=(1, 1),
logical_img_hw=None, logical_kern_hw=None,
logical_kern_align_top=True,
version=-1, direction_hint=None,
verbose=0, kshp=None, imshp=None,
max_threads_dim0=None,
nkern=None, bsize=None, fft_opt=True):
self.border_mode = border_mode
self.subsample = subsample
if logical_img_hw is not None:
h, w = logical_img_hw
# TODO: reconsider this... since shapes are not given in
# constructor, maybe a multiplier + offset is a more
# appropriate way of passing this logical grid
logical_img_hw = tuple(logical_img_hw)
self.logical_img_hw = logical_img_hw
if logical_kern_hw is not None:
h, w = logical_kern_hw
# TODO: reconsider this... since shapes are not given in
# constructor, maybe a multiplier + offset is a more
# appropriate way of passing this logical grid
logical_kern_hw = tuple(logical_kern_hw)
self.logical_kern_hw = logical_kern_hw
self.logical_kern_align_top = logical_kern_align_top
self.version = version
self.direction_hint = direction_hint
self.verbose = verbose
self.kshp = kshp
self.imshp = imshp
self.max_threads_dim0 = max_threads_dim0
self.nkern = nkern
self.bsize = bsize
self.fft_opt = fft_opt
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, "imshp"):
self.imshp = None
if not hasattr(self, "max_threads_dim0"):
self.max_threads_dim0 = None
if not hasattr(self, "direction_hint"):
self.direction_hint = None
if not hasattr(self, "nkern"):
self.nkern = None
if not hasattr(self, "bsize"):
self.bsize = None
if not hasattr(self, "fft_opt"):
self.fft_opt = True
def make_node(self, img, kern):
    """Build the Apply node for the convolution.

    Both inputs must be float32 4D tensors; they are moved to the same
    GPU context and the output is a 4D GPU tensor whose batch/kernel
    broadcast pattern follows the inputs.
    """
    if img.dtype != "float32" or kern.dtype != "float32":
        raise NotImplementedError("GpuConv currently only work"
                                  " with float32 dtype")
    if img.type.ndim != 4:
        raise TypeError('img must be 4D tensor')
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')
    ctx_name = infer_context_name(img, kern)
    img = as_gpuarray_variable(img, ctx_name)
    kern = as_gpuarray_variable(kern, ctx_name)
    # Spatial dims are never broadcastable; batch/nkern follow the inputs.
    out_bcast = [img.type.broadcastable[0],
                 kern.type.broadcastable[0]] + [False, False]
    out_var = GpuArrayType(img.dtype, out_bcast, context_name=ctx_name)()
    return gof.Apply(self, [img, kern], [out_var])
def get_params(self, node):
    """Return the GPU context of the node's first input (used as op params)."""
    img_var = node.inputs[0]
    return img_var.type.context
def flops(self, inputs, outputs):
    """Estimate the flop count of this convolution.

    Useful with the hack in profilemode to print the MFlops.
    `inputs`/`outputs` are shape tuples: (images, kerns) and (out,).
    """
    images, kerns = inputs
    out, = outputs
    assert images[1] == kerns[1]
    if self.border_mode == "valid":
        # 2 flops (mul + add) per kernel element, per output pixel,
        # per (batch, nkern, stack) combination.
        per_pixel = 2 * kerns[2] * kerns[3]
        per_image = per_pixel * out[2] * out[3]
        return per_image * images[1] * kerns[0] * images[0]
    # Full mode: every image pixel meets every kernel element.
    return (2 * images[0] * kerns[0] * images[1] *
            kerns[2] * kerns[3] * images[2] * images[3])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
    """Fill in the device thread limit lazily, then build the thunk.

    c_code() refuses to run while max_threads_dim0 is unset, so it is
    resolved here from the input's GPU context before compilation.
    """
    node_ = copy.copy(node)
    assert node.op is node_.op
    op = node_.op
    if op.max_threads_dim0 is None:
        # NOTE(review): copy.copy(node) shares the op (see the assert), so
        # this mutates the original op in place -- presumably intentional.
        op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
    return super(GpuConv, op).make_thunk(node_, storage_map,
                                         compute_map, no_recycling)
def c_compile_args(self):
    """Compiler flag fixing the kernel width at compile time (0 = generic)."""
    kern_wid = self.kshp[1] if self.kshp is not None else 0
    return ['-DTHEANO_KERN_WID=' + str(kern_wid)]
def c_headers(self):
    """C headers required by the generated code."""
    headers = ['<stdio.h>', '<numpy_compat.h>', '<gpuarray/types.h>']
    return headers
def c_code_cache_version(self):
    """Cache key for the generated C code.

    Bump this tuple whenever any of the support_code files change.
    """
    return (0, 23)
def c_code(self, node, nodename, inp, out_, sub):
    """Emit the C code that calls PyGpuArray_Conv for this apply node.

    NOTE: sub.update(locals()) below means the local variable names
    (border_mode, version, verbose, dx, dy, img, kern, out,
    max_threads_dim0) ARE the %-template keys -- do not rename them.
    """
    if node.inputs[0].type.context.kind != "cuda":
        raise NotImplementedError("GpuConv only works for cuda devices")
    img, kern = inp
    out, = out_
    # Subsampling factors along rows (dx) and columns (dy).
    dx = self.subsample[0]
    dy = self.subsample[1]
    border_mode = self.border_mode
    version = self.version
    verbose = self.verbose
    # Copy before mutating: `sub` is owned by the caller.
    sub = sub.copy()
    max_threads_dim0 = self.max_threads_dim0
    if max_threads_dim0 is None:
        raise NotImplementedError("GpuConv.c_code should not be called "
                                  "directly. It should be called by "
                                  "make_thunk() that add some information "
                                  "related to the selected GPU.")
    sub.update(locals())
    return """
//Mandatory args
const char *mode_str = "%(border_mode)s";
//Optional args
int version = %(version)s;
int verbose = %(verbose)s;
size_t dx = %(dx)s;
size_t dy = %(dy)s;
int mode;
if (strcmp(mode_str, "full") == 0)
{
mode = ConvMode_FULL;
}
else if (strcmp(mode_str, "valid") == 0)
{
mode = ConvMode_VALID;
}
else
{
PyErr_SetString(PyExc_ValueError,
"mode must be one of 'full' or 'valid'");
return 0;
}
// TODO, make out be decref before we alloc out2!
PyGpuArrayObject * out2 = (PyGpuArrayObject *)PyGpuArray_Conv(
%(img)s, %(kern)s,
%(out)s, mode,
dx, dy,
version, verbose,
%(max_threads_dim0)s);
Py_XDECREF(%(out)s);
%(out)s = out2;
if (%(out)s==NULL){
%(fail)s
}
""" % sub
def c_support_code_apply(self, node, name):
    """Build the per-apply C support code for the conv kernels.

    Concatenates the CUDA kernel sources, compiles them once with pygpu to
    obtain the kernel binary, and returns C declarations embedding both the
    escaped source (`conv_code`) and the binary blob (`conv_bcode`).
    """
    nb = 0
    if self.kshp is not None:
        nb = self.kshp[1]
    kernels = self.gpu_kernels(node, name)
    k = kernels[0]
    code = """
#define THEANO_KERN_WID %(nb)d
""" % locals()
    # Read the CUDA sources next to this module.  Use context managers so
    # the handles are closed deterministically (the previous
    # open(...).read() comprehension leaked them).
    src_dir = os.path.split(__file__)[0]
    parts = []
    for fname in ["conv_kernel.cu", "conv_full_kernel.cu"]:
        with open(os.path.join(src_dir, fname)) as f:
            parts.append(f.read())
    code += "\n".join(parts)
    # Compile once to extract the binary for this device/kernel signature.
    gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
    bin = gk._binary
    bcode = ','.join(hex(ord(c)) for c in bin)
    # Escape the source so it can be embedded as a C string literal.
    code = code.replace('\\', '\\\\')
    code = code.replace('"', '\\"')
    code = code.replace('\n', '\\n')
    mod = """
static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s";
""" % locals()
    return mod
def c_support_code_struct(self, node, name):
    """Append the conv driver source (conv.cu) to the base struct code."""
    base = GpuKernelBase.c_support_code_struct(self, node, name)
    conv_path = os.path.join(os.path.split(__file__)[0], "conv.cu")
    with open(conv_path) as f:
        return base + f.read()
@utils.memoize
def gpu_kernels(self, node, name):
    """Collect every Kernel used by the conv implementations.

    Kernel flags are derived from the input/output dtypes; the order of
    the builders below matters, as c_support_code_apply compiles the
    first kernel of the list.
    """
    dtypes = ([i.dtype for i in node.inputs] +
              [o.dtype for o in node.outputs])
    flags = Kernel.get_flags(*dtypes)
    builders = (self.conv_patch_kernels,
                self.conv_patch_stack_kernels,
                self.conv_patch_stack_reduce_kernels,
                self.conv_rows_kernels,
                self.conv_rows_stack_kernels,
                self.conv_rows_stack2_kernels,
                self.conv_valid_row_reduce_kernels,
                self.conv_reference_valid_kernels,
                self.conv_reference_full_kernels,
                self.conv_full_patch_kernels,
                self.conv_full_patch_stack_kernels,
                self.conv_full_patch_stack_padded_kernels,
                self.conv_full_load_everything_kernels)
    kernels = []
    for build in builders:
        kernels.extend(build(name, flags))
    return kernels
def conv_patch_kernels(self, name, flags):
    """Kernel descriptors for the conv_patch_{2,3} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
    ]
    descriptors = []
    for variant in [2, 3]:
        kern_name = "conv_patch_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_patch_stack_kernels(self, name, flags):
    """Kernel descriptors for the conv_patch_stack_{64..95} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
    ]
    descriptors = []
    for variant in range(64, 96):
        kern_name = "conv_patch_stack_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_patch_stack_reduce_kernels(self, name, flags):
    """Kernel descriptors for the conv_patch_stack_reduce_* variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    descriptors = []
    for variant in [1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15]:
        kern_name = "conv_patch_stack_reduce_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_rows_kernels(self, name, flags):
    """Kernel descriptors for the conv_rows_{0,1} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1]:
        kern_name = "conv_rows_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_rows_stack_kernels(self, name, flags):
    """Kernel descriptors for the conv_rows_stack_{0,1} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1]:
        kern_name = "conv_rows_stack_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_rows_stack2_kernels(self, name, flags):
    """Kernel descriptors for the conv_rows_stack2_{0..3} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1, 2, 3]:
        kern_name = "conv_rows_stack2_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_valid_row_reduce_kernels(self, name, flags):
    """Kernel descriptors for the conv_valid_row_reduce_{0,1} variants."""
    arg_types = [
        'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1]:
        kern_name = "conv_valid_row_reduce_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_reference_valid_kernels(self, name, flags):
    """Kernel descriptor for the reference (naive) valid convolution."""
    arg_types = [
        'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    kern_name = "conv_reference_valid"
    return [Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name)]
def conv_reference_full_kernels(self, name, flags):
    """Kernel descriptor for the reference (naive) full convolution."""
    arg_types = [
        'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    kern_name = "conv_reference_full"
    return [Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name)]
def conv_full_patch_kernels(self, name, flags):
    """Kernel descriptor for the conv_full_patch kernel."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
    ]
    kern_name = "conv_full_patch"
    return [Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name)]
def conv_full_patch_stack_kernels(self, name, flags):
    """Kernel descriptors for the conv_full_patch_stack_{0..3} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1, 2, 3]:
        kern_name = "conv_full_patch_stack_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_full_patch_stack_padded_kernels(self, name, flags):
    """Kernel descriptors for the conv_full_patch_stack_padded_* variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14]:
        kern_name = "conv_full_patch_stack_padded_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_full_load_everything_kernels(self, name, flags):
    """Kernel descriptor for the conv_full_load_everything kernel."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc'
    ]
    kern_name = "conv_full_load_everything"
    return [Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name)]
// Dynamically-sized shared-memory buffer; its byte size is supplied at
// kernel launch time and is carved up by each kernel below.
extern __shared__ float s_data[];
//we store the full image and the full kernel in the shared memory
//each thread computes only one value of the output per loop iteration
//thread block size=out_wid, out_len/nb_split
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
extern "C" __global__ void
conv_full_patch_split(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{
int __shared__ out_len, out_wid, nb_thread_id;
// Apply the byte offsets that locate the arrays inside their buffers.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
// "full" convolution output size: input size + kernel size - 1.
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
int batch_id = blockIdx.x;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int out_col = tx;//output col
int out_row = ty;//output row
const int thread_id = out_row*out_wid + out_col;
// Partition the dynamic shared memory: image first, kernel after it.
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
img+=img_len*img_wid*batch_id;//the good batch
load_to_shared(d_img, img, thread_id, nb_thread_id, img_len*img_wid);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_len*kern_wid);
__syncthreads();
// NOTE(review): this loop variable shadows the `out_row` declared above.
// The stride out_len/nb_split is integer division; presumably blockDim.y
// matches it so every row is covered — confirm against the launch code.
for(int out_row=ty;out_row<out_len;out_row+=out_len/nb_split){
float sum = 0.0f;
int img_row = out_row;
for (int row=0; row < kern_len; row++) {//loop over row
// Walk the image rows in reverse relative to the kernel rows: this
// implements the flip of a true (full) convolution.
int inverse_row = (img_row-row);
if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image
const float* idx_in=&d_img[inverse_row*img_wid];
const float* idx_kern=&d_kern[row*kern_wid];
int img_col = out_col;
int col=0,last=0;
for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col
if(last<0 ||last>=(img_wid))continue;//col outside the image
sum+=idx_in[last]*idx_kern[col];
}
}
out[batch_id*out_len*out_wid+//the output image
out_row*out_wid+out_col] = sum;
}
}
//we store the full image and the full kernel in the shared memory
//each thread computes only one value of the output
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
extern "C" __global__ void
conv_full_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack)
{
int __shared__ out_len, out_wid, nb_thread_id;
// Apply the byte offsets that locate the arrays inside their buffers.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
int batch_id = blockIdx.x;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int out_col = tx;//output col
int out_row = ty;//output row
const int thread_id = out_row*out_wid + out_col;
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
// NOTE(review): only one stack slice is loaded and convolved here;
// presumably the host launches this kernel only when nstack==1 — confirm.
kern+=kern_len*kern_wid*nstack*blockIdx.y;//the good nkern
img+=img_len*img_wid*batch_id;//the good batch
load_to_shared(d_img, img, thread_id, nb_thread_id, img_len*img_wid);
// Last arg true: the kernel is stored flipped into shared memory, so the
// inner loop below is a plain correlation over the flipped copy.
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_len*kern_wid, true);
__syncthreads();
float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row
if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
const float* idx_kern=&d_kern[row*kern_wid];
int col=0;
int max_col=kern_wid;
int img_col=out_col-kern_wid+1;
// Clip the column range so only columns inside the image are read.
max_col=min(max_col,img_wid-img_col);
if(img_col<0){col=-img_col;img_col+=col;}
for (; col < max_col; col++, img_col++) {//loop over col
sum+=idx_in[col]*idx_kern[col];
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum;
}
//we store the full image and the full kernel in the shared memory
//each thread computes only one value of the output
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
//template c_contiguous: if true, the img and kern are column and row contiguous, else we use the stride values from the params. The image needs to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__device__ inline void
conv_full_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
// Apply the byte offsets that locate the arrays inside their buffers.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
// NOTE(review): these pointers live in shared memory and every thread
// writes the same values below — benign, but unusual; kept as-is.
const float __shared__ *kern_, *img_;
const int batch_id = blockIdx.x;
const int nkern_id = blockIdx.y;
const int out_col = threadIdx.x;
const int out_row = threadIdx.y;
const int thread_id = threadIdx.y*blockDim.x+ threadIdx.x;
float* d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float* d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
kern_=kern+kern_stride_nkern*nkern_id;//the good nkern
img_=img+img_len*img_stride_row*(nstack*batch_id);//the good batch
float sum = 0.0f;
// Accumulate over the stack: load one image slice and one kernel slice
// at a time into shared memory, then correlate.
for (int stack = 0;stack<nstack;stack++){
load_to_shared(d_img, img_+stack*img_len*img_stride_row, thread_id,nb_thread_id,img_wid,img_len,img_stride_col, img_stride_row,false,img_c_contiguous_2d);
// flipped=true: kernel stored reversed so the loop below is a correlation.
load_to_shared(d_kern, kern_+stack*kern_stride_stack, thread_id,nb_thread_id,kern_wid,kern_len,kern_stride_col,kern_stride_row,true,kern_c_contiguous_2d);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
const float* idx_kern=&d_kern[row*kern_wid];
int col=0;
int max_col=kern_wid;
int img_col=out_col-kern_wid+1;
// Clip the column range so only columns inside the image are read.
max_col=min(max_col,img_wid-img_col);
if(img_col<0){col=-img_col;img_col+=col;}
for (; col < max_col; col++, img_col++) {//loop over col
sum+=idx_in[col]*idx_kern[col];
}
}
//Needed as not all threads finish the loop at the same time
//And we don't want to overwrite the shared memory.
__syncthreads();
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum;
}
extern "C" {
// Instantiate conv_full_patch_stack for the four template combinations.
// The numeric suffix encodes the flags as a 2-bit value:
// bit 1 = img_c_contiguous_2d, bit 0 = kern_c_contiguous_2d.
#define __INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, \
int kern_len, int kern_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_full_patch_stack<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_FULL_PATCH_STACK(0, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK(1, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK(2, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK
}
/**
 * As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
 * I keep it separated from conv_patch as we take 19-20 registers which is more than the 10/16 max for each thread and thus this could lower the occupancy.
 * Implementation of the full convolution that keeps the full image and the full kernel in shared memory
 * each thread computes only one value of the output if split is true. Otherwise it computes ceil((float)out_len/N) pixels.
 * thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
 * grid block size=batch_id, nkern
 * dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
 * dynamic shared memory: low mem:((kern_len+nb_row-1)+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
 *
 * nkern: the number of kernels, used to compute the output image to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template c_contiguous: if true, the image and kernel are c_contiguous (uses fewer registers)
 * template split: if true, each thread computes more than 1 output pixel.
 * template low_mem: if true, as split but uses less dynamic shared memory and more registers.
 * if you set split and low_mem to true, we will use the low_mem version!
 */
template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__device__ inline void
conv_full_patch_stack_padded( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
int kern_stride_col, int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
// Apply the byte offsets that locate the arrays inside their buffers.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
// Fast path: strides of exactly (-1, -kern_wid) mean the last two
// dimensions are c-contiguous but stored flipped; re-point at the end
// and use plain contiguous strides instead.
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
__shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
batch_id = blockIdx.x;
kern_id = blockIdx.y;
nb_rows = blockDim.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
int out_col = tx;//output col
const int thread_id = ty*blockDim.x + tx;
// Shared memory layout: kernel first, then the zero-padded image.
float * d_kern=&s_data[0];//size of [KERNEL_LEN * KERNEL_WID];
float * d_img=&s_data[kern_len*kern_wid];//size of [see fct doc];
kern+=kern_stride_nkern*kern_id;//the good nkern
img+=img_stride_batch*batch_id;//the good batch
// Width of one padded image row in shared memory.
img_wid_valid=img_wid+2*kern_wid-2;
if(!split && !low_mem){
// --- Variant 1: full image in shared memory, one pixel per thread. ---
fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
const int out_row = ty;//output row
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){
__syncthreads();
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img,
thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}else if(split && !low_mem){
// --- Variant 2: full image in shared memory, several rows per thread. ---
fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
//out_len_max must be higher than out_len as we need all threads when we load the image, as nb_rows is not always a multiple of out_len.
__shared__ int out_len_max;
//TODO pass a parameter nb_split
out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
__syncthreads();
//TODO: load only the part of the image needed or put the partial result in shared memory
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),
img+img_stride_stack*stack,
thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
//The if is needed as on Fermi, reading an out of bound index from shared memory generates an error.
//Not needed on generations before, as they worked anyway. Removing the if generates the good code
//as we store the result of only the good thread.
//This was with nvcc 3.0 on a GTX470 card.
if(out_row<out_len)
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum;
}
}
}else{//low_mem version
// --- Variant 3: only a band of rows kept in shared memory. ---
//don't need to fill the last rows padding as this is done later.
fill(d_img,img_wid_valid*((kern_len+nb_rows-1)+2*kern_len-2), 0, thread_id, nb_thread_id);
//out_len_max must be higher than out_len as we need all threads when we load the image, as nb_rows is not always a multiple of out_len.
__shared__ int out_len_max;
//TODO pass a parameter nb_split
if(thread_id==0)
out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows;
__syncthreads();
for(int out_row = ty, out_row_iter=0;out_row<out_len_max;
out_row+=nb_rows, out_row_iter++){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
__syncthreads();
const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty rows at the start
//we need to reload some rows as when we change out_row we lose the last load due to the stack.
const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of rows from the last out_row iteration to reload
load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid,
img+img_stride_stack*stack//the good stack image
+(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row.
thread_id,nb_thread_id,img_wid,
len_to_load+previous_row,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
//TODO: fill the last rows padding only when needed.
//We always fill the last rows padding even when not needed.
int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load;
row_to_fill = min(row_to_fill,kern_len-1);
fill(d_img+(kern_len-1+len_to_load)*img_wid_valid,
img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id);
load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum;
}
}
}
extern "C" {
// Instantiate conv_full_patch_stack_padded for the supported template
// combinations.  The numeric suffix encodes the flags as a bit mask:
// 8*flipped_kern + 4*c_contiguous + 2*split + 1*low_mem.
// Suffixes 3, 7, 11 and 15 (split && low_mem) are not generated because
// the template's low_mem path subsumes split.
#define __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_full_patch_stack_padded<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0, false, false, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1, false, false, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2, false, false, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4, false, true, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5, false, true, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6, false, true, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8, true, false, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9, true, false, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10, true, false, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12, true, true, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13, true, true, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
// Compile-time unrolled strided dot product: sums i terms x[k*sx]*y[k*sy].
// Binary splitting keeps the template recursion depth at O(log i).
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
// BUG FIX: the second half must advance x by its own stride sx, not by the
// y stride sy (the old code only produced the right answer when sx == sy;
// at the call site sx=img_len*img_wid and sy=kern_len*kern_wid differ).
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sx*(i/2), sx, y+sy*(i/2), sy) ;
//return x[0] * y[0] + everything_dot<i-1>(x+sx, sx, y+sy, sy);
}
// Base cases of the template recursion.
template <> __device__ float everything_dot<0>(const float * x, const int sx, const float * y, const int sy)
{
return 0;
}
template <> __device__ float everything_dot<1>(const float * x, const int sx, const float * y, const int sy)
{
return x[0] * y[0];
}
// Full convolution where one thread block keeps the *entire* image (all
// stack slices) in shared memory and loads one kernel at a time.
// grid = (batch,), thread block covers the output plane.
// dynamic shared memory: nstack*img_len*img_wid + nstack*kern_len*kern_wid
extern "C" __global__ void
conv_full_load_everything( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
// Apply the byte offsets that locate the arrays inside their buffers.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;
int batch_id = blockIdx.x;
const int out_col = threadIdx.x;//output col
const int out_row = threadIdx.y;//output row
const int thread_id = out_row*out_wid + out_col;
float * d_img=&s_data[0]; //size [nstack * IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[nstack * img_len * img_wid];//size [nstack * KERNEL_LEN * KERNEL_WID];
img += blockIdx.x * img_stride_batch;//the good batch
// load the image to shared memory (once; it is reused for every kernel)
for (int i = thread_id; i < nstack * img_len * img_wid; i += nb_thread_id)
{
int stack = i / (img_wid*img_len);
int row = (i % (img_wid*img_len)) / img_wid;
int col = (i % (img_wid*img_len)) % img_wid;
d_img[i] = img[stack*img_stride_stack +row*img_stride_row +col*img_stride_col];
}
for (int kern_idx = 0; kern_idx < nkern; ++kern_idx, kern += kern_stride_nkern)
{
// load the kernel into shared memory and flip it
for (int i = thread_id; i < nstack * kern_len * kern_wid; i += nb_thread_id)
{
int stack = i / (kern_wid*kern_len);
int row = (i % (kern_wid*kern_len)) / kern_wid;
int col = (i % (kern_wid*kern_len)) % kern_wid;
d_kern[stack*kern_len*kern_wid + (kern_len-1-row)*kern_wid + (kern_wid-1-col)]
= kern[stack*kern_stride_stack +row*kern_stride_row +col*kern_stride_col];
}
__syncthreads();
float sum = 0.0f;
for (int row=0; row < kern_len; ++row)
{
int irow = out_row - kern_len+1+row;
// BUG FIX: the old test was `irow > img_len`, which let irow == img_len
// through and read past the image region of shared memory.
if (irow < 0 || irow >= img_len) continue;
for (int col = 0; col < kern_wid; ++col)
{
int icol = out_col - kern_wid+1+col;
// BUG FIX: same off-by-one for the column bound (was `icol > img_wid`).
if (icol < 0 || icol >= img_wid) continue;
if (THEANO_KERN_WID > 0)
{
// NOTE(review): everything_dot sums THEANO_KERN_WID terms with the
// per-stack strides, so this path presumably relies on the define
// matching nstack — confirm against the host code that sets it.
sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
d_kern + row*kern_wid+col, kern_len*kern_wid);
}
else
{
for (int stack = 0; stack < nstack; ++stack)
{
sum += d_img[stack*img_len*img_wid + irow*img_wid + icol] * d_kern[stack*kern_len*kern_wid+row*kern_wid+col];
}
}
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_idx+//the output image
out_row*out_wid+out_col] = sum;
__syncthreads(); //don't start loading another kernel until we're done here
}
}
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
// REMEMBER TO INCREASE c_code_cache_version when changing this file
//
//implement the valid convolution only
/*
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
int new_m = (pos_m+dim_ker[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker[0]; j++) {
int inverse_row = (new_m-j);
const %(type)s* idx_in=&in[inverse_row*dim_im[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_kern=&hvals[j*dim_ker[1]];
int new_n = (pos_n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
sum+=idx_kern[k]*idx_in[last];
}
}//for j
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for n
}//for m
*/
// Include guard for the conv_kernel.cu helper definitions below.
#ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU
/*
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif
*/
#define MIN(a, b) ((a) < (b) ? (a) : (b) )
#define MAX(a, b) ((a) < (b) ? (b) : (a) )
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit.
// Mask that rounds a pointer down to a 256-byte boundary, used by
// load_to_shared to make global-memory reads coalesced.
const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
// Cooperatively copy N contiguous floats from global memory `src` into
// shared memory `dst`, split round-robin over `nb_thread` threads.  When
// `flipped` is true the data is written in reverse order
// (dst[N-1-i] = src[i]), which pre-flips a kernel for full convolution.
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
// Small blocks: simple strided copy.  The 64-thread threshold is a
// heuristic; presumably below it the coalescing setup isn't worth it.
if (nb_thread < 64)
{
if(flipped)
//TODO very slow on device before 1.3.
// make access to kern sequential and access to d_kern flipped.
for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[N - 1 - i];
//dst[N-1-i]=src[i];
else
{
for(int i = thread_id; i < N; i += nb_thread)
{
dst[i] = src[i];
}
}
}
else
{
nb_thread = nb_thread & 0xFFFFFFE0; //make nb_thread a multiple of 32
// Start the copy from the 256-byte boundary at or before src so every
// warp issues aligned, coalesced reads; threads whose pointer falls
// before src simply skip the write on the first iteration.
// Global memory:
// <-------------------------------------->
// A A A A A // points of 256-byte alignment
// dddddddddddddddddddddd // layout of src in global memory
if (thread_id < nb_thread)
{
const float * my_src_ptr = (const float *)(
((size_t)src) & COALESCED_ALIGN);
my_src_ptr += thread_id;
while (my_src_ptr < src + N)
{
if (my_src_ptr >= src)
{
int i = my_src_ptr - src;
if (flipped)
{
dst[N - 1 - i] = *my_src_ptr;
}
else
{
dst[i] = *my_src_ptr;
}
}
my_src_ptr += nb_thread;
}
}
}
}
/*
 * We load from global memory to shared memory, honoring arbitrary row and
 * column strides.  The outer if is optimized away at compilation, so the
 * c_contiguous case falls through to the fast contiguous loader above.
 */
__device__ void load_to_shared(float * dst, const float * src, const int thread_id,
int nb_thread, const int nb_col, const int nb_row,
const int stride_col, const int stride_row,
const bool flipped=false, const bool c_contiguous=true){
if (c_contiguous)
{
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
}
else
{
if (flipped)
{
int LAST = nb_row * nb_col - 1;
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
{
// XXX
// THIS IS SLOW - use whatever blocks are in the
// threads to avoid division and modulo
dst[LAST - i] \
= src[(i/nb_col)*stride_row+(i%nb_col)*stride_col];
}
}
else
{
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
{
// XXX
// THIS IS SLOW - use whatever blocks are in the
// threads to avoid division and modulo
dst[i]=src[i/nb_col*stride_row+i%nb_col*stride_col];
}
}
}
}
// Cooperatively set N consecutive floats starting at dst to `value`,
// with the work split round-robin across the nb_thread threads.
__device__ void fill(float * dst, int N, float value, int thread_id, int nb_thread){
    int idx = thread_id;
    while (idx < N) {
        dst[idx] = value;
        idx += nb_thread;
    }
}
/*
 * We load from global memory to shared memory. The outer if is optimized away at compilation.
 * We put the image at the center of a wider row (wid_pad zeros on each
 * side). Useful to pad an image with 0.
 */
__device__ void load_padded_col_to_shared(float * dst, const float * src,
const int thread_id, const int nb_thread,
const int nb_col, const int nb_row,
const int stride_col, const int stride_row,
const int wid_pad, const bool c_contiguous=true){
if(c_contiguous){//flipped==false
for(int i=thread_id;i<nb_col*nb_row;i+=nb_thread){
int col=i%nb_col;
int row=i/nb_col;
// destination row is nb_col+2*wid_pad wide; shift by wid_pad.
dst[row*(nb_col+2*wid_pad)+col+wid_pad]=src[i];
}
}else{
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread){
int col=i%nb_col;
int row=i/nb_col;
dst[row*(nb_col+2*wid_pad)+col+wid_pad]=src[row*stride_row+col*stride_col];
}
}
}
// Compile-time unrolled dot product of i contiguous floats; binary
// splitting keeps the template recursion depth logarithmic in i.
template<int i> __device__ float convolutionRowNoFlip(const float *data,
const float *kern){
return convolutionRowNoFlip<i/2>(data, kern)+ convolutionRowNoFlip<(i+1)/2>(data+i/2, kern+i/2) ;
//return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern);
}
// Base cases of the template recursion.
template<> __device__ float convolutionRowNoFlip<1>(const float *data,
const float *kern){
return data[0]*kern[0];
}
template<> __device__ float convolutionRowNoFlip<0>(const float *data,
const float *kern){
return 0;
}
// Accumulate into `sum` the dot product of kern_wid data/kern pairs.
// When the compile-time THEANO_KERN_WID define is positive it selects the
// fully unrolled template version; otherwise a pragma-unrolled runtime
// loop over kern_wid is used.
__device__ void convolutionRowNoFlip(float& sum,
const float *data,
const float *kern, const int kern_wid){
if(THEANO_KERN_WID>0)
sum+=convolutionRowNoFlip<THEANO_KERN_WID>(data,kern);
else
#pragma unroll 8
for (int col=0; col < kern_wid; col++) {//loop over col
sum+=data[col]*kern[col];
}
}
/*
 * Write `value` into `dst`, either adding it to the current contents
 * (accumulate==true) or overwriting them (accumulate==false).
 * The template parameter lets the compiler remove the branch.
 */
template<bool accumulate>
__device__ void store_or_accumulate(float& dst, const float value){
    if (!accumulate) {
        dst = value;
    } else {
        dst += value;
    }
}
/**
 * Implementation of the valid convolution that keeps the full image and the full kernel in shared memory.
 * Doesn't implement the stack.
 * Each thread computes only one value for the output if split is false.
 * thread block size=out_wid, out_len (or less than out_len if split is true)
 * grid block size=batch_id, nkern
 * dynamic shared memory: img_len*img_wid+kern_len*kern_wid
 *
 * nkern: the number of kernels, used to compute the output image in which to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template split: if true, each thread computes more than 1 output pixel.
 *                 When true, allows for an output image bigger than 512 pixels.
 *                 Uses more registers.
 */
template<bool flipped_kern, bool split>
__device__ inline void
conv_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack)
{
// NOTE(review): every thread writes the same value to these shared
// scalars; a benign race the original code relies on.
int __shared__ out_len, out_wid, nb_thread_id;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
// "valid" convolution output size.
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
__shared__ int batch_id, kern_id;
batch_id = blockIdx.x;
kern_id = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int out_col = tx;//output col
const int thread_id = ty*blockDim.x + tx;
// Shared memory layout: the full image first, then the kernel.
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
// Select this block's kernel and image (first channel of the stack).
kern+=kern_len*kern_wid*nstack*kern_id;
img+=img_len*img_wid*(nstack*batch_id);
load_to_shared(d_img, img, thread_id,nb_thread_id,img_len*img_wid);
// When flipped_kern is true the kernel is flipped while loading.
load_to_shared(d_kern, kern, thread_id,nb_thread_id,kern_len*kern_wid,flipped_kern);
__syncthreads();
if(!split){
// One output pixel per thread.
int out_row = ty;//output row
float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
blockIdx.y*out_wid*out_len+//the output image (blockIdx.y == kern_id)
out_row*out_wid+out_col] = sum;
}else{
// Each thread walks over several output rows, blockDim.y apart.
for(int out_row=ty;out_row<out_len;out_row+=blockDim.y){
float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
}
}
extern "C" {
// Generates an extern "C" __global__ entry point named conv_patch_<suffix>
// that forwards to the conv_patch device function instantiated with the
// given template arguments (flipped_kern, split).
#define __INSTANTIATE_CONV_PATCH(suffix, ...) \
__global__ void \
conv_patch_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack) \
{ \
conv_patch<__VA_ARGS__>(img, img_offset, kern, kern_offset, \
out, out_offset, img_len, img_wid, kern_len, \
kern_wid, nkern, nstack); \
}
// suffix 2: flipped kernel, one output pixel per thread (split=false)
__INSTANTIATE_CONV_PATCH(2, true, false)
// suffix 3: flipped kernel, each thread computes several rows (split=true)
__INSTANTIATE_CONV_PATCH(3, true, true)
#undef __INSTANTIATE_CONV_PATCH
}
/**
 * As conv_patch, but implements the stack inside the kernel.
 * It is kept separate from conv_patch because it uses more registers, which could lower the occupancy.
 * Implementation of the valid convolution that keeps the full image and the full kernel in shared memory.
 * Each thread computes only one value for the output if split==false, else it computes more than 1 value.
 * thread block size=out_wid, out_len/X (X is any number; the optimized value is ceil(out_len/N))
 * grid block size=batch_id, nkern
 * dynamic shared memory: img_len*img_wid+(preload_full_kern?KERNEL_LEN:1)*kern_wid
 *
 * nkern: the number of kernels, used to compute the output image in which to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * dx: patch stride rows (1 for normal convolution)
 * dy: patch stride cols (1 for normal convolution)
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template accumulate: if true, we add to the result, else we overwrite the result
 * template img_c_contiguous_2d: if true, the image is column- and row-contiguous
 * template kern_c_contiguous_2d: if true, the kernel is column- and row-contiguous
 * template split: if true, each thread generates more than 1 output pixel, but uses more registers.
 * template preload_full_kern: if true, we load the full kernel in shared memory, else we load 1 row at a time.
 * template subsample: if false, removes some computation that is only needed when dx or dy != 1.
 */
template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__device__ inline void
conv_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{
// NOTE(review): written by every thread with the same value (benign race).
int __shared__ nb_thread_id;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
int kern_id = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int out_col = tx;//output col
int out_row = ty;//output row
const int thread_id = out_row*out_wid + out_col;
// Shared memory layout: full image, then either the full kernel or one row.
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:1) * KERNEL_WID];
if(!split){
// One output pixel per thread; loop over the stack, staging each
// channel of the image (and kernel) into shared memory in turn.
kern+=kern_stride_nkern*kern_id;//the good nkern
img+=img_stride_batch*batch_id;//the good batch
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){
load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, false, img_c_contiguous_2d);
if(preload_full_kern)
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
if(!preload_full_kern){
// Load only this kernel row; flipped kernels are read back-to-front.
__syncthreads();
int idx2;
if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row;
else idx2=(row)*kern_stride_row;
load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads();
}
const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern;
const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
store_or_accumulate<accumulate>(
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col],sum);
}else{
// Each thread computes several output rows, blockDim.y apart.
const float __shared__ *kern_, *img_;
int __shared__ out_len_max;
kern_=kern+kern_stride_nkern*kern_id;//the good nkern
img_=img+img_stride_batch*batch_id;//the good batch
//out_len_max is out_len rounded up to a multiple of blockDim.y: all threads
//must keep iterating (to help load the image) since blockDim.y does not
//always divide out_len.
out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
//TODO: invert the out_row and stack loops to avoid reloading the data as frequently!
//TODO: does this happen elsewhere?
for(;out_row<out_len_max;out_row+=blockDim.y){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
//TODO: load only the part of the image needed or put the partial result in shared memory
int idx1=img_stride_stack*stack;
load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, false, img_c_contiguous_2d);
if(preload_full_kern){
int idx2=kern_stride_stack*stack;
load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
}
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
if(!preload_full_kern){
// Load only this kernel row; flipped kernels are read back-to-front.
__syncthreads();
int idx2=kern_stride_stack*stack;
if(flipped_kern)
idx2+=(kern_len-row-1)*kern_stride_row;
else
idx2+=(row)*kern_stride_row;
load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads();
}
const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern;
const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
//The 'if' is needed on Fermi, as reading an out-of-bound index from shared memory generates an error.
//Not needed on earlier generations, which worked anyway. Removing the 'if' generates correct code
//since we store the result of only the valid threads.
//This was observed with nvcc 3.0 on a GTX470 card.
if(out_row<out_len)
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
if(out_row<out_len)
store_or_accumulate<accumulate>(
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col],sum);
}
}
}
extern "C" {
// Generates an extern "C" __global__ wrapper conv_patch_stack_<suffix> that
// forwards to conv_patch_stack instantiated with the given template
// arguments, in order: (flipped_kern, accumulate, img_c_contiguous_2d,
// kern_c_contiguous_2d, split, preload_full_kern, subsample).
#define __INSTANTIATE_CONV_PATCH_STACK(suffix, ...) \
__global__ void \
conv_patch_stack_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int out_len, int out_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern, \
int dx, int dy) \
{ \
conv_patch_stack<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, out_len, \
out_wid, nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern, dx, dy); \
}
// All variants below use flipped_kern=true, accumulate=false. The suffix
// appears to be 64 plus a bit-encoding of the remaining five flags
// (img_c_contiguous_2d=16, kern_c_contiguous_2d=8, split=4,
// preload_full_kern=2, subsample=1) -- presumably matched by the host-side
// kernel-name selection; verify against the caller before changing.
__INSTANTIATE_CONV_PATCH_STACK(64, true, false, false, false, false, false, false)
__INSTANTIATE_CONV_PATCH_STACK(65, true, false, false, false, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK(66, true, false, false, false, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK(67, true, false, false, false, false, true, true)
__INSTANTIATE_CONV_PATCH_STACK(68, true, false, false, false, true, false, false)
__INSTANTIATE_CONV_PATCH_STACK(69, true, false, false, false, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK(70, true, false, false, false, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK(71, true, false, false, false, true, true, true)
__INSTANTIATE_CONV_PATCH_STACK(72, true, false, false, true, false, false, false)
__INSTANTIATE_CONV_PATCH_STACK(73, true, false, false, true, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK(74, true, false, false, true, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK(75, true, false, false, true, false, true, true)
__INSTANTIATE_CONV_PATCH_STACK(76, true, false, false, true, true, false, false)
__INSTANTIATE_CONV_PATCH_STACK(77, true, false, false, true, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK(78, true, false, false, true, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK(79, true, false, false, true, true, true, true)
__INSTANTIATE_CONV_PATCH_STACK(80, true, false, true, false, false, false, false)
__INSTANTIATE_CONV_PATCH_STACK(81, true, false, true, false, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK(82, true, false, true, false, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK(83, true, false, true, false, false, true, true)
__INSTANTIATE_CONV_PATCH_STACK(84, true, false, true, false, true, false, false)
__INSTANTIATE_CONV_PATCH_STACK(85, true, false, true, false, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK(86, true, false, true, false, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK(87, true, false, true, false, true, true, true)
__INSTANTIATE_CONV_PATCH_STACK(88, true, false, true, true, false, false, false)
__INSTANTIATE_CONV_PATCH_STACK(89, true, false, true, true, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK(90, true, false, true, true, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK(91, true, false, true, true, false, true, true)
__INSTANTIATE_CONV_PATCH_STACK(92, true, false, true, true, true, false, false)
__INSTANTIATE_CONV_PATCH_STACK(93, true, false, true, true, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK(94, true, false, true, true, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
#undef __INSTANTIATE_CONV_PATCH_STACK
}
/**
 * As conv_patch_stack, but with kern_len threads per output pixel.
 * It is kept separate because it uses more registers.
 * Implementation of the valid convolution that keeps the full image and the full kernel in shared memory.
 * thread block size=out_wid, out_len, ceil_intdiv(kern_len/nb_split)
 * grid block size=batch_id, nkern
 * dynamic shared memory: img_len*img_wid+kern_wid*(preload_full_kern?kern_len:thread_z)+out_size*thread_z
 *
 * nkern: the number of kernels, used to compute the output image in which to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template img_contiguous: if true, the image is column- and row-contiguous
 * template preload_full_kern: only meaningful when split is true. When false we don't load the full
 *          kernel at once, but load ceil_intdiv(kern_len/nb_split) kernel rows at a time.
 */
template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern>
__device__ inline void
conv_patch_stack_reduce( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
//int __shared__ out_len, out_wid, nb_thread_id;
//out_len = img_len - kern_len + 1;
//out_wid = img_wid - kern_wid + 1;
// The launch configuration encodes the output size directly.
const int out_wid = blockDim.x;
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
//Rebase the pointer to the element with the smallest address and
//switch to positive, contiguous strides.
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int tz = threadIdx.z;
int out_col = tx;//output col
int out_row = ty;//output row
const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
//d_img size [IMAGE_LEN * IMAGE_WID];
float * d_img=&s_data[0];
//d_kern size[(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]
float * d_kern=&s_data[img_len * img_wid];
//d_reduce size [n_threads]
//N.B. this overlaps with d_img and d_kern! Safe only because the stack
//loop ends with a __syncthreads() before d_reduce is written.
float * d_reduce=&s_data[0];
float sum = 0.0f;
kern+=kern_stride_nkern*blockIdx.y;//the good nkern
img+=img_stride_batch*batch_id;//the good batch
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){
__syncthreads();
load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
img_stride_col, img_stride_row, false, c_contiguous);
if(split && ! preload_full_kern){
// Process the kernel blockDim.z rows at a time, each z-slice of
// threads handling one of the loaded rows.
for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
//N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
// (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
// details, but that seemed really weird. tricky bug to find too.
int idx3 = flipped_kern
? max((kern_len - (int)blockDim.z - first_row),0)
: first_row;
int len3 = min(blockDim.z, kern_len - first_row);
__syncthreads();
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
const float* idx_kern=&d_kern[tz*kern_wid];
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0;
if(tz<len3)
convolutionRowNoFlip(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2;
}
}else if(split){
// Full kernel preloaded; each z-slice handles rows tz, tz+blockDim.z, ...
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
}else{
// No split: blockDim.z == kern_len, one kernel row per z-slice.
int row = tz;//The row of the kernel.
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
//reduce no sync because previous loop ends with sync
d_reduce[thread_id]=sum;
__syncthreads();
// The first z-slice of threads reduces the partial sums of the others.
if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
//sum=0;
for(int i=1;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*out_wid*out_len];
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum;
}
}
extern "C" {
// Generates an extern "C" __global__ wrapper conv_patch_stack_reduce_<suffix>
// forwarding to conv_patch_stack_reduce instantiated with the given template
// arguments, in order: (flipped_kern, c_contiguous, split, preload_full_kern).
#define __INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...) \
__global__ void \
conv_patch_stack_reduce_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_patch_stack_reduce<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
// The suffix appears to bit-encode the four flags (flipped_kern=8,
// c_contiguous=4, split=2, preload_full_kern=1); variants with
// split=false and preload_full_kern=false are deliberately disabled.
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE#(0, false, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE#(4, false, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true)
#undef __INSTANTIATE_CONV_PATCH_STACK_REDUCE
}
/**
 * WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY.
 * We store kern_len rows of the image and the full kernel in shared memory.
 * Each thread computes only one value for the output.
 * Doesn't implement the stack and nkern in the kernel.
 * thread block size=out_wid
 * grid block size=out_len,batch_id
 * dynamic shared memory: kern_len*img_wid+kern_len*kern_wid
 * Diff with conv_patch: doesn't store the full image in shared memory,
 * i.e. works for bigger images than conv_patch<split=true,...>.
 */
/*
 * Valid convolution of one output row per thread block: only the kern_len
 * image rows this output row needs, plus the whole (flipped) kernel, are
 * staged in shared memory.  blockIdx.y packs both batch and kernel index;
 * blockIdx.x is the output row; threadIdx.x is the output column.
 * nstack is accepted for signature compatibility but not used here.
 * Fix vs. original: removed a stray empty statement (double semicolon)
 * after `blockIdx.x`.
 */
template<bool c_contiguous>
__device__ inline void
conv_rows( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack,
int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
// NOTE(review): shared scalars written by every thread with the same
// value -- a benign race kept from the original code.
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
float __shared__ *d_img, *d_kern;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
// "valid" convolution output size.
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
// blockIdx.y packs batch index and kernel index.
batch_id= blockIdx.y/nkern;
kern_id = blockIdx.y%nkern;
extern __shared__ float s_data[];
const int out_col = threadIdx.x;//output col
const int out_row = blockIdx.x;//output row
const int thread_id = threadIdx.x;
d_img=&s_data[0];//size of [KERN_LEN * IMAGE_WID];
d_kern=&s_data[kern_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
img+=img_stride_batch*batch_id;//select the good image from the batch
img+=out_row*img_stride_row;//select the good top row.
kern+=kern_stride_nkern*kern_id;//the good nkern
// Stage the needed image rows, and the kernel (flipped: this is a true
// convolution), into shared memory.
load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,kern_len,
img_stride_col, img_stride_row, false, c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads();
float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
extern "C" {
// Generates an extern "C" __global__ wrapper conv_rows_<suffix> forwarding
// to conv_rows instantiated with the given template argument (c_contiguous).
#define __INSTANTIATE_CONV_ROWS(suffix, ...) \
__global__ void \
conv_rows_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_rows<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
// suffix 0: strided input; suffix 1: c-contiguous input.
__INSTANTIATE_CONV_ROWS(0, false)
__INSTANTIATE_CONV_ROWS(1, true)
#undef __INSTANTIATE_CONV_ROWS
}
/**
 * WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY.
 * As conv_rows, but implements the stack. Kept separate as this uses more registers.
 * We store kern_len rows of the image and the full kernel in shared memory.
 * Each thread computes only one value for the output.
 * thread block size=out_wid, block_len
 * grid block size=intceil(out_len/block_len),nb_batch*nb_kern
 * dynamic shared memory: (kern_len+block_len-1)*img_wid+kern_len*kern_wid
 * Diff with conv_patch: doesn't store the full image in shared memory,
 * i.e. works for bigger images than conv_patch<split=true,...>.
 */
template<bool c_contiguous>
__device__ inline void
conv_rows_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
// NOTE(review): shared scalars written by every thread with the same
// value -- a benign race kept from the original code.
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
batch_id= blockIdx.y/nkern;
kern_id = blockIdx.y%nkern;
nb_rows = blockDim.y;
// Clamp the image rows loaded by the last chunk to the image height.
// MIN: presumably a min() helper macro defined earlier in this file --
// not visible in this chunk.
int rows_to_read = MIN(
kern_len + nb_rows - 1,
img_len - blockIdx.x * nb_rows);
/**
 * Every thread ultimately computes one value in the output, at coordinates
 * out[ batch_id, kern_id, out_row, out_col]
 *
 * The batch_id and kern_id are packed into blockIdx.y. out_row and out_col
 * are the threadIdx.x and threadIdx.y.
 *
 * Every thread block deals only with one image, and one filter kernel.
 */
extern __shared__ float s_data[];
const int out_col = threadIdx.x;//output col
const int out_row = blockIdx.x*blockDim.y+threadIdx.y;//output row
const int shared_row = threadIdx.y;
const int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
/*
 * The kernel works by looping over channels (aka colours, aka the stack).
 * On each iteration, a thread block loads one channel of all the image rows that
 * it needs to use, and one channel slice of one kernel.
 */
d_img=&s_data[0];//size of [(KERN_LEN+block_len-1) * IMAGE_WID];
d_kern=&s_data[(kern_len+nb_rows-1) * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
float sum = 0.0f;
for (int stack = 0; stack < nstack; stack++){
int offset =
img_stride_batch * batch_id
+ img_stride_stack * stack
//blockIdx.x is which chunk of nb_rows this thread block deals with
+ img_stride_row * (blockIdx.x * nb_rows);
load_to_shared(
d_img, // dst
img+offset, // src
thread_id, // linear position in block
nb_thread_id, // number of threads
img_wid, // cols in image to read
rows_to_read, // number of rows to read
img_stride_col, // img[i, j, k, l] to img[i, j, k, l + 1]
img_stride_row, // img[i, j, k, l] to img[i, j, k + 1, l]
false, // flip while reading
c_contiguous);
offset = kern_stride_nkern * kern_id + kern_stride_stack * stack;
// The kernel is loaded flipped (true convolution).
load_to_shared(d_kern, kern+offset, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads();//to be sure all threads have finished before we modify the shared memory.
}
// Threads past the bottom of the output (last chunk) write nothing.
if (out_row < out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
extern "C" {
// Generates an extern "C" __global__ wrapper conv_rows_stack_<suffix>
// forwarding to conv_rows_stack instantiated with the given template
// argument (c_contiguous).
#define __INSTANTIATE_CONV_ROWS_STACK(suffix, ...) \
__global__ void \
conv_rows_stack_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_rows_stack<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
// suffix 0: strided input; suffix 1: c-contiguous input.
__INSTANTIATE_CONV_ROWS_STACK(0, false)
__INSTANTIATE_CONV_ROWS_STACK(1, true)
#undef __INSTANTIATE_CONV_ROWS_STACK
}
/**
 * WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY.
 * As conv_rows_stack, but loads only block_len rows of the image at a time, and 1 or all kernel rows.
 * We store block_len rows of the image (at a time) and one or all kernel rows in shared memory.
 * Each thread computes only one value for the output.
 * thread block size=out_wid, block_len
 * grid block size=intceil(out_len/block_len),nb_batch*nb_kern
 * dynamic shared memory: block_len * img_wid+(preload_full_kern?kern_len:1)*kern_wid
 * Diff with conv_patch: doesn't store the full image and kernel in shared memory,
 * i.e. works for bigger images than conv_patch<split=true,...>.
 */
template<bool c_contiguous, bool preload_full_kern>
__device__ inline void
conv_rows_stack2(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
// NOTE(review): shared scalars written by every thread with the same
// value -- a benign race kept from the original code.
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
batch_id= blockIdx.y/nkern;
kern_id = blockIdx.y%nkern;
nb_rows = blockDim.y;
extern __shared__ float s_data[];
const int out_col = threadIdx.x;//output col
const int out_row = blockIdx.x*blockDim.y+threadIdx.y;//output row
const int shared_row = threadIdx.y;
const int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
d_img=&s_data[0];//size of [nb_rows * IMAGE_WID];
d_kern=&s_data[nb_rows*img_wid];//size of [(preload_full_kern?KERNEL_LEN:1) * KERNEL_WID];
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
int _idx2=img_stride_batch*batch_id+img_stride_stack*stack;//select the good image from the batch and stack
_idx2+=(blockIdx.x*nb_rows)*img_stride_row;//select the good top row for the block of threads
__syncthreads();
// Prime the circular buffer with the first nb_rows-1 image rows; each
// iteration of the row loop below streams in the next one.
load_to_shared(d_img,img+_idx2,thread_id,nb_thread_id,img_wid,nb_rows-1,
img_stride_col, img_stride_row, false, c_contiguous);
if(preload_full_kern)
load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
__syncthreads();
if((blockIdx.x*nb_rows+row+nb_rows-1)<img_len){
int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//select the good image from the batch and stack
_idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the good top row for the block of threads
_idx1+=(row+nb_rows-1)*img_stride_row;//the current last row
load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid,
img+_idx1, thread_id, nb_thread_id, img_wid, 1,
img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer.
}
if(!preload_full_kern){
int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//select the good kern from the batch and stack
_idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped
load_to_shared(d_kern, kern+_idx3,
thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, true, c_contiguous);
}
__syncthreads();
//The 'if' is needed on Fermi, as reading an out-of-bound index from shared memory generates an error.
//Not needed on earlier generations, which worked anyway. Removing the 'if' generates correct code
//since we store the result of only the valid threads.
//This was observed with nvcc 3.0 on a GTX470 card.
if(out_row<out_len){
const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f;
convolutionRowNoFlip(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass through an intermediate variable to have more precision.
}
}
}
__syncthreads();
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
// C-linkage __global__ entry points wrapping the conv_rows_stack2 device
// template, so the host side can look each specialization up by name.
// One wrapper is stamped out per combination of the two boolean template
// flags; the numeric suffix encodes the flags as bits.
// NOTE(review): the template declaration is above this excerpt; judging by
// the device code the flags are presumably <preload_full_kern, c_contiguous>
// (in some order) -- confirm against the template definition.
extern "C" {
#define __INSTANTIATE_CONV_ROWS_STACK2(suffix, ...) \
__global__ void \
conv_rows_stack2_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_rows_stack2<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
// suffix = 2*flag1 + flag0
__INSTANTIATE_CONV_ROWS_STACK2(0, false, false)
__INSTANTIATE_CONV_ROWS_STACK2(1, false, true)
__INSTANTIATE_CONV_ROWS_STACK2(2, true, false)
__INSTANTIATE_CONV_ROWS_STACK2(3, true, true)
#undef __INSTANTIATE_CONV_ROWS_STACK2
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
*
* This implementation uses shared memory for the reduce, so it is limited by the product of stacklen x kern_len
*
 * template stack_loop: if true, we accept that blockDim.x < nstack and add a loop over the stack (uses 3 more registers, so lower occupancy when true, but supports nstack*kern_len > 512)
* TODO: explain parameters, preconditions
*/
// One output pixel per CUDA block: threadIdx.x indexes the input stack (ss)
// and threadIdx.y the kernel row (rr).  Each thread accumulates a partial
// inner product over the kernel columns, and the partials are combined with
// a shared-memory tree reduction (reducebuf must hold blockDim.x*blockDim.y
// floats, provided via the dynamic shared-memory launch argument).
template<bool stack_loop>
__device__ inline void
conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
const float *img, const size_t img_offset, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
{
const int outsize = nB * nK * out_len * out_wid;
// The *_offset arguments are byte offsets; rebase the raw pointers once.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
extern __shared__ float reducebuf[];
// Blocks stride over the flattened (col, row, kernel, batch) output space.
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{
//figure out what output element we're in charge of computing
int ii = i;
int iB = ii % nB; // output batch index
ii = ii / nB;
int iK = ii % nK; // output kernel index
ii = ii / nK;
int iR_physical = ii % out_len; //output kernel row
int iC_physical = ii / out_len; // output kernel column
// 'logical' coordinates undo the output subsampling.
int iR_logical = iR_physical * subsample_rows;
int iC_logical = iC_physical * subsample_cols;
int ss = threadIdx.x;
int rr = threadIdx.y;
// Kernel rows are applied flipped (true convolution, not correlation).
int img_rr = iR_logical + kern_len - 1 - rr;
int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y;
float sum = 0.0f;
if(stack_loop){
// blockDim.x may be smaller than stacklen: loop over the stack.
for (; ss < stacklen; ss+=blockDim.x){
const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
// Image columns are walked backwards: columns are flipped too.
const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C;
}
}
}else{
// blockDim.x covers the whole stack: each thread handles one ss.
const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C;
}
}
if (blockDim.x * blockDim.y == 1)
{
// Single-thread block: no reduction needed.
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
}
else
{
reducebuf[reduceIdx] = sum;
__syncthreads();
// The first step may be ragged when the block size is not a power of
// two; initial_reduce_boundary is supplied by the caller.
int reduce_boundary = initial_reduce_boundary;
// add in the terms above the reduce boundary
if (reduceIdx + reduce_boundary < (blockDim.x * blockDim.y))
reducebuf[reduceIdx] += reducebuf[reduce_boundary +reduceIdx];
reduce_boundary >>= 1;
// there are an equal number of terms above and below the reduce_boundary
while (reduce_boundary)
{
__syncthreads();
if (reduceIdx < reduce_boundary)
{
reducebuf[reduceIdx] += reducebuf[reduce_boundary + reduceIdx];
}
reduce_boundary >>= 1;
}
if (reduceIdx == 0)
{
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = reducebuf[0];
}
}
}
}
// C-linkage __global__ wrappers for conv_valid_row_reduce, one per value of
// the <stack_loop> template flag (suffix 0 -> false, 1 -> true), so the host
// can pick the right specialization by kernel name.
extern "C" {
#define __INSTANTIATE_CONV_VALID_ROW_REDUCE(suffix, ...) \
__global__ void \
conv_valid_row_reduce_##suffix( \
int nB, int nK, int stacklen, int img_len, int img_wid, \
int kern_len, int kern_wid, int out_len, int out_wid, \
const float *img, const size_t img_offset, \
int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
const float *kern, const size_t kern_offset, \
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
float *out, const size_t out_offset, \
int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
int subsample_rows, int subsample_cols, \
const int initial_reduce_boundary) \
{ \
conv_valid_row_reduce<__VA_ARGS__>( \
nB, nK, stacklen, img_len, img_wid, \
kern_len, kern_wid, out_len, out_wid, \
img, img_offset, img_str_B, img_str_S, img_str_R, img_str_C, \
kern, kern_offset, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
out, out_offset, out_str_B, out_str_K, out_str_R, out_str_C, \
subsample_rows, subsample_cols, initial_reduce_boundary); \
}
__INSTANTIATE_CONV_VALID_ROW_REDUCE(0, false)
__INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
#undef __INSTANTIATE_CONV_VALID_ROW_REDUCE
}
/**
* Reference implementation of 'valid' mode convolution (with stack)
*
* This implementation works for any size of image and kernel. It does not use shared memory.
*
* TODO: explain parameters, preconditions
*/
// Reference (slow but general) 'valid' convolution: one thread per output
// element, grid-striding over the flattened output; no shared-memory tiling.
extern "C" __global__ void
conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
const float *img, const size_t img_offset,
int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
// NOTE(review): these __shared__ scalars are written (with identical
// values) by every thread and read without synchronization; each thread
// writes before it reads, so this looks benign -- confirm it is deliberate.
__shared__ int numThreads, outsize;
// The *_offset arguments are byte offsets; rebase the raw pointers once.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x;
outsize = nB * nK * out_len * out_wid;
for (int i = idx; i < outsize; i += numThreads) //physical
{
//figure out what output element we're in charge of computing
int ii = i;
int iB = ii % nB; // output batch index
ii = ii / nB;
int iK = ii % nK; // output kernel index
ii = ii / nK;
int iR_physical = ii % out_len; //output kernel row
int iC_physical = ii / out_len; // output kernel column
// 'logical' coordinates undo the output subsampling.
int iR_logical = iR_physical * subsample_rows;
int iC_logical = iC_physical * subsample_cols;
float sum = 0.0f;
for (int ss = 0; ss < stacklen; ++ss)
{
for (int rr = 0; rr < kern_len; ++rr)
{
// Image coordinates are flipped relative to the kernel:
// true convolution, not correlation.
int img_rr = iR_logical + kern_len - 1 - rr;
for (int cc = 0; cc < kern_wid; ++cc)
{
int img_cc = iC_logical + kern_wid-1-cc;
float k_0 = kern[iK*kern_str_K + ss*kern_str_S + rr*kern_str_R + cc*kern_str_C];
float i_0 = img[iB*img_str_B + ss*img_str_S + img_rr*img_str_R + img_cc*img_str_C];
sum += k_0 * i_0;
}
}
}
//coords[i*5+0] = iB;
//coords[i*5+1] = iK;
//coords[i*5+2] = iR;
//coords[i*5+3] = iC;
//coords[i*5+4] = iB * out_str_B + iK * out_str_K + iR * out_str_R + iC * out_str_C;
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
}
}
/**
* Reference implementation of 'full' mode convolution (with stack)
*
* This implementation works for any size of image and kernel. It does not use shared memory.
*
* TODO: explain parameters, preconditions
*/
// Reference (slow but general) 'full' convolution: like conv_reference_valid
// but image coordinates outside the image are treated as implicit zeros
// (skipped), which implements the zero-padded 'full' mode.
extern "C" __global__ void
conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions
const float *img, const size_t img_offset,
int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
// NOTE(review): written by every thread with identical values and read
// without a sync; each thread writes before it reads -- confirm deliberate.
__shared__ int numThreads, physical_outsize;
// The *_offset arguments are byte offsets; rebase the raw pointers once.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x;
physical_outsize = nB * nK * out_len * out_wid;
for (int i = idx; i < physical_outsize; i += numThreads)
{
//figure out what output element we're in charge of computing
int ii = i;
int iB = ii % nB; // output batch index
ii = ii / nB;
int iK = ii % nK; // output kernel index
ii = ii / nK;
int iR_physical = ii % out_len; //output kernel row
int iC_physical = ii / out_len; // output kernel column
// 'logical' coordinates undo the output subsampling.
int iR_logical = iR_physical * subsample_rows;
int iC_logical = iC_physical * subsample_cols;
float sum = 0.0f;
for (int ss = 0; ss < stacklen; ++ss)
{
for (int rr = 0; rr < kern_len; ++rr)
{
int img_rr = iR_logical - rr;
// Out-of-range rows/cols contribute zero (implicit padding).
if ((img_rr >= 0) && (img_rr < img_len))
{
for (int cc = 0; cc < kern_wid; ++cc)
{
int img_cc = iC_logical - cc;
if ((img_cc >= 0) && (img_cc < img_wid))
{
float k_0 = kern[iK*kern_str_K + ss*kern_str_S + rr*kern_str_R + cc*kern_str_C];
float i_0 = img[iB*img_str_B + ss*img_str_S + img_rr*img_str_R + img_cc*img_str_C];
sum += k_0 * i_0;
}
}
}
}
}
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
}
}
#endif // #ifndef CONV_KERNEL_CU
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
......@@ -5,6 +5,7 @@ import warnings
import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler
......@@ -12,17 +13,19 @@ from theano.gof.type import CDataType, Generic
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs,
get_conv_output_shape)
from theano.tensor.signal.downsample import (DownsampleFactorMax,
MaxPoolGrad, AveragePoolGrad)
from . import pygpu
from .type import get_context, gpu_context_type, list_contexts
from .type import get_context, gpu_context_type, list_contexts, GpuArrayType
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like)
from .elemwise import GpuElemwise
from .conv import GpuConv
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
......@@ -819,6 +822,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
                   subsample=(1, 1), conv_mode='conv'):
    """Build the graph computing the gradient of a cuDNN convolution
    with respect to its filters.

    :param img: convolution input image.
    :param topgrad: gradient flowing back from the convolution output.
    :param kerns_shp: shape of the filters whose gradient is computed.
    """
    ctx_name = infer_context_name(img, topgrad)
    kerns_shp = as_tensor_variable(kerns_shp)
    # cuDNN requires contiguous inputs.
    img = gpu_contiguous(img)
    topgrad = gpu_contiguous(topgrad)
    desc = GpuDnnConvDesc(border_mode=border_mode,
                          subsample=subsample,
                          conv_mode=conv_mode)(kerns_shp)
    # Pre-allocate the output buffer with the filter shape.
    out = GpuAllocEmpty(img.dtype, ctx_name)(*kerns_shp)
    return GpuDnnConvGradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
                  subsample=(1, 1), conv_mode='conv'):
    """Build the graph computing the gradient of a cuDNN convolution
    with respect to its input image.

    :param kerns: convolution filters.
    :param topgrad: gradient flowing back from the convolution output.
    :param img_shp: shape of the image whose gradient is computed.
    """
    ctx_name = infer_context_name(kerns, topgrad)
    img_shp = as_tensor_variable(img_shp)
    # cuDNN requires contiguous inputs.
    kerns = gpu_contiguous(kerns)
    topgrad = gpu_contiguous(topgrad)
    # The convolution descriptor is built from the (symbolic) filter shape.
    desc = GpuDnnConvDesc(border_mode=border_mode,
                          subsample=subsample,
                          conv_mode=conv_mode)(kerns.shape)
    # Pre-allocate the output buffer with the image shape.
    out = GpuAllocEmpty(kerns.dtype, ctx_name)(*img_shp)
    return GpuDnnConvGradI()(kerns, topgrad, out, desc)
class GpuDnnPoolDesc(Op):
"""
This Op builds a pooling descriptor for use in the other
......@@ -1188,57 +1215,53 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()])
# @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer([GpuConv])
def local_conv_dnn(node):
    """Replace a GpuConv node by the equivalent cuDNN convolution,
    when cuDNN is available and the border mode is supported."""
    if not isinstance(node.op, GpuConv):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    if node.op.border_mode not in ['full', 'valid']:
        return
    img, kern = node.inputs
    replacement = dnn_conv(img, kern,
                           border_mode=node.op.border_mode,
                           subsample=node.op.subsample,
                           direction_hint=node.op.direction_hint)
    return [replacement]
# This optimizer is registered in opt.py as part of the meta-optimizer.
# It tries exactly the opposite code path of what local_conv_dnn() uses,
# because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv])
def local_conv_dnn_alternative(node):
    """Like local_conv_dnn, but steers cuDNN toward the opposite
    algorithm (forward vs. backward pass) for the same convolution."""
    if not isinstance(node.op, GpuConv):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    if border_mode not in ['full', 'valid'] or subsample != (1, 1):
        return
    img, kern = node.inputs
    direction_hint = node.op.direction_hint
    if border_mode == 'full':
        # for a full convolution, try using the forward pass instead
        # of the backward pass wrt. inputs
        direction_hint = 'forward!'
    else:  # border_mode == 'valid' (guaranteed by the guard above)
        # for a valid convolution, try using the backward pass wrt.
        # weights instead of the forward pass and vice versa
        if direction_hint == 'bprop weights':
            direction_hint = 'forward'
        else:
            direction_hint = 'bprop weights'
    replacement = dnn_conv(img, kern,
                           border_mode=border_mode,
                           subsample=subsample,
                           direction_hint=direction_hint)
    return [replacement]
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
    """Map the abstract conv2d ops onto their cuDNN implementations.

    Fires only when both tensor inputs already live on the GPU and cuDNN
    is available in that context.  filter_flip=True maps to cuDNN's
    'conv' mode, False to 'cross' (correlation).
    """
    if not isinstance(node.op, (AbstractConv2d,
                                AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if (not isinstance(inp1.type, GpuArrayType) or
            not isinstance(inp2.type, GpuArrayType)):
        return None
    if not dnn_available(inp1.type.context_name):
        return None

    if node.op.filter_flip:
        conv_mode = 'conv'
    else:
        conv_mode = 'cross'
    if isinstance(node.op, AbstractConv2d):
        rval = dnn_conv(inp1, inp2,
                        border_mode=node.op.border_mode,
                        subsample=node.op.subsample,
                        direction_hint='forward!',
                        conv_mode=conv_mode)
    if isinstance(node.op, AbstractConv2d_gradWeights):
        # filter shape: (nb output channels, nb input channels, h, w)
        shape = (inp2.shape[1], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradweight(inp1, inp2, shape,
                              border_mode=node.op.border_mode,
                              subsample=node.op.subsample,
                              conv_mode=conv_mode)
    if isinstance(node.op, AbstractConv2d_gradInputs):
        # image shape: (batch, nb input channels, h, w)
        shape = (inp2.shape[0], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradinput(inp1, inp2, shape,
                             border_mode=node.op.border_mode,
                             subsample=node.op.subsample,
                             conv_mode=conv_mode)
    return [rval]

# BUG FIX: in the corrupted dump the local_conv_dnn registration was wedged
# inside the function body above (making the rest of the function dead code).
# Both registrations belong at module level, after the optimizers they name.
conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
                       'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
conv_groupopt.register('local_abstractconv_cudnn',
                       local_abstractconv_cudnn, 20,
                       'fast_compile', 'fast_run',
                       'gpuarray', 'conv_dnn', 'cudnn')
@inplace_allocempty(GpuDnnConv, 2)
......
......@@ -14,7 +14,13 @@ from theano.gof.optdb import LocalGroupDB
from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor import as_tensor_variable
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv2d,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tests.breakpoint import PdbBreakpoint
from .type import (GpuArrayType, GpuArrayConstant, get_context,
......@@ -27,7 +33,6 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace)
from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
......@@ -786,77 +791,49 @@ def local_assert(node, context_name):
@register_opt('fast_compile')
@op_lifter([ConvOp])
def local_gpu_conv(node, context_name):
# Lift a CPU ConvOp to a GpuConv in the given GPU context, bailing out
# when the logical kernel shape differs from the physical one.
def GpuConvOp_from_ConvOp(op):
logical_img_hw = None
if op.kshp_logical is not None and op.kshp_logical != op.kshp:
return None
# NOTE(review): logical_img_hw is still None when GpuConv is built here
# and is only reassigned below (affecting just make_graph's closure);
# this ordering looks like diff reordering -- confirm against upstream.
ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw,
logical_kern_hw=op.kshp_logical,
logical_kern_align_top=op.kshp_logical_top_aligned,
kshp=op.kshp,
version=op.version,
direction_hint=op.direction_hint,
verbose=op.verbose,
imshp=op.imshp,
nkern=op.nkern,
bsize=op.bsize,
fft_opt=op.fft_opt)
if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]:
# The logical image is larger than the physical one: upsample the
# physical image by zero-filling before convolving.
rstride = int(numpy.ceil(op.imshp_logical[1] /
float(op.imshp[1])))
cstride = int(numpy.ceil(op.imshp_logical[2] /
float(op.imshp[2])))
def make_graph(img, kern):
buf = tensor.alloc(numpy.asarray(0, dtype=img.dtype),
img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img)
img = GpuFromHost(context_name)(img)
return ret(img, kern)
return make_graph
return ret
def values_eq_approx(a, b):
"""
Comparison helper so DebugMode does not raise spurious errors from
rounding.  The convolution reduces over the two last dimensions, so
the absolute error grows when the number of reduced elements is
significant; loosen atol accordingly.
"""
assert a.ndim == 4
atol = None
if a.shape[-1] * a.shape[-2] > 100:
# For float32 the default atol is 1e-5
atol = 3e-5
return GpuArrayType.values_eq_approx(a, b, atol=atol)
img, kern = node.inputs
gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None:
return
out = gpu_conv(GpuFromHost(context_name)(img),
GpuFromHost(context_name)(kern))
assert isinstance(out.type, GpuArrayType)
# Make sure to keep the broadcastable pattern of the original
# convolution even if we might gain or lose some due to different
# information at the node level.
out = tensor.patternbroadcast(out, node.outputs[0].broadcastable)
out.values_eq_approx = values_eq_approx
return [out]
# Register this here so that it goes after 'local_gpu_conv'
def local_error_convop(node, context_name):
    """Abort graph optimization with an explanatory error when a legacy
    ConvOp reaches the gpuarray backend, which does not support it."""
    # Use an explicit raise rather than `assert False`: assertions are
    # stripped when Python runs with -O, which would silently skip this
    # error.  AssertionError is kept so existing callers catch the same
    # exception type.
    raise AssertionError("""
ConvOp does not work with the gpuarray backend.
Use the new convolution interface to have GPU convolution working:
theano.tensor.nnet.conv2d()
""")
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile')
@op_lifter([AbstractConv2d,
            AbstractConv2d_gradWeights,
            AbstractConv2d_gradInputs])
def local_lift_abstractconv2d(node, context_name):
    """Move the two tensor inputs of an abstract conv to the GPU context
    and rebuild the op on the transferred inputs."""
    new_inputs = list(node.inputs)
    new_inputs[0] = as_gpuarray_variable(new_inputs[0],
                                         context_name=context_name)
    new_inputs[1] = as_gpuarray_variable(new_inputs[1],
                                         context_name=context_name)
    return [node.op(*new_inputs)]
# This will deal with ops that don't have an explicit transfer but
# have one of their inputs on the GPU already and the other not on the
# GPU (to avoid endlessly replacing things).
@register_opt('fast_compile')
@local_optimizer([AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_gpu_abstractconv2d(node):
    """Transfer both tensor inputs of an abstract conv to the GPU when
    exactly one of them is already there."""
    if not isinstance(node.op, BaseAbstractConv2d):
        return
    on_gpu = [isinstance(inp.type, GpuArrayType)
              for inp in node.inputs[:2]]
    # BUG FIX: the original condition was `(A or B) and not (A or B)`,
    # which is always False, so this optimizer could never fire.  Per the
    # comment above, the intent is "at least one input on the GPU, but
    # not both", i.e. `(A or B) and not (A and B)`.
    if any(on_gpu) and not all(on_gpu):
        inps = list(node.inputs)
        ctx_name = infer_context_name(inps[0], inps[1])
        inps[0] = as_gpuarray_variable(inps[0], context_name=ctx_name)
        inps[1] = as_gpuarray_variable(inps[1], context_name=ctx_name)
        return as_tensor_variable(node.op(*inps))
# Register this here so that it goes after the abstract lifting
register_opt()(conv_groupopt)
......
import unittest
import numpy
import itertools
from nose.plugins.skip import SkipTest
import theano
from theano.tests import unittest_tools as utt
import theano.tensor.nnet.abstract_conv as conv
from theano.compile import shared as cpu_shared
from ..type import gpuarray_shared_constructor as gpu_shared
from ..dnn import dnn_available, dnn_conv, dnn_gradweight, dnn_gradinput
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
class TestConv2d(unittest.TestCase):
# Shared fixtures for the convolution tests.  inputs_shapes and
# filters_shapes are paired index-by-index (the test methods zip them);
# shapes are (batch, channels, rows, cols) / (nfilters, channels, rows,
# cols).  border_modes mixes symbolic modes with explicit padding pairs.
def setUp(self):
super(TestConv2d, self).setUp()
self.inputs_shapes = [(8, 1, 12, 12), (8, 1, 18, 18), (2, 1, 4, 4),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
self.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
(1, 1, 2, 5), (4, 1, 2, 2), (4, 5, 2, 2)]
self.subsamples = [(1, 1), (2, 2), (2, 4)]
self.border_modes = ["valid", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
self.filter_flip = [True, False]
def get_output_shape(self, inputs_shape, filters_shape, subsample,
                     border_mode):
    """Return the expected conv2d output shape.

    "valid"/"full" are first translated into explicit (pad_r, pad_c)
    pairs; each spatial dimension then follows
    (in + 2*pad - kern) // step + 1, with None propagated for unknown
    dimensions.
    """
    if border_mode == "valid":
        border_mode = (0, 0)
    if border_mode == "full":
        border_mode = (filters_shape[2] - 1, filters_shape[3] - 1)
    batch_size = inputs_shape[0]
    num_filters = filters_shape[0]
    spatial = tuple(
        None if in_dim is None or k_dim is None
        else (in_dim + 2 * pad - k_dim) // step + 1
        for in_dim, k_dim, step, pad in zip(inputs_shape[2:],
                                            filters_shape[2:],
                                            subsample, border_mode))
    return (batch_size, num_filters) + spatial
def run_fwd(self, inputs_shape, filters_shape, ref=dnn_conv,
            subsample=(1, 1), verify_grad=True, mode=mode_without_gpu,
            border_mode='valid', filter_flip=True, device='cpu',
            provide_shape=False):
    """Run a forward convolution and compare it against `ref`.

    Builds the same convolution twice -- once via `ref` (dnn_conv by
    default) and once via the abstract conv2d interface -- compiles both
    with `mode`, and checks the outputs agree.  Optionally verifies the
    gradient of the abstract op.
    """
    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')
    # Place the data on the requested device via shared variables.
    if device == 'gpu':
        inputs = gpu_shared(inputs_val)
        filters = gpu_shared(filters_val)
    else:
        inputs = theano.tensor.as_tensor_variable(cpu_shared(inputs_val))
        filters = theano.tensor.as_tensor_variable(cpu_shared(filters_val))
    # Optionally give the op static shape information.
    if provide_shape:
        imshp = inputs_shape
        kshp = filters_shape
    else:
        imshp = None
        kshp = None
    # filter_flip=True corresponds to cuDNN's 'conv' mode, False to 'cross'.
    if filter_flip:
        conv_mode = 'conv'
    else:
        conv_mode = 'cross'
    c_ref = ref(inputs, filters,
                border_mode=border_mode,
                subsample=subsample,
                conv_mode=conv_mode)
    c = conv.conv2d(inputs, filters,
                    border_mode=border_mode,
                    subsample=subsample,
                    filter_flip=filter_flip,
                    input_shape=imshp,
                    filter_shape=kshp)
    f_ref = theano.function([], c_ref, mode=mode)
    f = theano.function([], c, mode)
    res_ref = numpy.array(f_ref())
    res = numpy.array(f())
    utt.assert_allclose(res_ref, res)
    if verify_grad:
        # BUG FIX: the gradient used to be verified with a hard-coded
        # border_mode="valid" and the default (flipped) filters, no matter
        # which configuration was under test.  Verify the configuration
        # actually being tested instead.
        utt.verify_grad(conv.AbstractConv2d(border_mode=border_mode,
                                            imshp=imshp, kshp=kshp,
                                            subsample=subsample,
                                            filter_flip=filter_flip),
                        [inputs_val, filters_val],
                        mode=mode)
def run_gradweight(self, inputs_shape, filters_shape, output_shape,
ref=dnn_gradweight, subsample=(1, 1), filter_flip=True,
verify_grad=True, mode=mode_without_gpu, border_mode='valid',
device='cpu', provide_shape=False):
# Build the gradient-wrt-weights graph twice -- once via the abstract
# AbstractConv2d_gradWeights op and once via `ref` (dnn_gradweight by
# default) -- compile both with `mode`, and check the results agree.
inputs_val = numpy.random.random(inputs_shape).astype('float32')
output_val = numpy.random.random(output_shape).astype('float32')
# Place the data on the requested device via shared variables.
if device == 'gpu':
inputs = gpu_shared(inputs_val)
output = gpu_shared(output_val)
else:
inputs = theano.tensor.as_tensor_variable(cpu_shared(inputs_val))
output = theano.tensor.as_tensor_variable(cpu_shared(output_val))
# Optionally give the op static shape information.
if provide_shape:
imshp = inputs_shape
kshp = filters_shape
else:
imshp = None
kshp = None
# filter_flip=True corresponds to cuDNN's 'conv' mode, False to 'cross'.
if filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
c = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
filter_flip=filter_flip,
subsample=subsample,
imshp=imshp, kshp=kshp)
# The op takes (inputs, topgrad, filter spatial shape).
c = c(inputs, output, filters_shape[-2:])
c_ref = ref(inputs, output,
filters_shape,
border_mode=border_mode,
subsample=subsample,
conv_mode=conv_mode)
f = theano.function([], c, mode)
f_ref = theano.function([], c_ref, mode)
res_ref = numpy.array(f_ref())
res = numpy.array(f())
utt.assert_allclose(res_ref, res)
# NOTE(review): this closure drops filter_flip and the static shapes,
# so verify_grad only checks the default (flipped, shapeless)
# configuration -- confirm this is intended.
def abstract_conv2d_gradweight(inputs_val, output_val):
conv_op = conv.AbstractConv2d_gradWeights(border_mode=border_mode, subsample=subsample)
return conv_op(inputs_val, output_val, filters_shape[-2:])
if verify_grad:
utt.verify_grad(abstract_conv2d_gradweight, [inputs_val, output_val],
mode=mode, eps=1)
def run_gradinput(self, inputs_shape, filters_shape, output_shape, ref=dnn_gradinput,
subsample=(1, 1), filter_flip=True, verify_grad=True, mode=mode_without_gpu,
border_mode='valid', device='cpu', provide_shape=False):
# Build the gradient-wrt-inputs graph twice -- once via the abstract
# AbstractConv2d_gradInputs op and once via `ref` (dnn_gradinput by
# default) -- compile both with `mode`, and check the results agree.
output_val = numpy.random.random(output_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
# Place the data on the requested device via shared variables.
if device == 'gpu':
output = gpu_shared(output_val)
filters = gpu_shared(filters_val)
else:
output = theano.tensor.as_tensor_variable(cpu_shared(output_val))
filters = theano.tensor.as_tensor_variable(cpu_shared(filters_val))
# Optionally give the op static shape information.
if provide_shape:
imshp = inputs_shape
kshp = filters_shape
else:
imshp = None
kshp = None
# filter_flip=True corresponds to cuDNN's 'conv' mode, False to 'cross'.
if filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
c = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
imshp=imshp, kshp=kshp)
# The op takes (filters, topgrad, image spatial shape).
c = c(filters, output, inputs_shape[-2:])
c_ref = ref(filters, output, inputs_shape,
border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)
f = theano.function([], c, mode)
f_ref = theano.function([], c_ref, mode)
res_ref = numpy.array(f_ref())
res = numpy.array(f())
utt.assert_allclose(res_ref, res)
# NOTE(review): this closure drops filter_flip and the static shapes,
# so verify_grad only checks the default (flipped, shapeless)
# configuration -- confirm this is intended.
def abstract_conv2d_gradinputs(filters_val, output_val):
conv_op = conv.AbstractConv2d_gradInputs(border_mode=border_mode, subsample=subsample)
return conv_op(filters_val, output_val, inputs_shape[-2:])
if verify_grad:
utt.verify_grad(abstract_conv2d_gradinputs, [filters_val, output_val],
mode=mode, eps=1)
def test_dnn_conv(self):
    """Exercise fwd/gradweight/gradinput on the GPU via cuDNN for every
    shape/subsample/border/flip combination."""
    if not dnn_available(test_ctx_name):
        raise SkipTest(dnn_available.msg)
    mode = mode_with_gpu
    # provide_shape is not used by the cuDNN implementation
    provide_shape = False
    combos = itertools.product(
        zip(self.inputs_shapes, self.filters_shapes),
        self.subsamples,
        self.border_modes,
        self.filter_flip)
    for (in_shp, flt_shp), sub, border, flip in combos:
        out_shp = self.get_output_shape(in_shp, flt_shp, sub, border)
        common = dict(verify_grad=True, mode=mode, device='gpu',
                      provide_shape=provide_shape, border_mode=border,
                      filter_flip=flip)
        self.run_fwd(inputs_shape=in_shp, filters_shape=flt_shp,
                     subsample=sub, **common)
        self.run_gradweight(inputs_shape=in_shp, filters_shape=flt_shp,
                            output_shape=out_shp, subsample=sub, **common)
        self.run_gradinput(inputs_shape=in_shp, filters_shape=flt_shp,
                           output_shape=out_shp, subsample=sub, **common)
def test_cormm_conv(self):
    """Exercise fwd/gradweight/gradinput on the CPU (corrMM path),
    compared against the cuDNN reference -- hence the cuDNN skip guard."""
    if not dnn_available(test_ctx_name):
        raise SkipTest(dnn_available.msg)
    mode = mode_without_gpu
    combos = itertools.product(
        zip(self.inputs_shapes, self.filters_shapes),
        self.subsamples,
        self.border_modes,
        self.filter_flip,
        [False, True])
    for (in_shp, flt_shp), sub, border, flip, provide_shape in combos:
        out_shp = self.get_output_shape(in_shp, flt_shp, sub, border)
        common = dict(verify_grad=True, mode=mode, device='cpu',
                      provide_shape=provide_shape, border_mode=border,
                      filter_flip=flip)
        self.run_fwd(inputs_shape=in_shp, filters_shape=flt_shp,
                     subsample=sub, **common)
        self.run_gradweight(inputs_shape=in_shp, filters_shape=flt_shp,
                            output_shape=out_shp, subsample=sub, **common)
        self.run_gradinput(inputs_shape=in_shp, filters_shape=flt_shp,
                           output_shape=out_shp, subsample=sub, **common)
def test_cpu_conv(self):
    """Exercise the plain CPU convolution path (conv_gemm excluded),
    checking that unsupported configurations raise NotImplementedError."""
    if not dnn_available(test_ctx_name):
        raise SkipTest(dnn_available.msg)
    mode = mode_without_gpu.excluding('conv_gemm')
    combos = itertools.product(
        zip(self.inputs_shapes, self.filters_shapes),
        self.subsamples,
        self.border_modes,
        self.filter_flip,
        [False, True])
    for (in_shp, flt_shp), sub, border, flip, provide_shape in combos:
        out_shp = self.get_output_shape(in_shp, flt_shp, sub, border)
        # The legacy CPU path only handles flipped filters and the
        # symbolic 'valid'/'full' modes.
        fwd_ok = flip and border in ('valid', 'full')
        # 'full' gradients additionally need shapes when subsampling,
        # and only support subsample factors 1 and 2.
        grad_ok = fwd_ok
        if border == 'full':
            if (not provide_shape) and sub != (1, 1):
                grad_ok = False
            if sub[0] not in (1, 2) or sub[1] not in (1, 2):
                grad_ok = False
        kwargs = dict(mode=mode, device='cpu',
                      provide_shape=provide_shape, border_mode=border,
                      filter_flip=flip)
        if fwd_ok:
            self.run_fwd(inputs_shape=in_shp, filters_shape=flt_shp,
                         subsample=sub, verify_grad=True, **kwargs)
        else:
            self.assertRaises(NotImplementedError, self.run_fwd,
                              inputs_shape=in_shp, filters_shape=flt_shp,
                              subsample=sub, verify_grad=False, **kwargs)
        if grad_ok:
            self.run_gradweight(inputs_shape=in_shp, filters_shape=flt_shp,
                                output_shape=out_shp, subsample=sub,
                                verify_grad=False, **kwargs)
        else:
            self.assertRaises(NotImplementedError, self.run_gradweight,
                              inputs_shape=in_shp, filters_shape=flt_shp,
                              output_shape=out_shp, subsample=sub,
                              verify_grad=False, **kwargs)
        if grad_ok:
            self.run_gradinput(inputs_shape=in_shp, filters_shape=flt_shp,
                               output_shape=out_shp, subsample=sub,
                               verify_grad=False, **kwargs)
        else:
            self.assertRaises(NotImplementedError, self.run_gradinput,
                              inputs_shape=in_shp, filters_shape=flt_shp,
                              output_shape=out_shp, subsample=sub,
                              verify_grad=False, **kwargs)
"""
Tests for GPU convolution
"""
from __future__ import print_function
import sys
import time
import unittest
import numpy
from six.moves import xrange
import theano
from theano import tensor
from theano.tests.unittest_tools import seed_rng
# We let that import do the init of the back-end if needed.
from .config import mode_with_gpu, test_ctx_name
from ..type import GpuArrayType, get_context
from ..conv import GpuConv
from theano.sandbox.gpuarray import dnn
import pygpu
imported_scipy_convolve2d = False
try:
from scipy.signal import convolve2d
imported_scipy_convolve2d = True
except ImportError:
pass
gftensor4 = GpuArrayType('float32', [False] * 4, context_name=test_ctx_name)
def py_conv_valid_numpy(img, kern):
    """Reference 'valid'-mode 2D convolution in pure numpy.

    :param img: 4d array (batch, channels, rows, cols).
    :param kern: 4d array (nkern, channels, kern_rows, kern_cols); must
        have the same number of channels as ``img``.
    :return: float32 array of shape
        (batch, nkern, rows - kern_rows + 1, cols - kern_cols + 1).
    """
    assert img.shape[1] == kern.shape[1]
    outshp = (img.shape[0], kern.shape[0],
              img.shape[2] - kern.shape[2] + 1,
              img.shape[3] - kern.shape[3] + 1)
    out = numpy.zeros(outshp, dtype='float32')
    # Builtin range behaves like six.moves.xrange for iteration, and
    # removes this helper's dependency on the six package.
    for b in range(out.shape[0]):
        for k in range(out.shape[1]):
            for rr in range(out.shape[2]):
                for cc in range(out.shape[3]):
                    # rr, cc is the upper-left corner of img patches
                    imgpatch = img[b, :, rr:rr + kern.shape[2],
                                   cc:cc + kern.shape[3]]
                    # Flip the patch spatially (not the kernel) so this
                    # is a true convolution, not a correlation.
                    innerprod = (imgpatch[:, ::-1, ::-1] *
                                 kern[k, :, :, :]).sum()
                    out[b, k, rr, cc] = innerprod
    return out
def py_conv_full_numpy(img, kern):
    """Reference 'full'-mode convolution: zero-pad the image by
    (kernel size - 1) on every side, then run the 'valid' version."""
    pad_r = kern.shape[2] - 1
    pad_c = kern.shape[3] - 1
    padded_shape = (img.shape[0], img.shape[1],
                    img.shape[2] + 2 * pad_r,
                    img.shape[3] + 2 * pad_c)
    padded_img = numpy.zeros(padded_shape, dtype=img.dtype)
    padded_img[:, :, pad_r:pad_r + img.shape[2],
               pad_c:pad_c + img.shape[3]] = img
    return py_conv_valid_numpy(padded_img, kern)
def py_conv(img, kern, mode, subsample):
    """
    Dispatch to a scipy or numpy reference implementation, depending on
    whether scipy could be imported.  The scipy version is faster.
    """
    if imported_scipy_convolve2d:
        return py_conv_scipy(img, kern, mode, subsample)
    if mode == 'valid':
        full_out = py_conv_valid_numpy(img, kern)
    elif mode == 'full':
        full_out = py_conv_full_numpy(img, kern)
    else:
        raise Exception("Can't execute this kernel.")
    # Subsampling is applied on the dense output.
    return full_out[:, :, ::subsample[0], ::subsample[1]]
def py_conv_scipy(img, kern, mode, subsample):
    """Reference convolution built on ``scipy.signal.convolve2d``.

    :param img: 4d array (batch, channels, rows, cols).
    :param kern: 4d array (nkern, channels, kern_rows, kern_cols); must
        have the same number of channels as ``img``.
    :param mode: 'valid' or 'full', forwarded to ``convolve2d``.
    :param subsample: (row_stride, col_stride) applied to the output.
    :return: float32 array (batch, nkern, out_rows, out_cols).
    """
    assert img.shape[1] == kern.shape[1]
    if mode == 'valid':
        outshp = (img.shape[0], kern.shape[0],
                  img.shape[2] - kern.shape[2] + 1,
                  img.shape[3] - kern.shape[3] + 1)
    else:
        outshp = (img.shape[0], kern.shape[0],
                  img.shape[2] + kern.shape[2] - 1,
                  img.shape[3] + kern.shape[3] - 1)
    out = numpy.zeros(outshp, dtype='float32')
    # Builtin range replaces six.moves.xrange: equivalent for iteration,
    # and drops this helper's dependency on the six package.
    for b in range(out.shape[0]):
        for k in range(out.shape[1]):
            # Accumulate the per-channel 2D convolutions.
            for s in range(img.shape[1]):
                out[b, k, :, :] += convolve2d(img[b, s, :, :],
                                              kern[k, s, :, :],
                                              mode)
    return out[:, :, ::subsample[0], ::subsample[1]]
def _params_allgood_header():
    """Print the column header for verbose ``_params_allgood`` runs."""
    header = "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
    print(header)
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8,
                    nb_iter=0, ones=False, compile_kshp=None):
    """Run one GpuConv case and compare it against the CPU reference.

    Builds image/kernel data according to ``ones``/``random``, computes
    the reference with ``py_conv`` on the CPU and ``GpuConv`` on the GPU,
    and returns True when the two outputs are allclose (``rtol``), False
    otherwise.  With ``verbose > 0`` it also prints rough Mflops timings.
    """
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d)The shape of the kernel at run time.
    # :param compile_kshp: (2d) hardcode the shape of the kernel in
    #                      the generated code This is supposed to be
    #                      faster, but we need to check That we raise
    #                      an error if the input have the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        # Deterministic data: positive ramp for the image, negative
        # ramp for the kernel.
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
            numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
    img = pygpu.array(npy_img, context=get_context(test_ctx_name))
    kern = pygpu.array(npy_kern, context=get_context(test_ctx_name))
    # we take the stride after the transfer as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]
    # t2 stays None when the GPU run raised before completing; the timing
    # report below is skipped in that case.
    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = gftensor4()
        k = gftensor4()
        op = GpuConv(border_mode=mode,
                     subsample=subsample,
                     version=version,
                     verbose=verbose,
                     kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=mode_with_gpu)
        gpuval = f(img, kern)
        t2 = time.time()
        # NOTE(review): `i` is reused here as a loop index, shadowing the
        # theano input variable created above; harmless but confusing.
        # Repeated runs must be bitwise identical (determinism check).
        for i in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            print("ERROR: shape mismatch", end=' ', file=sys.stdout)
            print(gpuval.shape, cpuval.shape, file=sys.stdout)
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError as e:
        # NOTE(review): the message says 'Failed allclose' but this branch
        # catches NotImplementedError raised while building or running the
        # op, not a comparison failure — consider rewording.
        print('_params_allgood Failed allclose', e, file=sys.stdout)
        rval = False
    if (t2 is not None):
        # Rough flop estimate, only used for the verbose speed report.
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
                         kshape[3] * ishape[2] * ishape[3] * 2)
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t2 - t1)
        if verbose > 0:
            print('%15s' % str(ishape), '%15s' % str(kshape), end=' ',
                  file=sys.stdout)
            print('%12.5f %7.2f %7.2f %7.1f' %
                  (approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)),
                  file=sys.stdout)
        if not rval:
            # Summarize how far apart the CPU and GPU results are.
            print('test_' + mode + ' id=' + str(id) +
                  ' FAILED for ishape, kshape, mode, subsample,' +
                  ' img_stride, kern_stride, version', ishape,
                  kshape, mode, subsample, img_stride, kern_stride,
                  version, file=sys.stdout)
            diff = cpuval - gpuval
            diffabs = numpy.absolute(diff)
            pr_diff = diffabs / numpy.absolute(cpuval)
            nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
            print("max absolute diff:", (diffabs.max(), "avg abs diff:",
                                         numpy.average(diffabs)))
            print("median abs diff:", (numpy.median(diffabs), "nb close:",
                                       nb_close, "/", diff.size))
            print("max relatif diff:", (pr_diff.max(), "avg rel diff:",
                                        numpy.average(pr_diff)))
        if not rval and print_ is not False:
            # Dump the raw data; for big batches only the first sample.
            if npy_img.shape[0] > 5:
                print("img", npy_img[0])
                print("kern", npy_kern[0])
                print("gpu", gpuval[0][0])
                print("cpu", cpuval[0][0])
                print("diff", diff[0][0])
            else:
                print("img", npy_img)
                print("kern", npy_kern)
                print("gpu", gpuval)
                print("cpu", cpuval)
                print("diff", diff)
    return rval
def exec_conv(version, shapes, verbose, random, mode,
              print_=None, rtol=1e-5, ones=False):
    """
    Run ``_params_allgood`` for every (kernel version, shape) pair,
    count the failures, and assert that none occurred.
    """
    if verbose > 0:
        _params_allgood_header()
    nb_failed = 0
    nb_tests = 0
    failed_version = set()
    failed_id = []
    # -1 stays in `version` in case we forget to add versions to a test.
    for ver in version:
        for id, shape_desc in enumerate(shapes):
            ishape, kshape, subshape, istride, kstride = shape_desc
            ret = False
            try:
                ret = _params_allgood(
                    ishape,
                    kshape,
                    mode,
                    subsample=subshape,
                    img_stride=istride,
                    kern_stride=kstride,
                    version=ver,
                    verbose=verbose,
                    random=random,
                    id=id,
                    print_=print_,
                    rtol=rtol,
                    ones=ones)
            except Exception as e:
                # Keep going: report the failing case and continue, the
                # failure is accounted for via `ret` staying False.
                print(ver, id, (ishape, kshape, subshape, istride, kstride))
                print(e)
            if not ret:
                failed_version.add(ver)
                failed_id.append(id)
                nb_failed += 1
            nb_tests += 1
    if nb_failed > 0:
        print("nb_failed", nb_failed, "on", nb_tests, end=' ')
        print("failed_version", failed_version, "failed_id", failed_id)
        assert nb_failed == 0, nb_failed
    else:
        print('Executed', nb_tests, 'different shapes')
def get_basic_shapes():
    """Basic (img shape, kern shape, subsample, img stride, kern stride)
    test cases with unit subsample and strides."""
    unit = (1, 1)
    # (image shape, kernel shape) pairs: squares first, then
    # non-square images, then non-square kernels.
    pairs = [((1, 1, 1, 1), (1, 1, 1, 1)),
             ((1, 1, 2, 2), (1, 1, 2, 2)),
             ((1, 1, 3, 3), (1, 1, 2, 2)),
             ((1, 1, 2, 4), (1, 1, 2, 2)),
             ((1, 1, 3, 4), (1, 1, 2, 2)),
             ((1, 1, 4, 3), (1, 1, 2, 2)),
             ((1, 1, 4, 4), (1, 1, 3, 2)),
             ((1, 1, 4, 4), (1, 1, 2, 3))]
    return [(ishp, kshp, unit, unit, unit) for ishp, kshp in pairs]
def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
               img_stride=(1, 1), kern_stride=(1, 1)):
    """All combinations where one or more of stack size, batch size and
    nkern is > 1, using the given image shape, kernel shape and
    subsample shape."""
    # ((batch, stack) image prefix, (nkern, stack) kernel prefix)
    prefixes = [((1, 2), (1, 2)),  # stack only
                ((3, 1), (1, 1)),  # batch only
                ((1, 1), (2, 1)),  # nkern only
                ((3, 1), (2, 1)),  # batch and nkern
                ((3, 2), (1, 2)),  # batch and stack
                ((1, 2), (2, 2)),  # stack and nkern
                ((2, 2), (2, 2)),  # batch, nkern and stack
                ((3, 2), (4, 2))]  # batch, nkern and stack
    return [(ipre + imshp, kpre + kshp, subsample, img_stride, kern_stride)
            for ipre, kpre in prefixes]
def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
                img_stride=(1, 1), kern_stride=(1, 1)):
    """Stack/batch/nkern combinations over several base image/kernel
    sizes, each size scaled elementwise by ``scales_img`` and
    ``scales_kern``."""
    # Base (image, kernel) 2d sizes before scaling: squares first,
    # then non-square images, then non-square kernels.
    base_sizes = [((1, 1), (1, 1)),
                  ((2, 2), (2, 2)),
                  ((3, 3), (2, 2)),
                  ((4, 3), (2, 2)),
                  ((3, 4), (2, 2)),
                  ((4, 4), (3, 2)),
                  ((4, 4), (2, 3))]
    shapes = []
    for (ih, iw), (kh, kw) in base_sizes:
        shapes += get_shapes((ih * scales_img[0], iw * scales_img[1]),
                             (kh * scales_kern[0], kw * scales_kern[1]),
                             subsample, img_stride, kern_stride)
    return shapes
def get_valid_shapes():
    """Return the shape descriptions used by the 'valid'-mode tests.

    Each entry is (img shape, kern shape, subsample shape, img stride,
    kern stride).
    """
    # img shape, kern shape, subsample shape
    shapes = get_basic_shapes()
    shapes += get_shapes2()
    # test image stride
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(1, 2))
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 1))
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
    # negative strides reverse the image / kernel along both axes
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(-1, -1))
    shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
    # test subsample done in a separate fct
    shapes += [
        # other test
        ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
        ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize
        ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image
        ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern
        ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
        ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)),  # a big one
        ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # MNIST LeNET layer 1
        ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)),  # layer 1 backprop to weights
        ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)),  # added a test case that fail from test_nnet.py.test_conv_nnet2
        ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)),  # test precedent but reduced that triger the error
        # Test more than maxThreadsDim0
        ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
        ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
    ]
    shapes += [((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 1 layers
               ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 2 layers
               ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 1 full
               ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 2 valid
               ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 1 layers
               ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 2 layers
               ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 full
               ]
    return shapes
def test_valid():
    """Compare GPU 'valid'-mode convolution against the CPU reference
    for every shape returned by ``get_valid_shapes``."""
    seed_rng()
    # `ones` can be flipped on for debugging with deterministic data;
    # random data is used whenever it is off.
    ones = False
    random = not ones
    exec_conv([-1], get_valid_shapes(), 0, random, 'valid',
              print_=False, ones=ones, rtol=1.1e-5)
def test_full():
    """Compare GPU 'full'-mode convolution against the CPU reference,
    including cases where the kernel is bigger than the image."""
    seed_rng()
    shapes = get_basic_shapes()
    shapes += get_shapes2()
    # test image stride
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(1, 2))
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 1))
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(-1, -1))
    shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
    # test subsample done in a separate fct
    shapes += [
        # other test
        ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
        ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize
        ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image
        ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern
        ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
        ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)),  # a big one
        ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # MNIST LeNET layer 1
        ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)),  # layer 1 backprop to weights
        # other test
        ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)),  # kernel bigger then image
        ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)),  # kernel bigger then image
        ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1)),
    ]
    shapes += [
        ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 1 full
        ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 full
        # Test more than maxThreadsDim0
        ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
        ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
    ]
    version = [-1]
    verbose = 0
    random = True
    exec_conv(version, shapes, verbose, random, 'full')
def test_subsample():
    """Check 'valid' and 'full' convolution with non-unit subsampling."""
    seed_rng()
    # Hand-picked subsample cases, including subsample on a 1x1 output.
    shapes = [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)),
              ((1, 1, 1, 1), (1, 1, 1, 1), (2, 2), (1, 1), (1, 1)),
              ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1, 1), (1, 1)),
              ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1, 1), (1, 1)),
              ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1, 1), (1, 1))
              ]
    for sub in [(1, 1), (1, 2), (2, 1), (2, 2)]:
        shapes += get_shapes2(scales_img=(2, 2), subsample=sub)
    # `ones` can be flipped on for debugging with deterministic data.
    ones = False
    random = not ones
    for border_mode in ('valid', 'full'):
        exec_conv([-1], shapes, 0, random, border_mode,
                  print_=False, ones=ones)
class TestConv2DGPU(unittest.TestCase):
    """Tests that conv2d graphs are compiled down to GPU convolution ops."""
    # Ops accepted as a valid GPU implementation of the convolution.
    conv_ops = (GpuConv,
                dnn.DnnBase)
    def test_logical_shapes(self):
        """conv2d with logical (upsampled) shapes compiles to a GPU conv
        op and runs without error."""
        seed_rng()
        for stride in range(1, 4):
            kshp = (10, 2, 10, 10)
            featshp = (3, 10, 11, 11)
            a = tensor.ftensor4()
            A = tensor.ftensor4()
            # Need to transpose first two dimensions of kernel, and reverse
            # index kernel image dims (for correlation)
            kernel_rotated = tensor.transpose(A, axes=[1, 0, 2, 3])
            featshp_logical = (featshp[0], featshp[1], featshp[2] * stride,
                               featshp[3] * stride)
            kshp_rotated = (kshp[1], kshp[0], kshp[2], kshp[3])
            # print featshp, kshp_rotated, featshp_logical[1:], kshp[2:]
            image_estimate = tensor.nnet.conv2d(a, kernel_rotated,
                                                border_mode='full',
                                                image_shape=featshp,
                                                filter_shape=kshp_rotated,
                                                imshp_logical=featshp_logical[1:],
                                                kshp_logical=kshp[2:])
            func = theano.function([a, A], image_estimate, mode=mode_with_gpu)
            # theano.printing.debugprint(func,)
            # The optimized graph must contain at least one GPU conv op.
            assert any([isinstance(node.op, self.conv_ops)
                        for node in func.maker.fgraph.toposort()])
            a_in = numpy.random.randn(*featshp).astype("float32")
            A_in = numpy.random.randn(*kshp).astype("float32")
            func(a_in, A_in)
    def test_invalid_input_shape(self):
        """
        Tests that when the shape given at build time is not the same as
        run time we raise an error
        """
        seed_rng()
        verbose = 0
        random = True
        print_ = False
        ones = False
        if ones:
            random = False
        global mode_with_gpu
        mode_with_gpu_orig = mode_with_gpu
        try:
            # DebugMode is too slow here; temporarily swap in FAST_RUN
            # (restored in the finally clause).
            if theano.config.mode in ['DebugMode', 'DEBUG_MODE']:
                mode_with_gpu = theano.compile.mode.get_mode(
                    'FAST_RUN').including('gpu')
            for mode in ['valid', 'full']:
                for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
                               ((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
                               # We use only the number of columns.
                               ]:
                    self.assertRaises(ValueError, _params_allgood,
                                      shapes[0], shapes[1],
                                      verbose=verbose, random=random,
                                      mode=mode,
                                      print_=print_, ones=ones,
                                      compile_kshp=shapes[2])
        finally:
            mode_with_gpu = mode_with_gpu_orig
def benchmark():
    """Time GPU vs CPU convolution on LeNet-style shapes.

    Runs ``exec_conv`` with verbose=1 so Mflops numbers are printed for
    each shape, first in 'valid' mode then in 'full' mode.  Not run as
    part of the test suite; call it manually.
    """
    shapes_valid = [
        # test_lenet_28 shape
        ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)),  # valid
        ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
        ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
        ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_32 shape
        ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)),  # valid
        ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
        ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
        ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_64 shape
        ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
        ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)),  # valid
        ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
        ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_108 shape
        ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
        ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)),  # valid
        ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
        ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_256 shape
        ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)),  # valid
        ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)),  # valid
        ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)),  # valid
        ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)),  # valid
    ]
    shapes_full = [
        # test_lenet_28 shape
        ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # full
        # test_lenet_32 shape
        ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # full conv_full_patch_stack_padded' N=1
        # test_lenet_64 shape
        ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # full conv_full_patch_stack_padded' N=3
        # test_lenet_108 shape
        ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # full 'conv_full_patch_stack_padded' N=9
        # test_lenet_256 shape
        ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)),  # full conv_reference_full
    ]
    version = [-1]
    verbose = 1
    random = True
    exec_conv(version, shapes_valid, verbose, random, 'valid',
              print_=None, rtol=1e-3)
    exec_conv(version, shapes_full, verbose, random, 'full')
def test_stack_rows_segfault_070312():
    """Regression test: this exact configuration used to trigger an
    illegal GPU memory read (visible under cuda-memcheck)."""
    seed_rng()
    # 07/03/2012
    # Running this unittest with cuda-memcheck exposes an illegal read.
    # THEANO_FLAGS=device=gpu cuda-memcheck nosetests \
    # test_conv_cuda_ndarray.py:test_stack_rows_segfault_070312
    img = theano.shared(numpy.random.rand(1, 80, 96, 96).astype('float32'))
    kern = theano.shared(numpy.random.rand(1, 80, 9, 9).astype('float32'))
    out = theano.shared(numpy.random.rand(1, 2, 2, 3).astype('float32'))
    op = theano.tensor.nnet.conv.ConvOp(imshp=(80, 96, 96), kshp=(9, 9),
                                        nkern=1, bsize=1)
    f = theano.function([], [], updates=[(out, op(img, kern))], mode=mode_with_gpu)
    f()
......@@ -32,7 +32,7 @@ def test_local_remove_all_assert():
a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
# By default `unsafe` should not be there
f = theano.function([x], a, mode=mode_with_gpu)
f = theano.function([x], a, mode=mode_with_gpu.excluding('unsafe'))
topo = f.maker.fgraph.toposort()
a_op = [n for n in topo if isinstance(n.op, theano.tensor.opt.Assert)]
assert len(a_op) == 1
......
......@@ -16,6 +16,7 @@ from theano.tensor.nnet.blocksparse import (
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.opt import register_specialize_device
from theano.tensor import TensorType
......@@ -199,12 +200,10 @@ def local_conv2d_gradweight_cpu(node):
# Determine gradient on kernels
assert len(op_imshp) == 4 and len(op_kshp) == 4
outshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], node.op.subsample,
node.op.border_mode)
fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], (1, 1),
node.op.border_mode)
outshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, node.op.subsample)[2:]
fulloutshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, (1, 1))[2:]
newimg = img.dimshuffle((1, 0, 2, 3))
newtopgrad = topgrad.dimshuffle((1, 0, 2, 3))
......@@ -307,12 +306,11 @@ def local_conv2d_gradinputs_cpu(node):
filters = kern.dimshuffle((1, 0, 2, 3))
filters = filters[:, :, ::-1, ::-1]
outshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], node.op.subsample,
node.op.border_mode)
fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], (1, 1),
node.op.border_mode)
outshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, node.op.subsample)[2:]
fulloutshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, (1, 1))[2:]
nkern = op_imshp[1]
imshp = (op_kshp[0], outshp[0], outshp[1])
imshp_logical = (op_kshp[0], fulloutshp[0], fulloutshp[1])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论