提交 c032ac66 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Attempt at fixing convolution. It does not crash but gives wrong results.

上级 4f1c2697
...@@ -134,7 +134,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -134,7 +134,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const int out_size_byte = out_size*sizeof(float); const int out_size_byte = out_size*sizeof(float);
if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){ if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for" PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received had %llud columns!", " %d kernel columns, but the kernel we received had %llu columns!",
THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]); THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]);
return -1; return -1;
} }
...@@ -217,13 +217,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -217,13 +217,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i, nb_split=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1], nb_split);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s)," "INFO: impl 'conv_patch' failed (%s),"
...@@ -307,21 +300,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -307,21 +300,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR) if (err == GA_NO_ERROR)
{ {
if (verbose>1)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i" "INFO: used 'conv_patch_stack' version with nb_split=%i"
...@@ -334,21 +312,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -334,21 +312,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s)," "INFO: impl 'conv_patch_stack' failed (%s),"
...@@ -394,12 +357,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -394,12 +357,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s)," "INFO: impl 'conv_rows' failed (%s),"
...@@ -428,19 +385,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -428,19 +385,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float); size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
if (0)
fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d,
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], n_blocks[2]);
GpuKernel *k = NULL; GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) { if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n");
k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0; k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
} else { } else {
//fprintf(stderr, "using true version\n");
k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0; k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
} }
...@@ -460,23 +408,11 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -460,23 +408,11 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR) if (err == GA_NO_ERROR)
{ {
work_complete = true; work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n"); fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s)," "INFO: impl 'conv_rows_stack' failed (%s),"
...@@ -543,12 +479,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -543,12 +479,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR) if (err == GA_NO_ERROR)
{ {
work_complete = true; work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with" "INFO: used 'conv_rows_stack2' version %s with"
...@@ -558,12 +488,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -558,12 +488,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i version=%d\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],(version==9?2:3));
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s)," "INFO: impl 'conv_rows_stack2' failed (%s),"
...@@ -680,13 +604,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -680,13 +604,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR) if (err == GA_NO_ERROR)
{ {
if (verbose>1)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i, "
"n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i,"
" nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version" "INFO: used 'conv_patch_stack_reduce' version"
...@@ -697,14 +614,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -697,14 +614,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i,"
" nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s)," "INFO: impl 'conv_patch_stack_reduce' failed (%s),"
...@@ -714,7 +623,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -714,7 +623,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} // else no good nb_splits was found } // else no good nb_splits was found
} }
if (1 && (version==6||version==-1) && if ((version==6||version==-1) &&
kern_len<=320 && kern_len<=320 &&
!work_complete) //conv_valid_row_reduce !work_complete) //conv_valid_row_reduce
{ {
...@@ -782,12 +691,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -782,12 +691,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0],
n_reduce_buf, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s)," "INFO: impl 'conv_valid_row_reduce' failed (%s),"
...@@ -805,43 +708,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -805,43 +708,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
(size_t)256), (size_t)256),
(size_t)1, (size_t)1}; (size_t)1, (size_t)1};
if (1) if (verbose)
{ fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose)
fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose>1)
fprintf(stderr, " img : %i %llu %i %i %p "
"%lld %lld %lld %lld\n",
nbatch, (unsigned long long)stack_len, img_len, img_wid,
(void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p "
"%lld %lld %lld %lld\n",
nkern, nstack, kern_len, kern_wid,
(void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
if (verbose>1)
fprintf(stderr, " out : %llu %llu %i %i %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
out_len, out_wid,
(void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks[0], threads_per_block[0]);
}
void *kernel_params[] = { void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len, (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
...@@ -1113,15 +981,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1113,15 +981,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
if (err == GA_NO_ERROR) if (err == GA_NO_ERROR)
{ {
if (verbose>1)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_full_patch_stack_padded'" "INFO: used 'conv_full_patch_stack_padded'"
...@@ -1131,15 +990,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1131,15 +990,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_full_patch_stack_padded' %s %s" "INFO: impl 'conv_full_patch_stack_padded' %s %s"
...@@ -1179,12 +1029,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1179,12 +1029,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s)," "INFO: impl 'conv_full_patch' failed (%s),"
...@@ -1225,12 +1069,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1225,12 +1069,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'" fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n", " failed (%s), trying next implementation\n",
...@@ -1276,12 +1114,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1276,12 +1114,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n", fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
GpuKernel_error(k, err)); GpuKernel_error(k, err));
...@@ -1298,55 +1130,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1298,55 +1130,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
(size_t)256), (size_t)256),
(size_t)1, (size_t)1}; (size_t)1, (size_t)1};
if (0)
{
if (verbose)
fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose)
fprintf(stderr, " img : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)nbatch,
(unsigned long long)stack_len,
(unsigned long long)img_len,
(unsigned long long)img_wid,
(void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
if (verbose)
fprintf(stderr, " kern: %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)nkern,
(unsigned long long)nstack,
(unsigned long long)kern_len,
(unsigned long long)kern_wid,
(void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
if (verbose)
fprintf(stderr, " out : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)out_len,
(unsigned long long)out_wid,
(void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
if (verbose)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks[0], threads_per_block[0]);
if (verbose)
fprintf(stderr, " subsample params: %llu %llu\n",
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
}
void *kernel_params[] = { void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len, (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid, (void *)&img_len, (void *)&img_wid,
...@@ -1377,11 +1160,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1377,11 +1160,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr, "threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], 1, n_blocks[0], 1, 0, threads_per_block[0]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s)," fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
...@@ -1465,7 +1243,7 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern, ...@@ -1465,7 +1243,7 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
rval = pygpu_zeros(4, out_dim, rval = pygpu_zeros(4, out_dim,
img->ga.typecode, GA_C_ORDER, img->ga.typecode, GA_C_ORDER,
img->ctx, Py_None); img->context, Py_None);
//rval might be null //rval might be null
} }
if ((rval==NULL) if ((rval==NULL)
...@@ -1488,14 +1266,3 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern, ...@@ -1488,14 +1266,3 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
} }
return (PyObject*)rval; return (PyObject*)rval;
} }
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
...@@ -164,7 +164,7 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -164,7 +164,7 @@ class GpuConv(GpuKernelBase, gof.Op):
node_ = copy.copy(node) node_ = copy.copy(node)
assert node.op is node_.op assert node.op is node_.op
if node_.op.max_threads_dim0 is None: if node_.op.max_threads_dim0 is None:
node_.op.max_threads_dim0 = node._inputs[0].type.context.maxlsize node_.op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
return super(GpuConv, node_.op).make_thunk(node_, storage_map, return super(GpuConv, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling) compute_map, no_recycling)
...@@ -179,7 +179,7 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -179,7 +179,7 @@ class GpuConv(GpuKernelBase, gof.Op):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 22) return (0, 23)
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
if node.inputs[0].type.context.kind != "cuda": if node.inputs[0].type.context.kind != "cuda":
...@@ -251,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -251,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
""" % locals() """ % locals()
code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read() code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in ["conv_kernel.cu", "conv_full_kernel.cu"]]) for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
kname = "conv_full_load_everything"
gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags) gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
bin = gk._binary bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin) bcode = ','.join(hex(ord(c)) for c in bin)
...@@ -262,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -262,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
static const char conv_bcode[] = {%(bcode)s}; static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s"; static const char *conv_code = "%(code)s";
""" % locals() """ % locals()
for k in kernels: return mod
mod += "static GpuKernel " + k.name + '_' + name + ";\n"
mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read() def c_support_code_struct(self, node, name):
mod = GpuKernelBase.c_support_code_struct(self, node, name)
with open(os.path.join(os.path.split(__file__)[0], "conv.cu")) as f:
mod += f.read()
return mod return mod
@utils.memoize @utils.memoize
......
...@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64 //Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit. //bit, it is 32 bit.
const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){ __device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64) if (nb_thread < 64)
...@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
if (thread_id < nb_thread) if (thread_id < nb_thread)
{ {
const float * my_src_ptr = (const float *)( const float * my_src_ptr = (const float *)(
((uintptr_t)src) & COALESCED_ALIGN); ((size_t)src) & COALESCED_ALIGN);
my_src_ptr += thread_id; my_src_ptr += thread_id;
while (my_src_ptr < src + N) while (my_src_ptr < src + N)
{ {
......
...@@ -837,8 +837,7 @@ def local_gpu_conv(node, context_name): ...@@ -837,8 +837,7 @@ def local_gpu_conv(node, context_name):
return return
out = gpu_conv(GpuFromHost(context_name)(img), out = gpu_conv(GpuFromHost(context_name)(img),
GpuFromHost(context_name)(kern)) GpuFromHost(context_name)(kern))
# op_lifter want the output on the GPU. assert isinstance(out.type, GpuArrayType)
out = GpuFromHost(context_name)(out)
out.values_eq_approx = values_eq_approx out.values_eq_approx = values_eq_approx
return [out] return [out]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论