Commit 5fc89c03 authored by Frederic

New GpuConv compiles, but gives the wrong version in some cases!

Parent baf12f54
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
//TODO detect SHARED_SIZE dynamically
#define SHARED_SIZE (16*1024)
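// Note (assumption, not from this commit): 16*1024 matches the per-block
// shared memory of compute capability 1.x GPUs; newer devices have more,
// which is why the TODO above suggests querying it at runtime
// (e.g. via cudaDeviceProp.sharedMemPerBlock).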
enum { ConvMode_FULL, ConvMode_VALID };

PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern, PyGpuArrayObject * out, const int mode,
                           const size_t subsample_rows, const size_t subsample_cols, const int version, const int verbose);
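// Usage sketch (illustrative only, not part of this commit): a VALID-mode
// convolution with no subsampling, letting version=-1 pick a kernel
// heuristically; passing out=NULL lets the function allocate the output:
//   PyObject *res = PyGpuArray_Conv(img, kern, NULL, ConvMode_VALID,
//                                   1, 1, /*version=*/-1, /*verbose=*/0);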
template <typename T>
static T ceil_intdiv(T a, T b)
{
    return (a/b) + ((a % b) ? 1: 0);
}
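// e.g. ceil_intdiv(5, 2) == 3 whereas plain integer division gives 5/2 == 2;
// it is used below so subsampled output dimensions round up instead of
// truncating.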
/*
 * version: -1, autodetect, >=0 a specific version to use.
 *          If it can't be executed, we revert to the reference implementation
 */
int
PyGpuArray_conv_valid(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
                      PyGpuArrayObject * out, size_t subsample_rows, size_t subsample_cols,
                      int version = -1, int verbose=0,
                      int max_threads_dim0 = 512
                      )
{
    int work_complete = 0;
    const int shared_avail = SHARED_SIZE-150;//150 is a bit more than the biggest static shared size (144) used when compiling this file.
    if (PyGpuArray_NDIM(img) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required img of 4D");
        return -1;
    }
    if (PyGpuArray_NDIM(kern) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required kern of 4D");
        return -1;
    }
    if (PyGpuArray_NDIM(out) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required out of 4D");
        return -1;
@@ -40,40 +50,40 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
                version, THEANO_KERN_WID);
        fprintf(stderr,
                "INFO: img dim: %zu %zu %zu %zu img stride: %zd %zd %zd %zd\n",
                PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(img)[1],
                PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3],
                PyGpuArray_STRIDES(img)[0]/4,
                PyGpuArray_STRIDES(img)[1]/4,
                PyGpuArray_STRIDES(img)[2]/4,
                PyGpuArray_STRIDES(img)[3]/4);
        fprintf(stderr,
                "INFO: kern dim: %zu %zu %zu %zu kern stride: %zd %zd %zd %zd\n",
                PyGpuArray_DIMS(kern)[0], PyGpuArray_DIMS(kern)[1],
                PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3],
                PyGpuArray_STRIDES(kern)[0]/4,
                PyGpuArray_STRIDES(kern)[1]/4,
                PyGpuArray_STRIDES(kern)[2]/4,
                PyGpuArray_STRIDES(kern)[3]/4);
        fprintf(stderr,
                "INFO: out dim: %zu %zu %zu %zu out stride: %zd %zd %zd %zd\n",
                PyGpuArray_DIMS(out)[0], PyGpuArray_DIMS(out)[1],
                PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3],
                PyGpuArray_STRIDES(out)[0]/4,
                PyGpuArray_STRIDES(out)[1]/4,
                PyGpuArray_STRIDES(out)[2]/4,
                PyGpuArray_STRIDES(out)[3]/4);
        fprintf(stderr,
                "INFO: subsample_rows=%zu, subsample_cols=%zu\n",
                subsample_rows, subsample_cols);
    }
    //Check the output size is valid
    assert (PyGpuArray_DIMS(out)[2] == ceil_intdiv(PyGpuArray_DIMS(img)[2] - PyGpuArray_DIMS(kern)[2] + 1, subsample_rows));
    assert (PyGpuArray_DIMS(out)[3] == ceil_intdiv(PyGpuArray_DIMS(img)[3] - PyGpuArray_DIMS(kern)[3] + 1, subsample_cols));
    assert (PyGpuArray_DIMS(out)[0] == PyGpuArray_DIMS(img)[0]);
    assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
    assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);

    // we now search through a few implementations until one applies to our arguments.
@@ -82,24 +92,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    //TODO: make the number of divisions a parameter
    //TODO: Should we make them in separate grid block instead?
    const int nstack = PyGpuArray_DIMS(kern)[1];
    const int nbatch = PyGpuArray_DIMS(img)[0];
    const int nkern = PyGpuArray_DIMS(kern)[0];
    const int img_wid = PyGpuArray_DIMS(img)[3];
    const int img_len = PyGpuArray_DIMS(img)[2];
    const int kern_wid = PyGpuArray_DIMS(kern)[3];
    const int kern_len = PyGpuArray_DIMS(kern)[2];
    const int out_wid = PyGpuArray_DIMS(out)[3];
    const int out_len = PyGpuArray_DIMS(out)[2];
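    // Note (our reading of the change): unlike CudaNdarray_HOST_STRIDES,
    // PyGpuArray_STRIDES returns strides in *bytes*; dividing by 4 converts
    // them to float32 element strides, which is what the kernels expect.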
    const int img_stride_col = PyGpuArray_STRIDES(img)[3]/4;
    const int img_stride_row = PyGpuArray_STRIDES(img)[2]/4;
    const int img_stride_stack = PyGpuArray_STRIDES(img)[1]/4;
    const int img_stride_batch = PyGpuArray_STRIDES(img)[0]/4;
    const int kern_stride_col = PyGpuArray_STRIDES(kern)[3]/4;
    const int kern_stride_row = PyGpuArray_STRIDES(kern)[2]/4;
    const int kern_stride_stack = PyGpuArray_STRIDES(kern)[1]/4;
    const int kern_stride_nkern = PyGpuArray_STRIDES(kern)[0]/4;

    const int img_size = img_len*img_wid;
    const int kern_size = kern_len*kern_wid;
@@ -107,17 +117,17 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    const int img_size_byte = img_size*sizeof(float);
    const int kern_size_byte = kern_size*sizeof(float);
    const int out_size_byte = out_size*sizeof(float);
    if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
        PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
                     " %d kernel columns, but the kernel we received had %zu columns!",
                     THEANO_KERN_WID, PyGpuArray_DIMS(kern)[3]);
        return -1;
    }
    bool subsample = subsample_rows!=1 || subsample_cols!=1;

    bool img_contiguous = img->ga.flags & GA_C_CONTIGUOUS;
    bool kern_contiguous = kern->ga.flags & GA_C_CONTIGUOUS;
    bool out_contiguous = out->ga.flags & GA_C_CONTIGUOUS;
    bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;

    bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
@@ -130,7 +140,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    //we don't need to unflip it, but have the new value when we unflip it.
    bool kern_flipped=true;
    bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
    const float * kern_data_unflipped = cuda_get_ptr(kern);
    int kern_stride_col_unflipped=kern_stride_col;
    int kern_stride_row_unflipped=kern_stride_row;
    if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
@@ -139,7 +149,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        kern_stride_row_unflipped=kern_wid;
        kern_flipped=false;
        kern_contiguous_2d_unflipped = true;
        kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
    }

    //if we remove the restriction
@@ -173,7 +183,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        dim3 grid(nbatch, nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);
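        // Sizing logic (our reading, not stated in the commit): this variant
        // preloads one whole image patch plus one whole kernel into dynamic
        // shared memory, hence (img_size + kern_size) floats per block.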
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int);
@@ -184,9 +194,9 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        CONV_PATCH_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid, nkern, nstack);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -234,7 +244,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
@@ -277,14 +287,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid,
             out_len, out_wid, nkern, nstack,
             img_stride_col, img_stride_row, img_stride_stack,
             img_stride_batch, kern_stride_col, kern_stride_row,
             kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -346,7 +355,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        dim3 threads(out_wid);
        dim3 grid(out_len, nbatch*nkern);
        int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
@@ -358,14 +367,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        CONV_ROWS_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size >>>
            (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid, nkern, nstack,
             img_stride_col, img_stride_row,
             img_stride_stack, img_stride_batch,
             kern_stride_col, kern_stride_row,
             kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -408,7 +416,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
@@ -430,16 +438,15 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        }
        f<<< grid, threads, shared_size >>>
            (cuda_get_ptr(img),
             cuda_get_ptr(kern),
             cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid, nkern, nstack,
             img_stride_col, img_stride_row,
             img_stride_stack, img_stride_batch,
             kern_stride_col, kern_stride_row,
             kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -503,7 +510,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
@@ -518,16 +525,15 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size >>>
            (cuda_get_ptr(img),
             cuda_get_ptr(kern),
             cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid, nkern, nstack,
             img_stride_col, img_stride_row,
             img_stride_stack, img_stride_batch,
             kern_stride_col, kern_stride_row,
             kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -626,7 +632,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        dim3 threads(out_wid, out_len, thread_z);
        dim3 grid(nbatch,nkern);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int,
@@ -657,13 +663,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
        CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
                                           img_len, img_wid, kern_len, kern_wid,
                                           nkern, nstack,
                                           img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
                                           kern_stride_col_unflipped, kern_stride_row_unflipped,
                                           kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -705,8 +711,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        kern_len<=320 &&
        !work_complete) //conv_valid_row_reduce
    {
        int outsize = PyGpuArray_SIZE(out);
        int n_blocks = std::min(outsize, 4096);
        int block_nstack=nstack;
        //Max of 512 threads per blocks.
@@ -736,8 +742,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        void (*f)(int, int, int, int,
                  int, int, int, int, int, int,
                  const float*, int, int, int, int,
                  const float*, int, int, int, int,
                  float*, int, int, int, int,
                  int, int, int);
@@ -749,23 +755,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        else
            f=conv_valid_row_reduce<true>;
        f<<<n_blocks, n_threads, n_reduce_buf>>>(
            nbatch, nkern, PyGpuArray_DIMS(img)[1],
            img_len, img_wid,
            kern_len, kern_wid,
            out_len, out_wid,
            cuda_get_ptr(img),
            PyGpuArray_STRIDES(img)[0]/4, PyGpuArray_STRIDES(img)[1]/4,
            img_stride_row, img_stride_col,
            cuda_get_ptr(kern),
            PyGpuArray_STRIDES(kern)[0]/4, PyGpuArray_STRIDES(kern)[1]/4,
            PyGpuArray_STRIDES(kern)[2]/4, PyGpuArray_STRIDES(kern)[3]/4,
            cuda_get_ptr(out),
            PyGpuArray_STRIDES(out)[0]/4, PyGpuArray_STRIDES(out)[1]/4,
            PyGpuArray_STRIDES(out)[2]/4, PyGpuArray_STRIDES(out)[3]/4,
            subsample_rows, subsample_cols, initial_reduce_boundary);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -791,65 +795,64 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    if (1 && !work_complete) //conv_reference_valid
    {
        int outsize = PyGpuArray_SIZE(out);
        int n_blocks = std::min(outsize, 4096);
        int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
                                 256);
        if (1)
        {
            if (verbose)
                fprintf(stderr, "INFO: launching conv_reference_valid\n");
            if (verbose>1)
fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid, nbatch, PyGpuArray_DIMS(img)[1], img_len, img_wid,
img->devdata, cuda_get_ptr(img),
CudaNdarray_HOST_STRIDES(img)[0], PyGpuArray_STRIDES(img)[0]/4,
CudaNdarray_HOST_STRIDES(img)[1], PyGpuArray_STRIDES(img)[1]/4,
CudaNdarray_HOST_STRIDES(img)[2], PyGpuArray_STRIDES(img)[2]/4,
CudaNdarray_HOST_STRIDES(img)[3]); PyGpuArray_STRIDES(img)[3]/4);
if (verbose>1) if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
nkern, nstack, kern_len, kern_wid, nkern, nstack, kern_len, kern_wid,
kern->devdata, cuda_get_ptr(kern),
CudaNdarray_HOST_STRIDES(kern)[0], PyGpuArray_STRIDES(kern)[0]/4,
CudaNdarray_HOST_STRIDES(kern)[1], PyGpuArray_STRIDES(kern)[1]/4,
CudaNdarray_HOST_STRIDES(kern)[2], PyGpuArray_STRIDES(kern)[2]/4,
CudaNdarray_HOST_STRIDES(kern)[3]); PyGpuArray_STRIDES(kern)[3]/4);
if (verbose>1) if (verbose>1)
fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], PyGpuArray_DIMS(out)[0],
CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid, PyGpuArray_DIMS(out)[1], out_len, out_wid,
out->devdata, cuda_get_ptr(out),
CudaNdarray_HOST_STRIDES(out)[0], PyGpuArray_STRIDES(out)[0]/4,
CudaNdarray_HOST_STRIDES(out)[1], PyGpuArray_STRIDES(out)[1]/4,
CudaNdarray_HOST_STRIDES(out)[2], PyGpuArray_STRIDES(out)[2]/4,
CudaNdarray_HOST_STRIDES(out)[3]); PyGpuArray_STRIDES(out)[3]/4);
if (verbose>1) if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n", fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads); outsize, n_blocks, n_threads);
} }
        conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
                PyGpuArray_DIMS(img)[1],
                img_len, img_wid,
                kern_len, kern_wid,
                out_len, out_wid,
                cuda_get_ptr(img),
                PyGpuArray_STRIDES(img)[0]/4,
                PyGpuArray_STRIDES(img)[1]/4,
                PyGpuArray_STRIDES(img)[2]/4,
                PyGpuArray_STRIDES(img)[3]/4,
                cuda_get_ptr(kern),
                PyGpuArray_STRIDES(kern)[0]/4,
                PyGpuArray_STRIDES(kern)[1]/4,
                PyGpuArray_STRIDES(kern)[2]/4,
                PyGpuArray_STRIDES(kern)[3]/4,
                cuda_get_ptr(out),
                PyGpuArray_STRIDES(out)[0]/4,
                PyGpuArray_STRIDES(out)[1]/4,
                PyGpuArray_STRIDES(out)[2]/4,
                PyGpuArray_STRIDES(out)[3]/4,
                subsample_rows, subsample_cols);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
@@ -864,7 +867,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
            fprintf(stderr, "INFO: 'conv_reference_valid' failed\n");
            PyErr_Format(PyExc_RuntimeError,
                         "ERROR: all implementations failed for"
                         " PyGpuArray_conv_valid! (%s)",
                         cudaGetErrorString(sts));
            return -1;
        }
@@ -873,7 +876,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    {
        PyErr_Format(PyExc_RuntimeError,
                     "ERROR: no implementation(s) worked for"
                     " PyGpuArray_conv_valid!"
                     " Version asked(%d) (-1 means use a heuristic)",
                     version);
        return -1;
@@ -882,56 +885,56 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    }
int
PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
                     PyGpuArrayObject * out, size_t subsample_rows,
                     size_t subsample_cols, int version = -1, int verbose=0,
                     int max_threads_dim0=512)
{
    //150 is a bit more than the biggest static shared size (144) used when compiling this file.
    const int shared_avail = SHARED_SIZE - 150;
    int work_complete = 0;
    if (PyGpuArray_NDIM(img) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required img of 4D");
        return -1;
    }
    if (PyGpuArray_NDIM(kern) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required kern of 4D");
        return -1;
    }
    if (PyGpuArray_NDIM(out) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required out of 4D");
        return -1;
    }

    // check the size of the output matrix
    assert (PyGpuArray_DIMS(out)[2] == ceil_intdiv(PyGpuArray_DIMS(img)[2] + PyGpuArray_DIMS(kern)[2] - 1, subsample_rows));
    assert (PyGpuArray_DIMS(out)[3] == ceil_intdiv(PyGpuArray_DIMS(img)[3] + PyGpuArray_DIMS(kern)[3] - 1, subsample_cols));
    assert (PyGpuArray_DIMS(out)[0] == PyGpuArray_DIMS(img)[0]);
    assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
    assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);
    const int nstack = PyGpuArray_DIMS(kern)[1];
    const int nbatch = PyGpuArray_DIMS(img)[0];
    const int nkern = PyGpuArray_DIMS(kern)[0];
    const int img_wid = PyGpuArray_DIMS(img)[3];
    const int img_len = PyGpuArray_DIMS(img)[2];
    const int kern_wid = PyGpuArray_DIMS(kern)[3];
    const int kern_len = PyGpuArray_DIMS(kern)[2];
    const int out_wid = PyGpuArray_DIMS(out)[3];
    const int out_len = PyGpuArray_DIMS(out)[2];

    const int img_stride_col = PyGpuArray_STRIDES(img)[3]/4;
    const int img_stride_row = PyGpuArray_STRIDES(img)[2]/4;
    const int img_stride_stack = PyGpuArray_STRIDES(img)[1]/4;
    const int img_stride_batch = PyGpuArray_STRIDES(img)[0]/4;
    const int kern_stride_col = PyGpuArray_STRIDES(kern)[3]/4;
    const int kern_stride_row = PyGpuArray_STRIDES(kern)[2]/4;
    const int kern_stride_stack = PyGpuArray_STRIDES(kern)[1]/4;
    const int kern_stride_nkern = PyGpuArray_STRIDES(kern)[0]/4;

    const int img_size = img_len*img_wid;
    const int kern_size = kern_len*kern_wid;
@@ -946,20 +949,20 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
    //const int out_size_byte = out_size*sizeof(float); // unused
    if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) ||
          (THEANO_KERN_WID == 0))){
        PyErr_Format(PyExc_ValueError,
                     "ERROR: This GpuConv code was compiled for"
                     " %d kernel columns, but the kernel we received"
                     " had %zu columns!",
                     THEANO_KERN_WID, PyGpuArray_DIMS(kern)[3]);
        return -1;
    }
    bool subsample = subsample_rows!=1 || subsample_cols!=1;

    bool img_contiguous = img->ga.flags & GA_C_CONTIGUOUS;
    bool kern_contiguous = kern->ga.flags & GA_C_CONTIGUOUS;
    bool out_contiguous = out->ga.flags & GA_C_CONTIGUOUS;
    bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;

    bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
@@ -974,7 +977,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
    //we don't need to unflip it, but have the new value when we unflip it.
    bool kern_flipped=true;
    bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
    const float * kern_data_unflipped = cuda_get_ptr(kern);
    int kern_stride_col_unflipped=kern_stride_col;
    int kern_stride_row_unflipped=kern_stride_row;
    if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
@@ -983,7 +986,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        kern_stride_row_unflipped=kern_wid;
        kern_flipped=false;
        kern_contiguous_2d_unflipped = true;
        kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
    }

    if (verbose>1)
@@ -991,26 +994,26 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        printf("INFO: Running conv_full version=%d,"
               " MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
        printf("INFO: img dim: %zu %zu %zu %zu img stride: %zd %zd %zd %zd\n",
               PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(img)[1],
               PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3],
               PyGpuArray_STRIDES(img)[0]/4,
               PyGpuArray_STRIDES(img)[1]/4,
               PyGpuArray_STRIDES(img)[2]/4,
               PyGpuArray_STRIDES(img)[3]/4);
        printf("INFO: kern dim: %zu %zu %zu %zu kern stride: %zd %zd %zd %zd\n",
               PyGpuArray_DIMS(kern)[0], PyGpuArray_DIMS(kern)[1],
               PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3],
               PyGpuArray_STRIDES(kern)[0]/4,
               PyGpuArray_STRIDES(kern)[1]/4,
               PyGpuArray_STRIDES(kern)[2]/4,
               PyGpuArray_STRIDES(kern)[3]/4);
        printf("INFO: out dim: %zu %zu %zu %zu out stride: %zd %zd %zd %zd\n",
               PyGpuArray_DIMS(out)[0], PyGpuArray_DIMS(out)[1],
               PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3],
               PyGpuArray_STRIDES(out)[0]/4,
               PyGpuArray_STRIDES(out)[1]/4,
               PyGpuArray_STRIDES(out)[2]/4,
               PyGpuArray_STRIDES(out)[3]/4);
    }
    if (!subsample &&
@@ -1063,7 +1066,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        int shared_size=img_size_padded_byte + kern_size_byte;
        if(version==5)
            shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
@@ -1087,13 +1090,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
             img_len, img_wid, kern_len, kern_wid, nkern, nstack,
             img_stride_col, img_stride_row, img_stride_stack,
             img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
             kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -1147,14 +1149,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        //TODO assert c_contiguous for img, kern and out in the 2 inner dimensions.
        conv_full_patch<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img),
             cuda_get_ptr(kern),
             cuda_get_ptr(out),
             img_len, img_wid,
             kern_len, kern_wid,
             nkern, nstack);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -1189,30 +1190,29 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        //TODO assert c_contiguous for img, kern and out in the 2 inner dimensions.
        //typeof(conv_full_load_everything<0>) f = ;
        void (*f)(const float*, const float*, float*,
                  int, int, int, int, int, int,
                  int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
        f = conv_full_load_everything<THEANO_KERN_WID>;
        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img),
             cuda_get_ptr(kern),
             cuda_get_ptr(out),
             img_len, img_wid,
             kern_len, kern_wid,
             nkern, nstack,
             PyGpuArray_STRIDES(img)[3]/4,
             PyGpuArray_STRIDES(img)[2]/4,
             PyGpuArray_STRIDES(img)[1]/4,
             PyGpuArray_STRIDES(img)[0]/4,
             PyGpuArray_STRIDES(kern)[3]/4,
             PyGpuArray_STRIDES(kern)[2]/4,
             PyGpuArray_STRIDES(kern)[1]/4,
             PyGpuArray_STRIDES(kern)[0]/4
             );
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -1246,7 +1246,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);
        void (*f)(const float*, const float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int);
@@ -1257,15 +1257,15 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>;
        f<<< grid, threads, shared_size>>>(
            cuda_get_ptr(img),
            cuda_get_ptr(kern),
            cuda_get_ptr(out),
            img_len, img_wid,
            kern_len, kern_wid,
            nkern, nstack, img_stride_col, img_stride_row,
            kern_stride_col, kern_stride_row,
            kern_stride_stack, kern_stride_nkern);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
@@ -1290,48 +1290,48 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
    {
        if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");

        int outsize = PyGpuArray_SIZE(out);
        int n_blocks = std::min(outsize, 4096);
        int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
                                 256);
        if (0)
        {
            if (verbose)
                fprintf(stderr, "INFO: launching conv_reference_full\n");
            if (verbose)
                fprintf(stderr, " img : %zu %zu %zu %zu %p %zd %zd %zd %zd\n",
                        PyGpuArray_DIMS(img)[0],
                        PyGpuArray_DIMS(img)[1],
                        PyGpuArray_DIMS(img)[2],
                        PyGpuArray_DIMS(img)[3],
                        cuda_get_ptr(img),
                        PyGpuArray_STRIDES(img)[0]/4,
                        PyGpuArray_STRIDES(img)[1]/4,
                        PyGpuArray_STRIDES(img)[2]/4,
                        PyGpuArray_STRIDES(img)[3]/4);
            if (verbose)
                fprintf(stderr, " kern: %zu %zu %zu %zu %p %zd %zd %zd %zd\n",
                        PyGpuArray_DIMS(kern)[0],
                        PyGpuArray_DIMS(kern)[1],
                        PyGpuArray_DIMS(kern)[2],
                        PyGpuArray_DIMS(kern)[3],
                        cuda_get_ptr(kern),
                        PyGpuArray_STRIDES(kern)[0]/4,
                        PyGpuArray_STRIDES(kern)[1]/4,
                        PyGpuArray_STRIDES(kern)[2]/4,
                        PyGpuArray_STRIDES(kern)[3]/4
                        );
            if (verbose)
                fprintf(stderr, " out : %zu %zu %zu %zu %p %zd %zd %zd %zd\n",
                        PyGpuArray_DIMS(out)[0],
                        PyGpuArray_DIMS(out)[1],
                        PyGpuArray_DIMS(out)[2],
                        PyGpuArray_DIMS(out)[3],
                        cuda_get_ptr(out),
                        PyGpuArray_STRIDES(out)[0]/4,
                        PyGpuArray_STRIDES(out)[1]/4,
                        PyGpuArray_STRIDES(out)[2]/4,
                        PyGpuArray_STRIDES(out)[3]/4);
            if (verbose)
                fprintf(stderr, " launch params: %i %i %i\n",
                        outsize, n_blocks, n_threads);
@@ -1340,25 +1340,24 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
                subsample_rows, subsample_cols);
        }
        conv_reference_full<<<n_blocks, n_threads>>>(
            PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(kern)[0],
            PyGpuArray_DIMS(img)[1],
            PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3],
            PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3],
            PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3],
            cuda_get_ptr(img), PyGpuArray_STRIDES(img)[0]/4,
            PyGpuArray_STRIDES(img)[1]/4,
            PyGpuArray_STRIDES(img)[2]/4,
            PyGpuArray_STRIDES(img)[3]/4,
            cuda_get_ptr(kern), PyGpuArray_STRIDES(kern)[0]/4,
            PyGpuArray_STRIDES(kern)[1]/4,
            PyGpuArray_STRIDES(kern)[2]/4,
            PyGpuArray_STRIDES(kern)[3]/4,
            cuda_get_ptr(out), PyGpuArray_STRIDES(out)[0]/4,
            PyGpuArray_STRIDES(out)[1]/4,
            PyGpuArray_STRIDES(out)[2]/4,
            PyGpuArray_STRIDES(out)[3]/4,
            subsample_rows, subsample_cols);
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
@@ -1392,9 +1391,9 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
}

PyObject *
PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
                PyGpuArrayObject * out, const int mode,
                const size_t subsample_rows, const size_t subsample_cols,
                const int version, const int verbose,
                const int max_threads_dim0 = 512
                )
@@ -1402,43 +1401,43 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
    // Re-use the out object if possible. If the out object is not used, then its refcount is not modified.
    // If the out object is re-used then it is returned, and its refcount is incremented by 1.
    //
    if (PyGpuArray_NDIM(img) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "PyGpuArray 4-D tensor required");
        return NULL;
    }
    if (PyGpuArray_NDIM(kern) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "PyGpuArray 4-D tensor required");
        return NULL;
    }

    size_t out_dim[4];
    out_dim[0] = PyGpuArray_DIMS(img)[0];
    out_dim[1] = PyGpuArray_DIMS(kern)[0];
    size_t logical_rows, logical_cols;
    if (mode == ConvMode_VALID)
    {
        logical_rows = PyGpuArray_DIMS(img)[2] - PyGpuArray_DIMS(kern)[2] + 1;
        logical_cols = PyGpuArray_DIMS(img)[3] - PyGpuArray_DIMS(kern)[3] + 1;
    }
    else
    {
        logical_rows = PyGpuArray_DIMS(img)[2] + PyGpuArray_DIMS(kern)[2] - 1;
        logical_cols = PyGpuArray_DIMS(img)[3] + PyGpuArray_DIMS(kern)[3] - 1;
    }
    out_dim[2] = ceil_intdiv(logical_rows, subsample_rows);
    out_dim[3] = ceil_intdiv(logical_cols, subsample_cols);
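    // Worked example (illustrative): VALID mode with a 32x32 image, a 5x5
    // kernel and subsample 2x2 gives logical_rows = 32-5+1 = 28 and
    // out_dim[2] = ceil_intdiv(28, 2) = 14.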
    PyGpuArrayObject * rval = NULL;

    if ( out
        && PyGpuArray_NDIM(out)==4
        && out->ga.flags & GA_C_CONTIGUOUS
        && PyGpuArray_DIMS(out)[0]==out_dim[0]
        && PyGpuArray_DIMS(out)[1]==out_dim[1]
        && PyGpuArray_DIMS(out)[2]==out_dim[2]
        && PyGpuArray_DIMS(out)[3]==out_dim[3])
    {
        rval = out;
        Py_INCREF(rval);
@@ -1458,20 +1457,22 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
                    "INFO: Conv doesn't have an 'out' argument"
                    " structure.\n");
        rval = pygpu_zeros(4, out_dim,
                           img->ga.typecode, GA_C_ORDER,
                           pygpu_default_context(), Py_None);
        //rval might be null
    }
    if ((rval==NULL)
        || ((mode==ConvMode_VALID) && PyGpuArray_conv_valid(img, kern, rval,
                                                            subsample_rows,
                                                            subsample_cols,
                                                            version, verbose,
                                                            max_threads_dim0))
        || ((mode==ConvMode_FULL) && PyGpuArray_conv_full(img, kern, rval,
                                                          subsample_rows,
                                                          subsample_cols,
                                                          version, verbose,
                                                          max_threads_dim0))
        )
    {
        // if rval is something we just allocated,
...
import copy
import os
import theano
from theano import config, gof
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox.gpuarray.type import GpuArrayType


class GpuConv(gof.Op):
@@ -114,6 +119,9 @@ class GpuConv(gof.Op):
                str(self.kshp))

    def make_node(self, img, kern):
        if img.dtype != "float32" or kern.dtype != "float32":
            raise NotImplementedError("GpuConv currently only works"
                                      " with float32 dtype")
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
@@ -121,7 +129,8 @@ class GpuConv(gof.Op):
        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False]
        out = GpuArrayType(img.dtype, broadcastable)()
        return gof.Apply(self, [img, kern], [out])

    def flops(self, inputs, outputs):
        """ Useful with the hack in profilemode to print the MFlops"""
@@ -145,6 +154,8 @@ class GpuConv(gof.Op):
    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        node_ = copy.copy(node)
        assert node.op is node_.op
        if config.gpuarray.sync:
            raise NotImplementedError("GpuConv does not implement the"
                                      " gpuarray.sync Theano flag")
        if node_.op.max_threads_dim0 is None:
            cuda = theano.sandbox.cuda
            device_id = cuda.use.device_number
@@ -169,20 +180,30 @@ class GpuConv(gof.Op):
        return ['-DTHEANO_KERN_WID=' + str(nb)]  # ,'-g','-G']

    def c_headers(self):
        return ['<stdio.h>', 'cuda.h',
                '<compyte/extension.h>', '<compyte/numpy_compat.h>']

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (0, 20)
    def c_init_code(self):
        return ['cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
        codes = ["CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);",
                 "float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}",
                 "const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}"]
codes += [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files]
return reduce(str.__add__, codes) return reduce(str.__add__, codes)
def c_compiler(self):
return NVCC_compiler
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp img, kern = inp
out, = out_ out, = out_
...@@ -226,7 +247,8 @@ class GpuConv(gof.Op): ...@@ -226,7 +247,8 @@ class GpuConv(gof.Op):
} }
// TODO, make out be decref before we alloc out2! // TODO, make out be decref before we alloc out2!
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, PyGpuArrayObject * out2 = (PyGpuArrayObject *)PyGpuArray_Conv(
%(img)s, %(kern)s,
%(out)s, mode, %(out)s, mode,
dx, dy, dx, dy,
version, verbose, version, verbose,
......
@@ -4,7 +4,8 @@
 //grid block size=batch_id
 //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
 __global__ void
-conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
+conv_full_patch_split(const float* img, const float* kern, float* out,
+                      int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
 {
     int __shared__ out_len, out_wid, nb_thread_id;
     out_len = img_len + kern_len - 1;
@@ -60,7 +61,7 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
 //grid block size=batch_id, nkern
 //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
 __global__ void
-conv_full_patch( float* img, float* kern, float* out,
+conv_full_patch( const float* img, const float* kern, float* out,
                  int img_len, int img_wid,
                  int kern_len, int kern_wid, int nkern, int nstack)
 {
@@ -122,7 +123,7 @@ conv_full_patch( float* img, float* kern, float* out,
 template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
 __global__ void
-conv_full_patch_stack( float* img, float* kern, float* out,
+conv_full_patch_stack( const float* img, const float* kern, float* out,
                        int img_len, int img_wid,
                        int kern_len, int kern_wid, int nkern, int nstack,
                        int img_stride_col, int img_stride_row,
@@ -133,7 +134,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
     out_len = img_len + kern_len - 1;
     out_wid = img_wid + kern_wid - 1;
     nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
-    float __shared__ *kern_, *img_;
+    const float __shared__ *kern_, *img_;
     extern __shared__ float s_data[];
     const int batch_id = blockIdx.x;
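One change in this hunk is easy to misread: in `const float __shared__ *kern_, *img_;`, the __shared__ qualifier binds to the pointer variables themselves (they live in shared memory), while const binds to the floats they point at. The same change recurs in conv_patch_stack below. A tiny self-contained illustration (kernel name and contents are hypothetical, not from this commit):

__global__ void qualifier_demo(const float* g_in, float* g_out)
{
    // Pointer object stored in shared memory, pointing at read-only floats.
    const float __shared__ *p;
    if (threadIdx.x == 0)
        p = g_in;            // repointing is fine: the pointer is not const
    __syncthreads();
    // p[0] = 1.0f;          // would not compile: the pointee is const
    g_out[threadIdx.x] = p[threadIdx.x];   // reading through it is fine
}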
@@ -201,7 +202,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
  */
 template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
 __global__ void
-conv_full_patch_stack_padded( float* img, float* kern, float* out,
+conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
                               const int img_len, const int img_wid,
                               const int kern_len, const int kern_wid,
                               const int nkern, const int nstack,
@@ -365,7 +366,7 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
 }
 template<int NSTACK>
 __global__ void
-conv_full_load_everything( float* img, float* kern, float* out,
+conv_full_load_everything( const float* img, const float* kern, float* out,
                            int img_len, int img_wid,
                            int kern_len, int kern_wid, int nkern, int nstack,
                            int img_stride_col, int img_stride_row,
...
@@ -221,7 +221,7 @@ __device__ void store_or_accumulate(float& dst,const float value ){
  */
 template<bool flipped_kern, int KERN_WIDTH, bool split>
 __global__ void
-conv_patch( float* img, float* kern, float* out,
+conv_patch( const float* img, const float* kern, float* out,
             int img_len, int img_wid, int kern_len, int kern_wid,
             int nkern, int nstack)
 {
@@ -304,7 +304,7 @@ conv_patch( float* img, float* kern, float* out,
  */
 template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
 __global__ void
-conv_patch_stack( float* img, float* kern, float* out,
+conv_patch_stack( const float* img, const float* kern, float* out,
                   int img_len, int img_wid, int kern_len, int kern_wid,
                   int out_len, int out_wid,
                   int nkern, int nstack, int img_stride_col,int img_stride_row,
@@ -375,7 +375,7 @@ conv_patch_stack( float* img, float* kern, float* out,
                             out_row*out_wid+out_col],sum);
     }else{
-        float __shared__ *kern_, *img_;
+        const float __shared__ *kern_, *img_;
         int __shared__ out_len_max;
         kern_=kern+kern_stride_nkern*kern_id;//the good nkern
@@ -456,7 +456,7 @@ conv_patch_stack( float* img, float* kern, float* out,
  */
 template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
 __global__ void
-conv_patch_stack_reduce( float* img, float* kern, float* out,
+conv_patch_stack_reduce( const float* img, const float* kern, float* out,
                          int img_len, int img_wid, int kern_len, int kern_wid,
                          int nkern, int nstack, int img_stride_col,int img_stride_row,
                          int img_stride_stack, int img_stride_batch,
@@ -572,7 +572,7 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
  */
 template<int KERN_WIDTH, bool c_contiguous>
 __global__ void
-conv_rows( float* img, float* kern, float* out,
+conv_rows( const float* img, const float* kern, float* out,
            int img_len, int img_wid, int kern_len, int kern_wid,
            int nkern, int nstack,
            int img_stride_col, int img_stride_row,
@@ -633,7 +633,7 @@ conv_rows( float* img, float* kern, float* out,
  */
 template<int KERN_WIDTH, bool c_contiguous>
 __global__ void
-conv_rows_stack( float* img, float* kern, float* out,
+conv_rows_stack( const float* img, const float* kern, float* out,
                  const int img_len, const int img_wid, const int kern_len, const int kern_wid,
                  const int nkern, const int nstack,
                  const int img_stride_col, const int img_stride_row,
@@ -731,7 +731,7 @@ conv_rows_stack( float* img, float* kern, float* out,
  */
 template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
 __global__ void
-conv_rows_stack2( float* img, float* kern, float* out,
+conv_rows_stack2(const float* img, const float* kern, float* out,
                  const int img_len, const int img_wid, const int kern_len, const int kern_wid,
                  const int nkern, const int nstack,
                  const int img_stride_col, const int img_stride_row,
@@ -831,8 +831,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
                       int img_len, int img_wid,
                       int kern_len, int kern_wid,
                       int out_len, int out_wid, //physical
-                      float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
-                      float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
+                      const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
+                      const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
                       float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
                       int subsample_rows, int subsample_cols,
                       const int initial_reduce_boundary)
@@ -859,8 +859,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
             float sum = 0.0f;
             if(stack_loop){
                 for (; ss < stacklen; ss+=blockDim.x){
-                    float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
-                    float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
+                    const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
+                    const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
                     for (int cc = 0; cc < kern_wid; ++cc)
                     {
                         sum += kk_0[0] * ii_0[0];
@@ -869,8 +869,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
                     }
                 }
             }else{
-                float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
-                float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
+                const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
+                const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
                 for (int cc = 0; cc < kern_wid; ++cc)
                 {
                     sum += kk_0[0] * ii_0[0];
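The loop bodies above are truncated by the diff; the stepping of kk_0 and ii_0 between iterations is elided. Given that ii_0 starts at the offset (iC_logical + kern_wid - 1)*img_str_C, a plausible reading is that the kernel pointer walks forward by its column stride while the image pointer walks backward, which is what makes this a true, kernel-flipping convolution. A sketch of the inner product under that assumption (the helper is illustrative, not in the source):

// Illustrative helper: the per-element reduction performed by
// conv_valid_row_reduce's inner loop, assuming forward kernel /
// backward image stepping.
__device__ float strided_conv_dot(const float* kk_0, int kern_str_C,
                                  const float* ii_0, int img_str_C,
                                  int kern_wid)
{
    float sum = 0.0f;
    for (int cc = 0; cc < kern_wid; ++cc)
    {
        sum += kk_0[0] * ii_0[0];
        kk_0 += kern_str_C;   // next kernel column
        ii_0 -= img_str_C;    // previous image column (flipped kernel)
    }
    return sum;
}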
...@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen, ...@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C , float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols) int subsample_rows, int subsample_cols)
{ {
...@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen, ...@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions int out_len, int out_wid, //physical dimensions
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols) int subsample_rows, int subsample_cols)
{ {
......
@@ -25,6 +25,7 @@ from theano.tests.unittest_tools import seed_rng
 from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
                                                           mode_without_gpu)
 from theano.sandbox.gpuarray.type import GpuArrayType
+from theano.sandbox.gpuarray.conv import GpuConv

 import pygpu

 gftensor4 = GpuArrayType('float32', [False] * 4)
@@ -159,11 +160,11 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
     t1 = time.time()
     i = gftensor4()
     k = gftensor4()
-    op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
-                                          subsample=subsample,
-                                          version=version,
-                                          verbose=verbose,
-                                          kshp=compile_kshp)(i, k)
+    op = GpuConv(border_mode=mode,
+                 subsample=subsample,
+                 version=version,
+                 verbose=verbose,
+                 kshp=compile_kshp)(i, k)
     f = theano.function([i, k], op, mode=mode_with_gpu)
     gpuval = f(img, kern)
     t2 = time.time()
@@ -731,7 +732,7 @@ class TestConv2DGPU(unittest.TestCase):
         func = theano.function([a, A], image_estimate, mode=mode_with_gpu)
         #theano.printing.debugprint(func,)
-        assert any([isinstance(node.op, theano.sandbox.cuda.blas.GpuConv)
+        assert any([isinstance(node.op, GpuConv)
                     for node in func.maker.fgraph.toposort()])

         a_in = numpy.random.randn(*featshp).astype("float32")
...