提交 a68097aa authored 作者: Frederic Bastien's avatar Frederic Bastien

included cuda_ndarray in theano to have everything in one package.

上级 847c505b
...@@ -16,7 +16,7 @@ def debug(*msg): ...@@ -16,7 +16,7 @@ def debug(*msg):
_logger.debug(_logger_name+'DEBUG: '+' '.join(str(m) for m in msg)) _logger.debug(_logger_name+'DEBUG: '+' '.join(str(m) for m in msg))
# Compile type_support.cu # Compile cuda_ndarray.cu
# This need that nvcc (part of cuda) is installed. If it is not, a warning is # This need that nvcc (part of cuda) is installed. If it is not, a warning is
# printed and this module will not be working properly (we set `enable_cuda` # printed and this module will not be working properly (we set `enable_cuda`
# to False). # to False).
...@@ -45,54 +45,27 @@ def set_cuda_disabled(): ...@@ -45,54 +45,27 @@ def set_cuda_disabled():
warning('Cuda is disabled, cuda-based code will thus not be ' warning('Cuda is disabled, cuda-based code will thus not be '
'working properly') 'working properly')
old_file = os.path.join(os.path.split(__file__)[0],'type_support.so') #cuda_ndarray compile and import
if os.path.exists(old_file): sys.path.append(get_compiledir())
os.remove(old_file)
try: try:
sys.path.append(get_compiledir()) from cuda_ndarray.cuda_ndarray import *
from type_support.type_support import *
except ImportError: except ImportError:
import nvcc_compiler import nvcc_compiler
if not nvcc_compiler.is_nvcc_available(): if not nvcc_compiler.is_nvcc_available():
set_cuda_disabled() set_cuda_disabled()
if enable_cuda: if enable_cuda:
cuda_path = os.path.split(__file__)[0]
code = open(os.path.join(cuda_path, "cuda_ndarray.cu")).read()
cuda_path=os.path.split(old_file)[0] loc = os.path.join(get_compiledir(),'cuda_ndarray')
code = open(os.path.join(cuda_path, "type_support.cu")).read()
loc = os.path.join(get_compiledir(),'type_support')
if not os.path.exists(loc): if not os.path.exists(loc):
os.makedirs(loc) os.makedirs(loc)
CUDA_NDARRAY=os.getenv('CUDA_NDARRAY') nvcc_compiler.nvcc_module_compile_str('cuda_ndarray', code, location = loc, include_dirs=[cuda_path], libs=['cublas'],
include_dirs=[] preargs=['-DDONT_UNROLL', '-O3'])
lib_dirs=[]
if CUDA_NDARRAY:
include_dirs.append(CUDA_NDARRAY)
lib_dirs.append(CUDA_NDARRAY)
else:
import theano.sandbox
path = os.path.split(os.path.split(os.path.split(theano.sandbox.__file__)[0])[0])[0]
path2 = os.path.join(path,'cuda_ndarray')
if os.path.isdir(path2):
include_dirs.append(path2)
lib_dirs.append(path2)
else:
path = os.path.split(path)[0]
path2 = os.path.join(path,'cuda_ndarray')
include_dirs.append(path2)
lib_dirs.append(path2)
nvcc_compiler.nvcc_module_compile_str('type_support', code, location = loc, include_dirs=include_dirs, lib_dirs=lib_dirs, libs=['cuda_ndarray'])
from type_support.type_support import *
from cuda_ndarray.cuda_ndarray import *
if enable_cuda: if enable_cuda:
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
...@@ -118,7 +91,7 @@ def use(device=config.THEANO_GPU): ...@@ -118,7 +91,7 @@ def use(device=config.THEANO_GPU):
device=0 device=0
device=int(device) device=int(device)
try: try:
cuda_ndarray.gpu_init(device) gpu_init(device)
handle_shared_float32(True) handle_shared_float32(True)
use.device_number = device use.device_number = device
except RuntimeError, e: except RuntimeError, e:
......
...@@ -2,7 +2,7 @@ from theano import Op, Type, Apply, Variable, Constant ...@@ -2,7 +2,7 @@ from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar from theano import tensor, scalar
import StringIO import StringIO
import cuda_ndarray import cuda_ndarray.cuda_ndarray as cuda
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
class GpuDot22(Op): class GpuDot22(Op):
...@@ -187,7 +187,7 @@ class GpuConv(Op): ...@@ -187,7 +187,7 @@ class GpuConv(Op):
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()]) return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def perform(self, node, (img, kern), (out,)): def perform(self, node, (img, kern), (out,)):
out[0] = cuda_ndarray.conv(img, kern, out[0] = cuda.conv(img, kern,
mode=self.border_mode, mode=self.border_mode,
out=out[0], out=out[0],
subsample=self.subsample, subsample=self.subsample,
......
#include"conv_kernel.cu"
// Full (zero-padded) 2d convolution of one image with one kernel, output rows
// split over several passes so a thread block smaller than the output can cover it.
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len/nb_split
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{
  // Block-wide scalars: every thread writes the same value computed from the
  // kernel parameters, so the unsynchronized writes are a same-value race.
  int __shared__ out_len, out_wid, nb_thread_id;
  out_len = img_len + kern_len - 1;   // full-mode output height
  out_wid = img_wid + kern_wid - 1;   // full-mode output width
  nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
  extern __shared__ float s_data[];
  int batch_id = blockIdx.x;
  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int out_col = tx;//output col
  int out_row = ty;//output row
  const int thread_id = out_row*out_wid + out_col;
  // Shared memory is split: image first, then kernel.
  float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
  float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
  img+=img_len*img_wid*batch_id;//the good batch
  load_to_shared(d_img, img, thread_id, nb_thread_id, img_len*img_wid);
  load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_len*kern_wid);
  __syncthreads();
  // NOTE(review): if out_len < nb_split then out_len/nb_split == 0 and this
  // loop never advances — presumably callers guarantee nb_split <= out_len;
  // confirm at the launch site. The inner `out_row` shadows the one above.
  for(int out_row=ty;out_row<out_len;out_row+=out_len/nb_split){
    float sum = 0.0f;
    int img_row = out_row;
    // The convolution "flip" of the kernel is realized by the reversed
    // indexing below (inverse_row decreasing, `last` decrementing).
    for (int row=0; row < kern_len; row++) {//loop over row
      int inverse_row = (img_row-row);
      if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image
      const float* idx_in=&d_img[inverse_row*img_wid];
      const float* idx_kern=&d_kern[row*kern_wid];
      int img_col = out_col;
      int col=0,last=0;
      for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col
        if(last<0 ||last>=(img_wid))continue;//col outside the image
        sum+=idx_in[last]*idx_kern[col];
      }
    }
    out[batch_id*out_len*out_wid+//the output image
        out_row*out_wid+out_col] = sum;
  }
}
// Full (zero-padded) 2d convolution: one output pixel per thread, one image
// per blockIdx.x and one kernel per blockIdx.y.
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch( float* img, float* kern, float* out,
                 int img_len, int img_wid,
                 int kern_len, int kern_wid, int nkern, int nstack)
{
  // Block-wide scalars: every thread writes the same value (same-value race).
  int __shared__ out_len, out_wid, nb_thread_id;
  out_len = img_len + kern_len - 1;   // full-mode output height
  out_wid = img_wid + kern_wid - 1;   // full-mode output width
  nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
  extern __shared__ float s_data[];
  int batch_id = blockIdx.x;
  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int out_col = tx;//output col
  int out_row = ty;//output row
  const int thread_id = out_row*out_wid + out_col;
  // Shared memory layout: image first, kernel after it.
  float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
  float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
  kern+=kern_len*kern_wid*nstack*blockIdx.y;//the good nkern
  img+=img_len*img_wid*batch_id;//the good batch
  load_to_shared(d_img, img, thread_id, nb_thread_id, img_len*img_wid);
  // The kernel is stored flipped (flipped=true), so the inner loop below can
  // walk both buffers forward.
  load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_len*kern_wid, true);
  __syncthreads();
  float sum = 0.0f;
  for (int row=0; row < kern_len; row++) {//loop over row
    // Skip kernel rows that fall outside the image (implicit zero padding).
    if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
    const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
    const float* idx_kern=&d_kern[row*kern_wid];
    // Clamp the column range to the part of the kernel row overlapping the image.
    int col=0;
    int max_col=kern_wid;
    int img_col=out_col-kern_wid+1;
    max_col=min(max_col,img_wid-img_col);
    if(img_col<0){col=-img_col;img_col+=col;}
    for (; col < max_col; col++, img_col++) {//loop over col
      sum+=idx_in[col]*idx_kern[col];
    }
  }
  out[batch_id*out_wid*out_len*nkern+//the good batch
      out_wid*out_len*blockIdx.y+//the output image
      out_row*out_wid+out_col] = sum;
}
// As conv_full_patch but supports an input stack (channels) and arbitrary
// strides; the stack dimension is accumulated in the per-thread `sum`.
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
//template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void
conv_full_patch_stack( float* img, float* kern, float* out,
                       int img_len, int img_wid,
                       int kern_len, int kern_wid, int nkern, int nstack,
                       int img_stride_col, int img_stride_row,
                       int kern_stride_col, int kern_stride_row,
                       int kern_stride_stack, int kern_stride_nkern)
{
  // Block-wide scalars: every thread writes the same value (same-value race).
  int __shared__ out_len, out_wid, nb_thread_id;
  out_len = img_len + kern_len - 1;   // full-mode output height
  out_wid = img_wid + kern_wid - 1;   // full-mode output width
  nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
  // Shared base pointers; every thread computes the same values.
  float __shared__ *kern_, *img_;
  extern __shared__ float s_data[];
  const int batch_id = blockIdx.x;
  const int nkern_id = blockIdx.y;
  const int out_col = threadIdx.x;
  const int out_row = threadIdx.y;
  const int thread_id = threadIdx.y*blockDim.x+ threadIdx.x;
  float* d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
  float* d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
  kern_=kern+kern_stride_nkern*nkern_id;//the good nkern
  img_=img+img_len*img_stride_row*(nstack*batch_id);//the good batch
  float sum = 0.0f;
  // Accumulate the full convolution over the stack; one (image, kernel) slice
  // is staged in shared memory per iteration.
  for (int stack = 0;stack<nstack;stack++){
    load_to_shared(d_img, img_+stack*img_len*img_stride_row, thread_id,nb_thread_id,img_wid,img_len,img_stride_col, img_stride_row,false,img_c_contiguous_2d);
    // flipped=true: kernel stored reversed so the inner loop walks forward.
    load_to_shared(d_kern, kern_+stack*kern_stride_stack, thread_id,nb_thread_id,kern_wid,kern_len,kern_stride_col,kern_stride_row,true,kern_c_contiguous_2d);
    __syncthreads();
    for (int row=0; row < kern_len; row++) {//loop over row
      // Skip kernel rows outside the image (implicit zero padding).
      if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
      const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
      const float* idx_kern=&d_kern[row*kern_wid];
      // Clamp columns to the overlap of the kernel row with the image.
      int col=0;
      int max_col=kern_wid;
      int img_col=out_col-kern_wid+1;
      max_col=min(max_col,img_wid-img_col);
      if(img_col<0){col=-img_col;img_col+=col;}
      for (; col < max_col; col++, img_col++) {//loop over col
        sum+=idx_in[col]*idx_kern[col];
      }
    }
    //Needed as not all thread finish at the same time the loop
    //And we don't want to overwrite the shared memory.
    __syncthreads();
  }
  out[batch_id*out_wid*out_len*nkern+//the good batch
      out_wid*out_len*blockIdx.y+//the output image
      out_row*out_wid+out_col] = sum;
}
/**
 * As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
 * I keep it separated from conv_patch as we take 19-20 register which is more then the 10/16 max for each thread and thus this could lower the occupency.
 * Implementation of the valid convolution that keep the full image and the full kernel in shared memory
 * each thread compute only one value for the output if split is true. Otherwise compute ceil((float)out_len/N) pixel.
 * thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
 * grid block size=batch_id, nkern
 * dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
 * dynamic shared memory: low mem:((kern_len+nb_row-1)+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
 *
 * nkern: the number of kernel, used to compute the output image to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template c_contiguous: if true, the image and kernel have are c_contiguous.(use less registers)
 * template split: if true, each thread compute more then 1 output pixel.
 * template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
 * if you set split and low_mem to true, we will use the low_mem version!
 */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
conv_full_patch_stack_padded( float* img, float* kern, float* out,
                  const int img_len, const int img_wid,
                  const int kern_len, const int kern_wid,
                  const int nkern, const int nstack,
                  const int img_stride_col, const int img_stride_row,
                  const int img_stride_stack, const int img_stride_batch,
                  const int kern_stride_col, const int kern_stride_row,
                  const int kern_stride_stack, const int kern_stride_nkern)
{
  // Block-wide scalars: every thread writes the same value (same-value race).
  int __shared__ out_len, out_wid, nb_thread_id;
  out_len = img_len + kern_len - 1;   // full-mode output height
  out_wid = img_wid + kern_wid - 1;   // full-mode output width
  nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
  extern __shared__ float s_data[];
  __shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
  batch_id = blockIdx.x;
  kern_id = blockIdx.y;
  nb_rows = blockDim.y;
  // Thread index
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  int out_col = tx;//output col
  const int thread_id = ty*blockDim.x + tx;
  // Shared memory layout differs from the other kernels: kernel first,
  // then the (padded) image.
  float * d_kern=&s_data[0];//size of [KERNEL_LEN * KERNEL_WID];
  float * d_img=&s_data[kern_len*kern_wid];//size of [see fct doc];
  kern+=kern_stride_nkern*kern_id;//the good nkern
  img+=img_stride_batch*batch_id;//the good batch
  // Width of one padded image row in shared memory (kern_wid-1 zeros each side).
  img_wid_valid=img_wid+2*kern_wid-2;
  if(!split && !low_mem){
    // --- variant 1: whole padded image resident, one output pixel per thread.
    fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
    const int out_row = ty;//output row
    float sum = 0.0f;
    for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
           img+=img_stride_stack){
      __syncthreads();
      load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img,
                                thread_id,nb_thread_id,img_wid,img_len,
                                img_stride_col, img_stride_row, kern_wid-1,
                                c_contiguous);
      load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
                     kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
      __syncthreads();
      for (int row=0; row < kern_len; row++) {//loop over row
        const float* idx_kern=&d_kern[row*kern_wid];
        const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
        convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
      }
    }
    out[batch_id*out_wid*out_len*nkern+//the good batch
        kern_id*out_wid*out_len+//the output image
        out_row*out_wid+out_col] = sum;
  }else if(split && !low_mem){
    // --- variant 2: whole padded image resident, each thread covers several
    // output rows (out_row stepping by nb_rows).
    fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
    //out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
    __shared__ int out_len_max;
    //TODO pass a parameter nb_split
    out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
    for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){
      float sum = 0.0f;
      for (int stack = 0;stack<nstack;stack++){
        __syncthreads();
        //TODO: load only the part of the image needed or put the partial result in shared memory
        load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),
                                  img+img_stride_stack*stack,
                                  thread_id,nb_thread_id,img_wid,img_len,
                                  img_stride_col, img_stride_row, kern_wid-1,
                                  c_contiguous);
        load_to_shared(d_kern, kern+kern_stride_stack*stack,
                       thread_id, nb_thread_id, kern_wid,kern_len,
                       kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
        __syncthreads();
        for (int row=0; row < kern_len; row++) {//loop over row
          const float* idx_kern=&d_kern[row*kern_wid];
          const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
          convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
        }
        // NOTE(review): this store sits INSIDE the stack loop, so partial sums
        // are written to global memory each iteration; the last iteration
        // leaves the complete value. Looks wasteful rather than wrong — confirm.
        if(out_row<out_len)
          out[batch_id*out_wid*out_len*nkern+//the good batch
              out_wid*out_len*kern_id+//the output image
              out_row*out_wid+out_col] = sum;
      }
    }
  }else{//low_mem version
    // --- variant 3: only a (kern_len+nb_rows-1)-row window of the padded
    // image is resident; rows are (re)loaded as out_row advances.
    //don't need to fill the last rows padding as this is done later.
    fill(d_img,img_wid_valid*((kern_len+nb_rows-1)+2*kern_len-2), 0, thread_id, nb_thread_id);
    //out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
    __shared__ int out_len_max;
    //TODO pass a parameter nb_split
    if(thread_id==0)
      out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows;
    __syncthreads();
    for(int out_row = ty, out_row_iter=0;out_row<out_len_max;
        out_row+=nb_rows, out_row_iter++){
      float sum = 0.0f;
      for (int stack = 0;stack<nstack;stack++){
        __syncthreads();
        const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
        const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty row at the start
        //we need to reload some row as when we change of out_row we lost the last load du to the stack.
        const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of row from last out_row iteration to reload
        load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid,
                                  img+img_stride_stack*stack//the good stack image
                                  +(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row.
                                  thread_id,nb_thread_id,img_wid,
                                  len_to_load+previous_row,
                                  img_stride_col, img_stride_row, kern_wid-1,
                                  c_contiguous);
        //TODO: fill the last row padding only when needed.
        //We always fill the last rows padding event when not needed.
        int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load;
        row_to_fill = min(row_to_fill,kern_len-1);
        fill(d_img+(kern_len-1+len_to_load)*img_wid_valid,
             img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id);
        load_to_shared(d_kern, kern+kern_stride_stack*stack,
                       thread_id, nb_thread_id, kern_wid,kern_len,
                       kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
        __syncthreads();
        for (int row=0; row < kern_len; row++) {//loop over row
          const float* idx_kern=&d_kern[row*kern_wid];
          const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
          convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
        }
      }
      if(out_row<out_len)
        out[batch_id*out_wid*out_len*nkern+//the good batch
            out_wid*out_len*kern_id+//the output image
            out_row*out_wid+out_col] = sum;
    }
  }
}
// Compile-time-unrolled strided dot product of `i` elements:
// sum_{k=0}^{i-1} x[k*sx] * y[k*sy]. Recursion bottoms out at the <0> specialization.
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
    return x[0] * y[0] + everything_dot<i-1>(x+sx, sx, y+sy, sy);
}
// Base case of the unrolled dot product: an empty sum is 0.
template <> __device__ float everything_dot<0>(const float * x, const int sx, const float * y, const int sy)
{
    return 0;
}
// Full (zero-padded) convolution keeping the ENTIRE image stack and one
// kernel stack in shared memory at once.
//
// Layout: one image per blockIdx.x; one thread per output pixel
// (threadIdx.x = out col, threadIdx.y = out row); all nkern kernels are
// processed sequentially by the same block.
// Dynamic shared memory: nstack*img_len*img_wid + nstack*kern_len*kern_wid floats.
// template NSTACK: when > 0, the stack loop is unrolled at compile time via
// everything_dot<NSTACK>; when 0, a runtime loop over nstack is used.
template<int NSTACK>
__global__ void
conv_full_load_everything( float* img, float* kern, float* out,
                  int img_len, int img_wid,
                  int kern_len, int kern_wid, int nkern, int nstack,
                  int img_stride_col, int img_stride_row,
                  int img_stride_stack, int img_stride_batch,
                  int kern_stride_col, int kern_stride_row,
                  int kern_stride_stack, int kern_stride_nkern)
{
  // Block-wide scalars: every thread writes the same value (same-value race).
  int __shared__ out_len, out_wid, nb_thread_id;
  out_len = img_len + kern_len - 1;   // full-mode output height
  out_wid = img_wid + kern_wid - 1;   // full-mode output width
  nb_thread_id = blockDim.y*blockDim.x;
  extern __shared__ float s_data[];
  int batch_id = blockIdx.x;
  const int out_col = threadIdx.x;//output col
  const int out_row = threadIdx.y;//output row
  const int thread_id = out_row*out_wid + out_col;
  float * d_img=&s_data[0]; //size [nstack * IMAGE_LEN * IMAGE_WID];
  float * d_kern=&s_data[nstack * img_len * img_wid];//size [nstack * KERNEL_LEN * KERNEL_WID];
  img += blockIdx.x * img_stride_batch;//the good batch
  // load the image to shared memory (whole stack, destrided)
  for (int i = thread_id; i < nstack * img_len * img_wid; i += nb_thread_id)
  {
    int stack = i / (img_wid*img_len);
    int row = (i % (img_wid*img_len)) / img_wid;
    int col = (i % (img_wid*img_len)) % img_wid;
    d_img[i] = img[stack*img_stride_stack +row*img_stride_row +col*img_stride_col];
  }
  for (int kern_idx = 0; kern_idx < nkern; ++kern_idx, kern += kern_stride_nkern)
  {
    // load the kernel into shared memory and flip it
    for (int i = thread_id; i < nstack * kern_len * kern_wid; i += nb_thread_id)
    {
      int stack = i / (kern_wid*kern_len);
      int row = (i % (kern_wid*kern_len)) / kern_wid;
      int col = (i % (kern_wid*kern_len)) % kern_wid;
      d_kern[stack*kern_len*kern_wid + (kern_len-1-row)*kern_wid + (kern_wid-1-col)]
        = kern[stack*kern_stride_stack +row*kern_stride_row +col*kern_stride_col];
    }
    __syncthreads();
    float sum = 0.0f;
    for (int row=0; row < kern_len; ++row)
    {
      int irow = out_row - kern_len+1+row;
      // BUGFIX: was `irow > img_len`, which let irow == img_len through and
      // read one row past this stack's image in shared memory (the next
      // stack's data, or the kernel buffer). Valid rows are 0..img_len-1.
      if (irow < 0 || irow >= img_len) continue;
      for (int col = 0; col < kern_wid; ++col)
      {
        int icol = out_col - kern_wid+1+col;
        // BUGFIX: was `icol > img_wid`; valid columns are 0..img_wid-1.
        if (icol < 0 || icol >= img_wid) continue;
        if (NSTACK > 0)
        {
          // Compile-time unrolled accumulation over the stack.
          sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid,
                                        d_kern + row*kern_wid+col, kern_len*kern_wid);
        }
        else
        {
          for (int stack = 0; stack < nstack; ++stack)
          {
            sum += d_img[stack*img_len*img_wid + irow*img_wid + icol] * d_kern[stack*kern_len*kern_wid+row*kern_wid+col];
          }
        }
      }
    }
    out[batch_id*out_wid*out_len*nkern+//the good batch
        out_wid*out_len*kern_idx+//the output image
        out_row*out_wid+out_col] = sum;
    __syncthreads(); //don't start loading another kernel until we're done here
  }
}
//implement the valid convolution only
/*
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
int new_m = (pos_m+dim_ker[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker[0]; j++) {
int inverse_row = (new_m-j);
const %(type)s* idx_in=&in[inverse_row*dim_im[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_kern=&hvals[j*dim_ker[1]];
int new_n = (pos_n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
sum+=idx_kern[k]*idx_in[last];
}
}//for j
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for n
}//for m
*/
#ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU
#include <stdio.h>
/*
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif
*/
// Mask that rounds a pointer down to a 256-byte boundary (clears the low
// 8 bits) — used to start strided global loads on a coalescing-friendly address.
const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
// Offset of `src`, in floats, from the COALESCED_ALIGN boundary just below it.
#define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float))
// Cooperative 1-D copy of N contiguous floats from global `src` to shared
// `dst`, optionally reversed (flipped). Work is strided across nb_thread
// threads. For >= 64 threads, reads are realigned to a 256-byte boundary
// via MASKED_OFFSET so each warp issues coalesced loads.
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
  if (nb_thread < 64)
    {
      if(flipped)
        //TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
        for(int i=thread_id;i<N;i+=nb_thread)
          dst[i]=src[N - 1 - i];
      //dst[N-1-i]=src[i];
      else
        for(int i=thread_id;i<N;i+=nb_thread)
          dst[i]=src[i];
    }
  else
    {
      nb_thread = nb_thread & 0xFFFFFFE0; //make nb_thread a multiple of 32
      // Global memory:
      //  <-------------------------------------->
      //      A      A      A      A      A  // points of 128-bit alignment
      //        dddddddddddddddddddddd      // layout of src in global memory
      //      |--|                          // masked_src_offset
      //
      // Threads beyond the multiple-of-32 cutoff do nothing; the loop index
      // is shifted by the offset so iteration 0 starts on the alignment
      // boundary, and indices before src (i < 0) are skipped.
      if (thread_id < nb_thread)
        {
          const int masked_src_offset = MASKED_OFFSET(src);
          for(int masked_i=thread_id; masked_i<N + masked_src_offset; masked_i+=nb_thread)
            {
              int i = masked_i - masked_src_offset;
              if (i >= 0)
                if (flipped)
                  dst[N-1-i] = src[i];
                else
                  dst[i]=src[i];
            }
        }
    }
}
/*
 * Cooperative copy of an nb_row x nb_col patch from global memory (src, with
 * the given element strides) into contiguous shared memory (dst), optionally
 * reversed (flipped). The c_contiguous test is resolved at compilation when
 * the caller passes a template constant.
 *
 * Cleanup: the original had two unreachable branches — after testing
 * (flipped && !c_contiguous) and then (c_contiguous), the only remaining case
 * is (!flipped && !c_contiguous), so the later `else if(flipped)` and
 * `else if(c_contiguous)` arms (one with a wrong comment) could never run.
 * They are removed here; behavior is unchanged for every input.
 */
__device__ void load_to_shared(float * dst, const float * src, const int thread_id,
                               int nb_thread, const int nb_col, const int nb_row,
                               const int stride_col, const int stride_row,
                               const bool flipped=false, const bool c_contiguous=true){
  if(c_contiguous){
    // A contiguous 2d patch is just nb_col*nb_row consecutive floats;
    // the 1-D version also handles the flipped case.
    load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
  }else if(flipped){
    //TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
    for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
      dst[nb_row*nb_col-1-i]=src[i/nb_col*stride_row+i%nb_col*stride_col];
  }else{ // !flipped && !c_contiguous: strided gather, no reversal
    for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
      dst[i]=src[i/nb_col*stride_row+i%nb_col*stride_col];
  }
}
// Cooperatively set N consecutive floats at dst to `value`.
// Thread `thread_id` of `nb_thread` writes elements thread_id,
// thread_id+nb_thread, thread_id+2*nb_thread, ...
__device__ void fill(float * dst, int N, float value, int thread_id, int nb_thread){
  int idx = thread_id;
  while(idx < N){
    dst[idx] = value;
    idx += nb_thread;
  }
}
/*
 * Cooperative copy from global to shared memory that centers each source row
 * inside a wider destination row, leaving `wid_pad` untouched columns on each
 * side. Useful to pad an image with 0 (the padding itself must be cleared by
 * the caller, e.g. with fill()). The c_contiguous test is resolved at
 * compilation when the caller passes a template constant.
 */
__device__ void load_padded_col_to_shared(float * dst, const float * src,
                                          const int thread_id, const int nb_thread,
                                          const int nb_col, const int nb_row,
                                          const int stride_col, const int stride_row,
                                          const int wid_pad, const bool c_contiguous=true){
  const int n_elem = nb_col * nb_row;
  const int padded_wid = nb_col + 2*wid_pad;
  for(int idx = thread_id; idx < n_elem; idx += nb_thread){
    const int row = idx / nb_col;
    const int col = idx % nb_col;
    // Contiguous source reads straight through; strided source gathers.
    const float v = c_contiguous ? src[idx]
                                 : src[row*stride_row + col*stride_col];
    dst[row*padded_wid + col + wid_pad] = v;
  }
}
// Compile-time-unrolled dot product of the first `i` elements of data and
// kern (no flipping). Recursion bottoms out at the <0> specialization.
template<int i> __device__ float convolutionRowNoFlip(const float *data,
                                                      const float *kern){
  return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern);
}
// Base case of the unrolled row product: an empty sum is 0.
template<> __device__ float convolutionRowNoFlip<0>(const float *data,
                                                    const float *kern){
  return 0;
}
// Accumulate one kernel row into `sum`. When KERN_WIDTH > 0 the width is a
// compile-time constant and the fully unrolled template version is used;
// when KERN_WIDTH == 0 a runtime loop over kern_wid runs instead.
template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum,
                                     const float *data,
                                     const float *kern, const int kern_wid){
  if(KERN_WIDTH>0)
    sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
  else
    // The pragma binds to the following for-loop (the else branch).
#pragma unroll 8
    for (int col=0; col < kern_wid; col++) {//loop over col
      sum+=data[col]*kern[col];
    }
}
// Write `value` into dst, either overwriting or adding to it. The template
// parameter is a compile-time constant, so the branch costs nothing at runtime.
template<bool accumulate>
__device__ void store_or_accumulate(float& dst,const float value ){
  if(!accumulate)
    dst = value;
  else
    dst += value;
}
/**
 * Implementation of the valid convolution that keep the full image and the full kernel in shared memory
 * Don't implement the stack.
 * each thread compute only one value for the output if split is false
 * thread block size=out_wid, out_len(or less then out_len if split is true)
 * grid block size=batch_id, nkern
 * dynamic shared memory: img_len*img_wid+kern_len*kern_wid
 *
 * nkern: the number of kernel, used to compute the output image to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template split: if true, each thread compute more then 1 output pixel
 *                 When true, allow for output image bigger then 512 pixel.
 *                 Use more registers.
 */
template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void
conv_patch( float* img, float* kern, float* out,
            int img_len, int img_wid, int kern_len, int kern_wid,
            int nkern, int nstack)
{
  // Block-wide scalars: every thread writes the same value (same-value race).
  int __shared__ out_len, out_wid, nb_thread_id;
  out_len = img_len - kern_len + 1;   // valid-mode output height
  out_wid = img_wid - kern_wid + 1;   // valid-mode output width
  nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
  extern __shared__ float s_data[];
  __shared__ int batch_id, kern_id;
  batch_id = blockIdx.x;
  kern_id = blockIdx.y;
  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int out_col = tx;//output col
  const int thread_id = ty*blockDim.x + tx;
  // Shared memory layout: image first, kernel after it.
  float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
  float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
  kern+=kern_len*kern_wid*nstack*kern_id;
  img+=img_len*img_wid*(nstack*batch_id);
  load_to_shared(d_img, img, thread_id,nb_thread_id,img_len*img_wid);
  // If flipped_kern, the kernel is stored reversed so both buffers are
  // walked forward in the inner loop.
  load_to_shared(d_kern, kern, thread_id,nb_thread_id,kern_len*kern_wid,flipped_kern);
  __syncthreads();
  if(!split){
    // One output pixel per thread.
    int out_row = ty;//output row
    float sum = 0.0f;
    for (int row=0; row < kern_len; row++) {//loop over row
      const float* idx_kern=&d_kern[row*kern_wid];
      const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
      convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
    }
    out[batch_id*out_wid*out_len*nkern+//the good batch
        blockIdx.y*out_wid*out_len+//the output image
        out_row*out_wid+out_col] = sum;
  }else{
    // Each thread covers several output rows, stepping by blockDim.y.
    for(int out_row=ty;out_row<out_len;out_row+=blockDim.y){
      float sum = 0.0f;
      for (int row=0; row < kern_len; row++) {//loop over row
        const float* idx_kern=&d_kern[row*kern_wid];
        const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
        convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
      }
      out[batch_id*out_wid*out_len*nkern+//the good batch
          kern_id*out_wid*out_len+//the output image
          out_row*out_wid+out_col] = sum;
    }
  }
}
/**
* As conv_patch, but implement the stack in the kernel.
* I keep it separated from conv_patch as we take more registers and this could lower the occupency.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread compute only one value for the output if split==false else it compute more then 1 values
* thread block size=out_wid, out_len/X (X is any number, optimized value is ceil(out_len/N)
* grid block size=batch_id, nkern
* dynamic shared memory: img_len*img_wid+(preload_full_kern?KERNEL_LEN:1)*kern_wid
*
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
* template img_c_contiguous_2d: if true, the img have are collon and row contiguous
* template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
* template split: if true, each thread generate more then 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
*/
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern>
__global__ void
conv_patch_stack( float* img, float* kern, float* out,
                  int img_len, int img_wid, int kern_len, int kern_wid,
                  int nkern, int nstack, int img_stride_col,int img_stride_row,
                  int img_stride_stack, int img_stride_batch,
                  int kern_stride_col, int kern_stride_row,
                  int kern_stride_stack, int kern_stride_nkern)
{
    // Block-wide scalars; every thread writes the same value before use.
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    // grid: blockIdx.x = batch index, blockIdx.y = kernel index.
    int batch_id = blockIdx.x;
    int kern_id = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int out_col = tx;//output col
    int out_row = ty;//output row
    const int thread_id = out_row*out_wid + out_col;
    // Dynamic shared memory layout: full image, then kernel (full or 1 row).
    float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:1) * KERNEL_WID];
    if(!split){
        // One output pixel per thread.
        kern+=kern_stride_nkern*kern_id;//select the right kernel
        img+=img_stride_batch*batch_id;//select the right batch image
        float sum = 0.0f;
        for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
             img+=img_stride_stack){
            load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len,
                           img_stride_col, img_stride_row, false, img_c_contiguous_2d);
            if(preload_full_kern)
                load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
                               kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
            __syncthreads();
            for (int row=0; row < kern_len; row++) {//loop over kernel rows
                if(!preload_full_kern){
                    // Load only the current kernel row (flipped index if needed).
                    int idx2;
                    if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row;
                    else idx2=(row)*kern_stride_row;
                    load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1,
                                   kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
                    __syncthreads();
                }
                const float* idx_kern;
                if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
                else idx_kern=d_kern;
                const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
                convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
            }
            __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
        }
        store_or_accumulate<accumulate>(
            out[batch_id*out_wid*out_len*nkern+//the right batch
                out_wid*out_len*kern_id+//the output image
                out_row*out_wid+out_col],sum);
    }else{
        // split: each thread produces several output rows, blockDim.y apart.
        float __shared__ *kern_, *img_;
        int __shared__ out_len_max;
        kern_=kern+kern_stride_nkern*kern_id;//select the right kernel
        img_=img+img_stride_batch*batch_id;//select the right batch image
        // out_len_max must be >= out_len, rounded up to a multiple of blockDim.y,
        // so that every thread participates in the shared-memory loads.
        out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
        //TODO: swap the out_row and stack loops so the data is not reloaded as often!
        //TODO: does this happen elsewhere?
        for(int out_row=ty;out_row<out_len_max;out_row+=blockDim.y){
            float sum = 0.0f;
            for (int stack = 0;stack<nstack;stack++){
                //TODO: load only the part of the image needed or put the partial result in shared memory
                int idx1=img_stride_stack*stack;
                load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len,
                               img_stride_col, img_stride_row, false, img_c_contiguous_2d);
                if(preload_full_kern){
                    int idx2=kern_stride_stack*stack;
                    load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len,
                                   kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
                }
                __syncthreads();
                for (int row=0; row < kern_len; row++) {//loop over kernel rows
                    if(!preload_full_kern){
                        // Load only the current kernel row (flipped index if needed).
                        int idx2=kern_stride_stack*stack;
                        if(flipped_kern)
                            idx2+=(kern_len-row-1)*kern_stride_row;
                        else
                            idx2+=(row)*kern_stride_row;
                        load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1,
                                       kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
                        __syncthreads();
                    }
                    const float* idx_kern;
                    if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
                    else idx_kern=d_kern;
                    const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
                    convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
                }
                __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
            }
            // Threads past the real output length only helped with loads.
            if(out_row<out_len)
                store_or_accumulate<accumulate>(
                    out[batch_id*out_wid*out_len*nkern+//the right batch
                        out_wid*out_len*kern_id+//the output image
                        out_row*out_wid+out_col],sum);
        }
    }
}
/**
 * As conv_patch_stack, but with kern_len threads for each output pixel.
 * Kept separate as it uses more registers.
 * Implementation of the valid convolution that keeps the full image and the full kernel in shared memory.
 * thread block size=out_wid, out_len, ceil_intdiv(kern_len/nb_split)
 * grid block size=batch_id, nkern
 * dynamic shared memory: img_len*img_wid+kern_wid*(preload_full_kern?kern_len:thread_z)+out_size*thread_z
 *
 * nkern: the number of kernels, used to compute the output image in which to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template img_contiguous: if true, the image is column- and row-contiguous
 * template preload_full_kern: only meaningful when split is true. If false we don't load the full kernel at once, but load ceil_intdiv(kern_len/nb_split) kernel rows at a time
 */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void
conv_patch_stack_reduce( float* img, float* kern, float* out,
                         int img_len, int img_wid, int kern_len, int kern_wid,
                         int nkern, int nstack, int img_stride_col,int img_stride_row,
                         int img_stride_stack, int img_stride_batch,
                         int kern_stride_col, int kern_stride_row,
                         int kern_stride_stack, int kern_stride_nkern)
{
    // Block-wide scalars; every thread writes the same value before use.
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    // Thread index: (x, y) = output (col, row), z = kernel row handled by this thread.
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int tz = threadIdx.z;
    int out_col = tx;//output col
    int out_row = ty;//output row
    const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
    // Dynamic shared memory layout: image | kernel | per-thread partial sums.
    float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID];
    float * d_reduce=&s_data[img_len*img_wid+(preload_full_kern?kern_len:blockDim.z)*kern_wid];
    float sum = 0.0f;
    kern+=kern_stride_nkern*blockIdx.y;//select the right kernel
    img+=img_stride_batch*batch_id;//select the right batch image
    for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
         img+=img_stride_stack){
        __syncthreads();
        load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
                       img_stride_col, img_stride_row, false, c_contiguous);
        if(!(split && ! preload_full_kern))
            load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
                           kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
        __syncthreads();
        if(split && ! preload_full_kern){
            // Stream blockDim.z kernel rows at a time through shared memory.
            for(int first_row=0, row=tz;first_row<kern_len;row+=blockDim.z, first_row+=blockDim.z){
                int idx3;
                //TODO: test/check for flipped_kern
                if(flipped_kern)
                    idx3=(kern_len-(first_row)-blockDim.z);//the current last row flipped
                else
                    idx3=first_row;
                __syncthreads();
                load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, blockDim.z,
                               kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
                __syncthreads();
                const float* idx_kern=&d_kern[tz*kern_stride_row];
                const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
                float sum2 = 0;
                // Guard: the last batch of rows may exceed kern_len.
                if(row<kern_len)
                    convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
                sum+=sum2;
            }
        }else if(split){
            // Full kernel is resident; each thread walks rows tz, tz+blockDim.z, ...
            for(int row=tz;row<kern_len;row+=blockDim.z){
                const float* idx_kern=&d_kern[row*kern_wid];
                const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
                convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
            }
        }else{
            int row = tz;//The row of the kernel.
            const float* idx_kern=&d_kern[row*kern_wid];
            const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
            convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
        }
        __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
    }
    // Reduce: sum the blockDim.z partial results that share one output pixel.
    d_reduce[thread_id]=sum;
    __syncthreads();
    if(thread_id<out_len*out_wid){
        sum=0;
        for(int i=0;i<blockDim.z;i++){
            sum+=d_reduce[thread_id+i*blockDim.x*blockDim.y];
        }
        out[batch_id*out_wid*out_len*nkern+//the right batch
            out_wid*out_len*blockIdx.y+//the output image
            out_row*out_wid+out_col] = sum;
    }
}
/**
 * WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY
 * We store kern_len rows of the image and the full kernel in shared memory.
 * Each thread computes only one value of the output.
 * Does not implement the stack and nkern loops inside the kernel.
 * thread block size=out_wid
 * grid block size=out_len,batch_id
 * dynamic shared memory: kern_len*img_wid+kern_len*kern_wid
 * Difference from conv_patch: does not store the full image in shared memory,
 * i.e. works for bigger images than conv_patch<split=true,...>.
 */
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows( float* img, float* kern, float* out,
           int img_len, int img_wid, int kern_len, int kern_wid,
           int nkern, int nstack,
           int img_stride_col, int img_stride_row,
           int img_stride_stack, int img_stride_batch,
           int kern_stride_col, int kern_stride_row,
           int kern_stride_stack, int kern_stride_nkern)
{
    // Block-wide scalars; every thread writes the same value before use.
    int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
    float __shared__ *d_img, *d_kern;
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    // blockIdx.y encodes both the batch index and the kernel index.
    batch_id= blockIdx.y/nkern;
    kern_id = blockIdx.y%nkern;
    extern __shared__ float s_data[];
    const int out_col = threadIdx.x;//output col
    const int out_row = blockIdx.x;;//output row
    const int thread_id = threadIdx.x;
    // Dynamic shared memory layout: kern_len image rows, then the full kernel.
    d_img=&s_data[0];//size of [KERN_LEN * IMAGE_WID];
    d_kern=&s_data[kern_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    img+=img_stride_batch*batch_id;//select the right image from the batch
    img+=out_row*img_stride_row;//select the right top row.
    kern+=kern_stride_nkern*kern_id;//select the right kernel
    // Only the kern_len image rows this output row needs are loaded.
    load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,kern_len,
                   img_stride_col, img_stride_row, false, c_contiguous);
    load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
                   kern_stride_col, kern_stride_row, true, c_contiguous);
    __syncthreads();
    float sum = 0.0f;
    for (int row=0; row < kern_len; row++) {//loop over kernel rows
        const float* idx_kern=&d_kern[row*kern_wid];
        const float* idx_in=&d_img[(row)*img_wid+out_col];
        convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
    }
    out[batch_id*out_wid*out_len*nkern+//the right batch
        kern_id*out_wid*out_len+//the output image
        out_row*out_wid+out_col] = sum;
}
/**
 * WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY
 * As conv_rows, but implements the stack. Kept separate as this uses more registers.
 * We store kern_len rows of the image and the full kernel in shared memory.
 * Each thread computes only one value of the output.
 * thread block size=out_wid, block_len
 * grid block size=intceil(out_len/block_len),batch_id
 * dynamic shared memory: (kern_len+block_len-1)*img_wid+kern_len*kern_wid
 * Difference from conv_patch: does not store the full image in shared memory,
 * i.e. works for bigger images than conv_patch<split=true,...>.
 */
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows_stack( float* img, float* kern, float* out,
                 const int img_len, const int img_wid, const int kern_len, const int kern_wid,
                 const int nkern, const int nstack,
                 const int img_stride_col, const int img_stride_row,
                 const int img_stride_stack, const int img_stride_batch,
                 const int kern_stride_col, const int kern_stride_row,
                 const int kern_stride_stack, const int kern_stride_nkern)
{
    // Block-wide scalars; every thread writes the same value before use.
    int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
    float __shared__ *d_img, *d_kern;
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    // blockIdx.y encodes both the batch index and the kernel index.
    batch_id= blockIdx.y/nkern;
    kern_id = blockIdx.y%nkern;
    nb_rows = blockDim.y;
    extern __shared__ float s_data[];
    const int out_col = threadIdx.x;//output col
    const int out_row = blockIdx.x*blockDim.y+threadIdx.y;//output row
    const int shared_row = threadIdx.y;
    const int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
    // Dynamic shared memory: enough image rows for nb_rows outputs, then the kernel.
    d_img=&s_data[0];//size of [(KERN_LEN+block_len-1) * IMAGE_WID];
    d_kern=&s_data[(kern_len+nb_rows-1) * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    float sum = 0.0f;
    for (int stack = 0;stack<nstack;stack++){
        int _idx=img_stride_batch*batch_id+img_stride_stack*stack;//select the right image from the batch
        _idx+=(blockIdx.x*nb_rows)*img_stride_row;//select the right top row for the block of threads
        load_to_shared(d_img,img+_idx,thread_id,nb_thread_id,img_wid,kern_len+nb_rows-1,
                       img_stride_col, img_stride_row, false, c_contiguous);
        _idx=kern_stride_nkern*kern_id+kern_stride_stack*stack;
        load_to_shared(d_kern, kern+_idx, thread_id, nb_thread_id, kern_wid,kern_len,
                       kern_stride_col, kern_stride_row, true, c_contiguous);
        __syncthreads();
        // Threads whose out_row falls past out_len only help with the loads.
        for (int row=0; row < kern_len && out_row<out_len; row++) {//loop over kernel rows
            const float* idx_kern=&d_kern[row*kern_wid];
            const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
            convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
        }
        __syncthreads();//to be sure all threads have finished before we modify the shared memory.
    }
    if(out_row<out_len)
        out[batch_id*out_wid*out_len*nkern+//the right batch
            kern_id*out_wid*out_len+//the output image
            out_row*out_wid+out_col] = sum;
}
/**
 * WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY
 * As conv_rows_stack, but loads only block_len rows of the image at a time and 1 or all kernel rows.
 * We store block_len rows of the image (at a time) and one or all kernel rows in shared memory.
 * Each thread computes only one value of the output.
 * thread block size=out_wid, block_len
 * grid block size=intceil(out_len/block_len),batch_id
 * dynamic shared memory: block_len * img_wid+(preload_full_kern?kern_len:1)*kern_wid
 * Difference from conv_patch: does not store the full image and kernel in shared memory,
 * i.e. works for bigger images than conv_patch<split=true,...>.
 */
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void
conv_rows_stack2( float* img, float* kern, float* out,
                  const int img_len, const int img_wid, const int kern_len, const int kern_wid,
                  const int nkern, const int nstack,
                  const int img_stride_col, const int img_stride_row,
                  const int img_stride_stack, const int img_stride_batch,
                  const int kern_stride_col, const int kern_stride_row,
                  const int kern_stride_stack, const int kern_stride_nkern)
{
    // Block-wide scalars; every thread writes the same value before use.
    int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
    float __shared__ *d_img, *d_kern;
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    // blockIdx.y encodes both the batch index and the kernel index.
    batch_id= blockIdx.y/nkern;
    kern_id = blockIdx.y%nkern;
    nb_rows = blockDim.y;
    extern __shared__ float s_data[];
    const int out_col = threadIdx.x;//output col
    const int out_row = blockIdx.x*blockDim.y+threadIdx.y;//output row
    const int shared_row = threadIdx.y;
    const int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
    // d_img is used as a circular buffer of nb_rows image rows.
    d_img=&s_data[0];//size of [nb_rows * IMAGE_WID];
    d_kern=&s_data[nb_rows*img_wid];//size of [(preload_full_kern?KERNEL_LEN:1) * KERNEL_WID];
    float sum = 0.0f;
    for (int stack = 0;stack<nstack;stack++){
        int _idx2=img_stride_batch*batch_id+img_stride_stack*stack;//select the right image from the batch and stack
        _idx2+=(blockIdx.x*nb_rows)*img_stride_row;//select the right top row for the block of threads
        __syncthreads();
        // Prime the circular buffer with the first nb_rows-1 image rows;
        // the row loop below streams in one additional row per iteration.
        load_to_shared(d_img,img+_idx2,thread_id,nb_thread_id,img_wid,nb_rows-1,
                       img_stride_col, img_stride_row, false, c_contiguous);
        if(preload_full_kern)
            load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack,
                           thread_id, nb_thread_id, kern_wid,kern_len,
                           kern_stride_col, kern_stride_row, true, c_contiguous);
        __syncthreads();
        for (int row=0; row < kern_len; row++) {//loop over kernel rows
            int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//select the right image from the batch and stack
            _idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the right top row for the block of threads
            _idx1+=(row+nb_rows-1)*img_stride_row;//the current last row
            __syncthreads();
            load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid,
                           img+_idx1, thread_id, nb_thread_id, img_wid, 1,
                           img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer.
            if(!preload_full_kern){
                int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//select the right kernel from the batch and stack
                _idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped
                load_to_shared(d_kern, kern+_idx3,
                               thread_id, nb_thread_id, kern_wid,1,
                               kern_stride_col, kern_stride_row, true, c_contiguous);
            }
            __syncthreads();
            if(out_row<out_len){
                const float* idx_kern;
                if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
                else idx_kern=d_kern;
                // Modular index because d_img is a circular buffer.
                const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
                convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
            }
        }
    }
    __syncthreads();
    if(out_row<out_len)
        out[batch_id*out_wid*out_len*nkern+//the right batch
            kern_id*out_wid*out_len+//the output image
            out_row*out_wid+out_col] = sum;
}
/**
 * Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
 * kernel-image inner-product in parallel.
 *
 * This implementation uses shared memory for the reduce, so it is limited by the product of stacklen x kern_len.
 *
 * template stack_loop: if true, we accept that blockDim.x < nstack and we add a loop for this (uses 3 more registers, so lower occupancy when true, but accepts nstack*kern_len>512)
 * TODO: explain parameters, preconditions
 */
template<bool stack_loop>
__global__ void
conv_valid_row_reduce(int nB, int nK, int stacklen,
        int img_len, int img_wid,
        int kern_len, int kern_wid,
        int out_len, int out_wid, //physical
        float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
        float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
        float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
        int subsample_rows, int subsample_cols,
        const int initial_reduce_boundary)
{
    const int outsize = nB * nK * out_len * out_wid;
    extern __shared__ float reducebuf[];
    // One block per output element; blocks stride over the outputs.
    for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
    {
        //figure out what output element we're in charge of computing
        // Decode flat index i as (batch, kernel, row, col), batch fastest.
        int ii = i;
        int iB = ii % nB; // output batch index
        ii = ii / nB;
        int iK = ii % nK; // output kernel index
        ii = ii / nK;
        int iR_physical = ii % out_len; //output kernel row
        int iC_physical = ii / out_len; // output kernel column
        int iR_logical = iR_physical * subsample_rows;
        int iC_logical = iC_physical * subsample_cols;
        // threadIdx.x indexes the stack, threadIdx.y the kernel row.
        int ss = threadIdx.x;
        int rr = threadIdx.y;
        int img_rr = iR_logical + kern_len - 1 - rr;
        int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y;
        float sum = 0.0f;
        if(stack_loop){
            // blockDim.x may be < stacklen: each thread handles several stack slices.
            for (; ss < stacklen; ss+=blockDim.x){
                float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
                float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
                for (int cc = 0; cc < kern_wid; ++cc)
                {
                    sum += kk_0[0] * ii_0[0];
                    kk_0 += kern_str_C;
                    ii_0 -= img_str_C;
                }
            }
        }else{
            float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
            float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
            for (int cc = 0; cc < kern_wid; ++cc)
            {
                sum += kk_0[0] * ii_0[0];
                kk_0 += kern_str_C;
                ii_0 -= img_str_C;
            }
        }
        if (blockDim.x * blockDim.y == 1)
        {
            // Single-thread block: no reduce needed.
            out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
        }
        else
        {
            // Tree reduction in shared memory over blockDim.x*blockDim.y partial sums.
            reducebuf[reduceIdx] = sum;
            __syncthreads();
            int reduce_boundary = initial_reduce_boundary;
            // add in the terms above the reduce boundary
            if (reduceIdx + reduce_boundary < (blockDim.x * blockDim.y))
                reducebuf[reduceIdx] += reducebuf[reduce_boundary +reduceIdx];
            reduce_boundary >>= 1;
            // there are an equal number of terms above and below the reduce_boundary
            while (reduce_boundary)
            {
                __syncthreads();
                if (reduceIdx < reduce_boundary)
                {
                    reducebuf[reduceIdx] += reducebuf[reduce_boundary + reduceIdx];
                }
                reduce_boundary >>= 1;
            }
            if (reduceIdx == 0)
            {
                out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = reducebuf[0];
            }
        }
    }
}
/**
* Reference implementation of 'valid' mode convolution (with stack)
*
* This implementation works for any size of image and kernel. It does not use shared memory.
*
* TODO: explain parameters, preconditions
*/
/**
 * Reference 'valid' convolution: every thread walks a grid-wide stride over
 * the flattened output; the inner product over (stack, kernel row, kernel
 * col) is computed entirely from global memory.
 */
__global__ void
conv_reference_valid(int nB, int nK, int stacklen,
        int img_len, int img_wid,
        int kern_len, int kern_wid,
        int out_len, int out_wid, //physical
        float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
        float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
        float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
        int subsample_rows, int subsample_cols)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ int stride, n_out;
    stride = blockDim.x * gridDim.x;
    n_out = nB * nK * out_len * out_wid;
    for (int flat = tid; flat < n_out; flat += stride) //physical
    {
        // Decode the flat index into (batch, kernel, row, col); batch varies fastest.
        int rem = flat;
        const int b = rem % nB;         // output batch index
        rem /= nB;
        const int k = rem % nK;         // output kernel index
        rem /= nK;
        const int r_phys = rem % out_len;   // output row (physical)
        const int c_phys = rem / out_len;   // output column (physical)
        const int r_log = r_phys * subsample_rows;
        const int c_log = c_phys * subsample_cols;

        float acc = 0.0f;
        for (int s = 0; s < stacklen; ++s)
        {
            for (int kr = 0; kr < kern_len; ++kr)
            {
                const int ir = r_log + kern_len - 1 - kr;   // flipped kernel row
                for (int kc = 0; kc < kern_wid; ++kc)
                {
                    const int ic = c_log + kern_wid - 1 - kc;   // flipped kernel col
                    acc += kern[k*kern_str_K + s*kern_str_S + kr*kern_str_R + kc*kern_str_C]
                         * img[b*img_str_B + s*img_str_S + ir*img_str_R + ic*img_str_C];
                }
            }
        }
        out[b * out_str_B + k * out_str_K + r_phys * out_str_R + c_phys * out_str_C] = acc;
    }
}
/**
* Reference implementation of 'full' mode convolution (with stack)
*
* This implementation works for any size of image and kernel. It does not use shared memory.
*
* TODO: explain parameters, preconditions
*/
/**
 * Reference 'full' convolution: like conv_reference_valid, but kernel taps
 * that fall outside the image are skipped (implicit zero padding).
 */
__global__ void
conv_reference_full(int nB, int nK, int stacklen,
        int img_len, int img_wid,
        int kern_len, int kern_wid,
        int out_len, int out_wid, //physical dimensions
        float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
        float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
        float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
        int subsample_rows, int subsample_cols)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ int stride, n_out;
    stride = blockDim.x * gridDim.x;
    n_out = nB * nK * out_len * out_wid;
    for (int flat = tid; flat < n_out; flat += stride)
    {
        // Decode the flat index into (batch, kernel, row, col); batch varies fastest.
        int rem = flat;
        const int b = rem % nB;         // output batch index
        rem /= nB;
        const int k = rem % nK;         // output kernel index
        rem /= nK;
        const int r_phys = rem % out_len;   // output row (physical)
        const int c_phys = rem / out_len;   // output column (physical)
        const int r_log = r_phys * subsample_rows;
        const int c_log = c_phys * subsample_cols;

        float acc = 0.0f;
        for (int s = 0; s < stacklen; ++s)
        {
            for (int kr = 0; kr < kern_len; ++kr)
            {
                const int ir = r_log - kr;
                if (ir < 0 || ir >= img_len)
                    continue;   // this tap is in the zero-padded border
                for (int kc = 0; kc < kern_wid; ++kc)
                {
                    const int ic = c_log - kc;
                    if (ic < 0 || ic >= img_wid)
                        continue;   // this tap is in the zero-padded border
                    acc += kern[k*kern_str_K + s*kern_str_S + kr*kern_str_R + kc*kern_str_C]
                         * img[b*img_str_B + s*img_str_S + ir*img_str_R + ic*img_str_C];
                }
            }
        }
        out[b * out_str_B + k * out_str_K + r_phys * out_str_R + c_phys * out_str_C] = acc;
    }
}
#endif // #ifndef CONV_KERNEL_CU
This source diff could not be displayed because it is too large. You can view the blob instead.
#ifndef _CUDA_NDARRAY_H
#define _CUDA_NDARRAY_H
#include <numpy/arrayobject.h>
#include <stdio.h>
#include <cublas.h>
typedef float real;
#define REAL_TYPENUM 11
#ifdef __DEVICE_EMULATION__
#define NUM_VECTOR_OP_BLOCKS 4096
#define NUM_VECTOR_OP_THREADS_PER_BLOCK 1 //This prevents printf from getting tangled up
#else
#define NUM_VECTOR_OP_BLOCKS 4096 //Max number of blocks to launch. Should be read from device properties. (#10)
#define NUM_VECTOR_OP_THREADS_PER_BLOCK 256 //Should be read from device properties. (#10)
#endif
#if 0
// Do not wait after every kernel & transfer.
#define CNDA_THREAD_SYNC
#else
// This is useful for using normal profiling tools
#define CNDA_THREAD_SYNC cudaThreadSynchronize();
#endif
/**
 * struct CudaNdarray
 *
 * This is a Python type: a dense float32 n-dimensional array whose data
 * lives in CUDA device memory.
 */
struct CudaNdarray
{
    PyObject_HEAD

    /**
     * base:
     * either NULL or a pointer to a fellow CudaNdarray into which this one is viewing.
     * This pointer is never followed, except during Py_DECREF when we do not need it any longer.
     */
    PyObject * base;

    /* Type-specific fields go here. */
    //GpuTensorType::VoidTensor * vt;
    int nd; //the number of dimensions of the tensor (-1 means uninitialized)
    int * host_structure; //dim0, dim1, ... stride0, stride1, ..., log2(dim0), ...
    int data_allocated; //the number of elements allocated for devdata (see CudaNdarray_alloc_contiguous: cublasAlloc(size, sizeof(real), ...))
    //device pointers (allocated by cublasAlloc)
    int dev_structure_fresh; // 0 when host_structure changed since the last upload to dev_structure
    int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
    real* devdata; //pointer to data element [0,..,0].
};
/*
 * Return a new CudaNdarray whose 'nd' dimensions are all 0.
 */
PyObject *
CudaNdarray_New(int nd);

/**
 * Return 1 if ob is a CudaNdarray (or subclass), otherwise 0.
 */
int
CudaNdarray_Check(const PyObject * ob);

/**
 * Return 1 if ob is exactly a CudaNdarray (no subclass), otherwise 0.
 */
int
CudaNdarray_CheckExact(const PyObject * ob);
/****
 * Returns the number of ints necessary in host_structure and dev_structure
 * for a given number of dimensions.
 *
 * The structure holds three parallel arrays of length nd:
 * dimensions, strides, and log2(dimension) shadow values.
 */
int
cnda_structure_size(int nd)
{
    return 3 * nd;
}
// Host-side view of the dimensions: the first nd entries of host_structure.
const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self)
{
    return self->host_structure;
}

// Host-side view of the strides: entries nd..2*nd-1 of host_structure.
const int *
CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
{
    return self->host_structure + self->nd;
}

// Host-side view of the log2(dim) shadow array: entries 2*nd..3*nd-1
// (-1 where the dimension is not a power of two; see CudaNdarray_set_dim).
const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
{
    return self->host_structure + 2*self->nd;
}
// Flag the device copy of the structure as stale; it is re-uploaded lazily
// by the CudaNdarray_DEV_* accessors.
void
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
    self->dev_structure_fresh = 0;
}
/****
 * Set the idx'th dimension to value d.
 *
 * Updates the log2dim shadow array (used for fast power-of-two indexing:
 * the shadow entry is log2(d) when d is a power of two, -1 otherwise).
 *
 * Does not sync structure to the device; only marks it dirty.
 */
void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{
    if ((idx >= self->nd) || (idx < 0) || (d < 0))
    {
        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
    }

    if (d != self->host_structure[idx])
    {
        self->host_structure[idx] = d;
        // Bug fix: log2(0) is -inf and log2 of a negative value is NaN;
        // converting either to int is undefined behaviour.  Only compute the
        // shadow entry for positive d; otherwise store -1 (not a power of 2).
        int log2d = (d > 0) ? (int)log2((double)d) : -1;
        self->host_structure[idx + 2*self->nd] = (d > 0 && d == (1 << log2d)) ? log2d : -1;
        cnda_mark_dev_structure_dirty(self);
    }
}
/****
 * Set the idx'th stride to value s.
 *
 * Does not sync structure to the device; only marks it dirty.
 */
void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
    if ((idx >= self->nd) || (idx < 0))
    {
        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
    }

    if (s != CudaNdarray_HOST_STRIDES(self)[idx])
    {
        self->host_structure[idx+self->nd] = s;
        cnda_mark_dev_structure_dirty(self);
    }
}
/***
 * Update dependent variables from the contents of CudaNdarray_HOST_DIMS(self) and CudaNdarray_HOST_STRIDES(self)
 *
 * This means: recalculate the log2dims and transfer structure to the card.
 *
 * Returns 0 on success, -1 (with a Python RuntimeError set) on failure.
 */
int
cnda_copy_structure_to_device(CudaNdarray * self)
{
    // Upload all 3*nd ints of host_structure into dev_structure.
    cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
    CNDA_THREAD_SYNC;
    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
    {
        PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
        return -1;
    }
    self->dev_structure_fresh = 1;
    return 0;
}
// Device pointer to the dimensions; lazily re-uploads the structure when it
// is dirty.  Returns NULL on upload failure (Python error set).
const int *
CudaNdarray_DEV_DIMS(CudaNdarray * self)
{
    if (!self->dev_structure_fresh)
    {
        if (cnda_copy_structure_to_device(self))
            return NULL;
    }
    return self->dev_structure;
}

// Device pointer to the strides; lazily re-uploads the structure when it
// is dirty.  Returns NULL on upload failure (Python error set).
const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self)
{
    if (!self->dev_structure_fresh)
    {
        if (cnda_copy_structure_to_device(self))
            return NULL;
    }
    return self->dev_structure + self->nd;
}

// Device pointer to the log2(dim) shadow array; lazily re-uploads the
// structure when it is dirty.  Returns NULL on upload failure.
const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
{
    if (!self->dev_structure_fresh)
    {
        if (cnda_copy_structure_to_device(self))
            return NULL;
    }
    return self->dev_structure + 2*self->nd;
}
// Raw device data pointer (element [0,...,0]).
float *
CudaNdarray_DEV_DATA(const CudaNdarray * self)
{
    return self->devdata;
}
/**
 * Return the number of elements in the ndarray (product of the dimensions).
 * An uninitialized ndarray (nd == -1) holds no elements.
 */
int
CudaNdarray_SIZE(const CudaNdarray *self)
{
    if (self->nd == -1) return 0;
    const int * dims = CudaNdarray_HOST_DIMS(self);
    int total = 1;
    for (int d = self->nd - 1; d >= 0; --d)
    {
        total *= dims[d];
    }
    return total;
}
/**
 * Allocate a new CudaNdarray with nd==-1 (uninitialized, no storage).
 */
PyObject * CudaNdarray_new_null();

/**
 * Allocate a new CudaNdarray with room for the given number of dimensions.
 *
 * No storage space is allocated (and all dimensions are 0).
 */
PyObject * CudaNdarray_new_nd(const int nd);

/**
 * [Re]allocate a CudaNdarray with access to 'nd' dimensions.
 *
 * Note: This does not allocate storage for data.
 */
// [Re]allocate the host and device structure arrays for 'nd' dimensions,
// freeing any previous ones.  nd == -1 leaves the array uninitialized.
// Returns 0 on success, -1 (with a Python MemoryError set) on failure.
// Note: this does not allocate storage for data.
int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
    if (nd != self->nd)
    {
        if (self->dev_structure)
        {
            cublasFree(self->dev_structure);
            if (CUBLAS_STATUS_SUCCESS != cublasGetError())
            {
                PyErr_SetString(PyExc_MemoryError, "error freeing device memory");
                return -1;
            }
            self->dev_structure = NULL;
        }
        if (self->host_structure)
        {
            free(self->host_structure);
            self->host_structure = NULL;
            self->nd = -1;
        }
        if (nd == -1) return 0;

        self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
        // Bug fix: check the malloc result *before* writing through the
        // pointer (the zero-initialization loop used to run first, which
        // dereferenced a possibly-NULL pointer).
        if (NULL == self->host_structure)
        {
            PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
            return -1;
        }
        //initialize all dimensions and strides to 0
        for (int i = 0; i < cnda_structure_size(nd); ++i) self->host_structure[i] = 0;

        cublasAlloc(cnda_structure_size(nd), sizeof(int), (void**)&self->dev_structure);
        if (CUBLAS_STATUS_SUCCESS != cublasGetError())
        {
            PyErr_SetString(PyExc_MemoryError, "error allocating device memory");
            free(self->host_structure);
            self->host_structure = NULL;
            self->dev_structure = NULL;
            return -1;
        }
        self->nd = nd;
        self->dev_structure_fresh = 0;
    }
    return 0;
}
/**
 * CudaNdarray_alloc_contiguous
 *
 * Allocate storage space for a tensor of rank 'nd' and given dimensions.
 *
 * Returns 0 on success, -1 (with a Python error set) on failure.
 *
 * Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
 */
template<typename inttype>
int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype * dim)
{
    // allocate an empty ndarray with c_contiguous access
    // return 0 on success
    int size = 1; //set up the strides for contiguous tensor
    assert (nd >= 0);
    if (CudaNdarray_set_nd(self, nd))
    {
        return -1;
    }
    //TODO: check if by any chance our current dims are correct,
    // and strides already contiguous
    // in that case we can return right here.
    for (int i = nd-1; i >= 0; --i)
    {
        // Dimensions of size 1 get stride 0 (presumably to allow
        // broadcasting -- TODO confirm against the kernels); other dims get
        // the running product of the trailing dimensions.
        CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
        CudaNdarray_set_dim(self, i, dim[i]);
        size = size * dim[i];
    }
    // size is an element count; cublasAlloc takes (n_elements, elem_size).
    if (self->data_allocated != size)
    {
        //std::cerr << "resizing from " << self->data_allocated << " to " << size << '\n';
        cublasFree(self->devdata);
        if (CUBLAS_STATUS_SUCCESS != cublasGetError())
        {// Does this ever happen?? Do we need to set data_allocated or devdata to 0?
            PyErr_SetString(PyExc_MemoryError, "error freeing device memory");
            return -1;
        }
        assert(size>0);
        cublasAlloc(size, sizeof(real), (void**)&(self->devdata));
        //std::cerr << "cublasAlloc returned " << self->devdata << "\n";
        //We must do both checks as the first one is not enough in some cases!
        if (CUBLAS_STATUS_SUCCESS != cublasGetError() || !self->devdata)
        {
            PyErr_Format(PyExc_MemoryError, "error allocating %i bytes device memory",size);
            CudaNdarray_set_nd(self,-1);
            self->data_allocated = 0;
            self->devdata = 0;
            return -1;
        }
        self->data_allocated = size;
    }
    return 0;
}
/*
 * Return a new CudaNdarray with its 'nd' dimensions set to dims and its
 * storage allocated.  Returns NULL (with a Python error set) on failure.
 */
template<typename inttype>
PyObject *
CudaNdarray_NewDims(int nd, const inttype * dims)
{
    CudaNdarray * self = (CudaNdarray*)CudaNdarray_new_null();
    if (NULL == self)
    {
        return NULL;
    }
    if (CudaNdarray_alloc_contiguous(self, nd, dims))
    {
        Py_DECREF(self);
        return NULL;
    }
    return (PyObject*)self;
}
/**
 * CudaNdarray_set_device_data
 *
 * Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
 */
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base);

/**
 * Return an independent copy of self (Python __deepcopy__ protocol).
 */
PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);

/**
 * Return an independent copy of self.
 */
PyObject * CudaNdarray_Copy(CudaNdarray * self);

/**
 * Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
 */
PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask);

/**
 * Transfer the contents of numpy array `obj` to `self`.
 *
 * self is reallocated to have the correct dimensions if necessary.
 */
int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);

/**
 * Transfer the contents of CudaNdarray `other` to `self`.
 *
 * self is reallocated to have the correct dimensions if necessary.
 */
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other);

/**
 * Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
 */
PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self);

/**
 * True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
 */
bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);

// C = alpha * dot(A, B) + beta * C
int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);

// Reductions of A into self.
int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A);
int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);

// Reorder self's dimensions according to `pattern` (len entries).
int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern);

// Convolution border modes.
enum { ConvMode_FULL, ConvMode_VALID };

PyObject * CudaNdarray_Conv(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);

// Convenience overload: no subsampling, automatic kernel version, quiet.
PyObject * CudaNdarray_Conv(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, const int mode)
{
    return CudaNdarray_Conv(img, kern, out, mode, 1, 1, -1, 0);
}

int CudaNdarray_conv(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, const int mode);
/**
 * fprint_CudaNdarray
 *
 * Write a human-readable debug dump of `self` to the stdio stream `fd`:
 * the object and device-data pointers, nd, and the host-side dims and
 * strides arrays.  Debugging aid; output format is not a stable API.
 */
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
    fprintf(fd, "CudaNdarray <%p, %p> nd=%i \n", self, self->devdata, self->nd);
    fprintf(fd, "\tHOST_DIMS: ");
    for (int i = 0; i < self->nd; ++i)
    {
        fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[i]);
    }
    fprintf(fd, "\n\tHOST_STRIDES: ");
    for (int i = 0; i < self->nd; ++i)
    {
        fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
    }
    fprintf(fd, "\n");
}
#endif
...@@ -2,7 +2,6 @@ from theano import Op, Type, Apply, Variable, Constant ...@@ -2,7 +2,6 @@ from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar from theano import tensor, scalar
import StringIO import StringIO
import cuda_ndarray
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax from theano.sandbox.cuda.kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax
......
...@@ -72,6 +72,8 @@ def nvcc_module_compile_str(module_name, src_code, location=None, include_dirs=[ ...@@ -72,6 +72,8 @@ def nvcc_module_compile_str(module_name, src_code, location=None, include_dirs=[
cmd.extend('-I%s'%idir for idir in include_dirs) cmd.extend('-I%s'%idir for idir in include_dirs)
cmd.extend(['-o',lib_filename]) cmd.extend(['-o',lib_filename])
cmd.append(cppfilename) cmd.append(cppfilename)
if module_name != 'cuda_ndarray':
cmd.append(os.path.join(os.path.split(cppfilename)[0],'..','cuda_ndarray','cuda_ndarray.so'))
cmd.extend(['-L%s'%ldir for ldir in lib_dirs]) cmd.extend(['-L%s'%ldir for ldir in lib_dirs])
cmd.extend(['-l%s'%l for l in libs]) cmd.extend(['-l%s'%l for l in libs])
debug('Running cmd', ' '.join(cmd)) debug('Running cmd', ' '.join(cmd))
......
...@@ -11,12 +11,12 @@ import theano.tensor as T ...@@ -11,12 +11,12 @@ import theano.tensor as T
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
try: try:
import cuda_ndarray import cuda_ndarray.cuda_ndarray as cuda_ndarray
except ImportError: except ImportError:
raise SkipTest('Optional package cuda_ndarray not available') raise SkipTest('Optional package cuda_ndarray not available')
import theano.sandbox.cuda as tcn import theano.sandbox.cuda as tcn
import cuda_ndarray as cuda import theano.sandbox.cuda as cuda
import theano.compile.mode import theano.compile.mode
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
......
import sys, time
import numpy
import theano.sandbox.cuda as cuda_ndarray
def py_conv_valid_numpy(img, kern):
    """Reference 'valid' convolution computed with plain numpy loops.

    img: 4d array (batch, stack, rows, cols).
    kern: 4d array (nkern, stack, rows, cols); must share the stack size.
    Returns a float32 array of shape (batch, nkern, out_rows, out_cols).

    Fix: use `range` instead of the Python-2-only `xrange` (identical
    iteration behavior, but also runs under Python 3).
    """
    assert img.shape[1] == kern.shape[1]
    krows = kern.shape[2]
    kcols = kern.shape[3]
    outshp = (img.shape[0], kern.shape[0],
              img.shape[2] - krows + 1,
              img.shape[3] - kcols + 1)
    out = numpy.zeros(outshp, dtype='float32')
    for b in range(out.shape[0]):
        for k in range(out.shape[1]):
            for rr in range(out.shape[2]):
                for cc in range(out.shape[3]):
                    # (rr, cc) is the upper-left corner of the image patch.
                    # True convolution: flip the patch spatially before the
                    # elementwise product with the kernel.
                    imgpatch = img[b, :, rr:rr + krows, cc:cc + kcols]
                    out[b, k, rr, cc] = (imgpatch[:, ::-1, ::-1]
                                         * kern[k, :, :, :]).sum()
    return out
def py_conv_full_numpy(img, kern):
    """Reference 'full' convolution: zero-pad the image by (kernel-1) on
    every side, then run the 'valid' reference convolution on the result.

    Bug fix: this previously called the nonexistent `py_conv_valid`
    (a NameError at call time); the helper is named `py_conv_valid_numpy`.
    """
    pad_rows = 2 * (kern.shape[2] - 1) + img.shape[2]
    pad_cols = 2 * (kern.shape[3] - 1) + img.shape[3]
    padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols),
                             dtype=img.dtype)
    padded_img[:, :,
               kern.shape[2] - 1:kern.shape[2] - 1 + img.shape[2],
               kern.shape[3] - 1:kern.shape[3] - 1 + img.shape[3]] = img
    return py_conv_valid_numpy(padded_img, kern)
def py_conv_scipy(img, kern, mode, subsample):
    """Reference convolution via scipy.signal.convolve2d.

    img: (batch, stack, rows, cols); kern: (nkern, stack, rows, cols),
    sharing the stack size.  mode: 'valid' or 'full'.  The result is then
    subsampled by keeping every subsample[0]-th row / subsample[1]-th col.
    Returns a float32 array (batch, nkern, out_rows, out_cols).

    Fix: use `range` instead of the Python-2-only `xrange` (identical
    iteration behavior, but also runs under Python 3).
    """
    from scipy.signal import convolve2d
    assert img.shape[1] == kern.shape[1]
    if mode == 'valid':
        outshp = (img.shape[0], kern.shape[0],
                  img.shape[2] - kern.shape[2] + 1,
                  img.shape[3] - kern.shape[3] + 1)
    else:
        outshp = (img.shape[0], kern.shape[0],
                  img.shape[2] + kern.shape[2] - 1,
                  img.shape[3] + kern.shape[3] - 1)
    out = numpy.zeros(outshp, dtype='float32')
    # Sum the per-stack 2d convolutions for every (batch, kernel) pair.
    for b in range(out.shape[0]):
        for k in range(out.shape[1]):
            for s in range(img.shape[1]):
                out[b, k, :, :] += convolve2d(img[b, s, :, :],
                                              kern[k, s, :, :],
                                              mode)
    return out[:, :, ::subsample[0], ::subsample[1]]
def _params_allgood_header():
    # Column header for the per-shape timing/correctness report printed by
    # _params_allgood (Python 2 print statement).
    print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), kern_stride=(1,1), version=-1, verbose=0, random=True, print_=None, id=None, rtol=1e-5, atol = 1e-8, nb_iter=0, ones=False):
    """Run one GPU-vs-scipy convolution comparison and report timing.

    Builds img/kern data (ones, random, or arange depending on the flags),
    transfers them to the GPU, applies the same striding to both the host
    and device copies, then checks cuda_ndarray.conv against py_conv_scipy
    with numpy.allclose.  When nb_iter > 0 the GPU conv is rerun that many
    times to check determinism.

    Returns True when shapes and values match, False otherwise; prints
    diagnostics (Python 2 print statements) on mismatch.
    """
    if ones:
        assert not random
        npy_img = numpy.asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -numpy.asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = numpy.asarray(numpy.random.rand(*ishape), dtype='float32')
        npy_kern = numpy.asarray(numpy.random.rand(*kshape), dtype='float32')
    else:
        # Deterministic data: 1..N for the image, -1..-N for the kernel.
        npy_img = numpy.asarray(numpy.arange(numpy.prod(ishape)).reshape(ishape), dtype='float32')+1
        npy_kern = -(numpy.asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape), dtype='float32')+1)
    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)
    #we take the stride after the transfert as we make c_contiguous data on the GPU.
    img=img[:,:,::img_stride[0],::img_stride[1]]
    kern=kern[:,:,::kern_stride[0],::kern_stride[1]]
    npy_img = npy_img[:,:,::img_stride[0],::img_stride[1]]
    npy_kern = npy_kern[:,:,::kern_stride[0],::kern_stride[1]]
    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv_scipy(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        gpuval = cuda_ndarray.conv(img, kern, mode, subsample=subsample,
                version=version, verbose=verbose)
        t2 = time.time()
        # Rerun to verify the GPU result is deterministic across calls.
        for i in range(nb_iter):
            gpuval2 = cuda_ndarray.conv(img, kern, mode, subsample=subsample,
                    version=version, verbose=0)
            assert numpy.allclose(numpy.asarray(gpuval),numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval)==numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            print >> sys.stdout, "ERROR: shape mismatch", gpuval.shape, cpuval.shape
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol = rtol)
    except NotImplementedError, e:
        # Unsupported shape/stride combination for the requested version.
        print >> sys.stdout, '_params_allgood Failed allclose', e
        rval = False
    if (t2 is not None):
        # Approximate flop count (converted to Mflops) for the timing report.
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = ishape[0] * kshape[0] * kshape[1] * kshape[2] * kshape[3] * ishape[2] * ishape[3] * 2
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1-t0)
        gpu_mflops = approx_fp / (t2-t1)
        print >> sys.stdout, '%15s'% str(ishape), '%15s'% str(kshape),
        print >> sys.stdout, '%12.5f %7.2f %7.2f %7.1f' % (approx_fp,
                cpu_mflops, gpu_mflops,(t1-t0)/(t2-t1))
    if not rval:
        # Mismatch: print an absolute/relative error summary.
        print >> sys.stdout, 'test_'+mode+' id='+str(id)+' FAILED for ishape, kshape, mode, subsample, img_stride, kern_stride, version', ishape, kshape, mode, subsample, img_stride, kern_stride, version
        diff=cpuval-gpuval
        diffabs=numpy.absolute(diff)
        pr_diff=diffabs/numpy.absolute(cpuval)
        nb_close=(diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
        print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs)
        print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size
        print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff)
        print rval
    if not rval and print_!=False:
        # Dump (a slice of) the operands and results for manual inspection.
        if npy_img.shape[0]>5:
            print "img",npy_img[0]
            print "kern",npy_kern[0]
            print "gpu",gpuval[0][0]
            print "cpu",cpuval[0][0]
            print "diff",diff[0][0]
        else:
            print "img",npy_img
            print "kern",npy_kern
            print "gpu",gpuval
            print "cpu",cpuval
            print "diff",diff
    return rval
def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, ones=False):
_params_allgood_header()
nb_failed = 0
nb_tests = 0
failed_version=set()
failed_id=[]
for ver in version:# I put -1 in case we forget to add version in the test to.
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
ret=False
try:
ret = _params_allgood(ishape, kshape, mode,
subsample=subshape, img_stride=istride, kern_stride=kstride,
version=ver, verbose=verbose, random=random, id=id,print_=print_,rtol=rtol,ones=ones)
except:
pass
if not ret:
failed_version.add(ver)
failed_id.append(id)
nb_failed+=1
nb_tests+=1
if nb_failed>0:
print "nb_failed",nb_failed,"on",nb_tests, "failed_version",failed_version, "failed_id",failed_id
assert nb_failed==0
else:
print 'Executed',nb_tests,'different shapes'
def get_basic_shapes():
    """Smallest shape fixtures.

    Each entry is (ishape, kshape, subsample, img_stride, kern_stride)
    with unit subsampling and unit strides.
    """
    img_kern_pairs = [
        # basic test of image and kernel shape
        ((1, 1, 1, 1), (1, 1, 1, 1)),
        ((1, 1, 2, 2), (1, 1, 2, 2)),
        ((1, 1, 3, 3), (1, 1, 2, 2)),
        # basic test for unsquare kernel and image
        ((1, 1, 2, 4), (1, 1, 2, 2)),
        ((1, 1, 3, 4), (1, 1, 2, 2)),
        ((1, 1, 4, 3), (1, 1, 2, 2)),
        ((1, 1, 4, 4), (1, 1, 3, 2)),
        ((1, 1, 4, 4), (1, 1, 2, 3)),
    ]
    unit = (1, 1)
    return [(ish, ksh, unit, unit, unit) for ish, ksh in img_kern_pairs]
def get_shapes(imshp=(1,1), kshp=(1,1), subsample=(1,1), img_stride=(1,1), kern_stride=(1,1)):
    """All combinations of stack size, batch size and nkern > 1, using the
    given image shape, kernel shape and subsample shape."""
    # (batch, stack) prefix for the image, (nkern, stack) for the kernel.
    prefixes = [
        ((1, 2), (1, 2)),  # stack only
        ((3, 1), (1, 1)),  # batch only
        ((1, 1), (2, 1)),  # nkern only
        ((3, 1), (2, 1)),  # batch and nkern
        ((3, 2), (1, 2)),  # batch and stack
        ((1, 2), (2, 2)),  # stack and nkern
        ((2, 2), (2, 2)),  # batch, nkern and stack
        ((3, 2), (4, 2)),  # batch, nkern and stack
    ]
    return [(ipre + imshp, kpre + kshp, subsample, img_stride, kern_stride)
            for ipre, kpre in prefixes]
def get_shapes2(scales_img=(1,1), scales_kern=(1,1), subsample=(1,1), img_stride=(1,1), kern_stride=(1,1)):
    """Stack/batch/nkern variations (via get_shapes) over several base
    image/kernel sizes, each scaled by scales_img / scales_kern."""
    # (image rows/cols, kernel rows/cols) base sizes, before scaling.
    base_sizes = [
        ((1, 1), (1, 1)),  # minimal image and kernel
        ((2, 2), (2, 2)),  # with image and kernel shape
        ((3, 3), (2, 2)),
        ((4, 3), (2, 2)),  # non-square image
        ((3, 4), (2, 2)),  # non-square image
        ((4, 4), (3, 2)),  # non-square kernel
        ((4, 4), (2, 3)),  # non-square kernel
    ]
    shapes = []
    for (irow, icol), (krow, kcol) in base_sizes:
        shapes += get_shapes((irow * scales_img[0], icol * scales_img[1]),
                             (krow * scales_kern[0], kcol * scales_kern[1]),
                             subsample, img_stride, kern_stride)
    return shapes
def test_valid():
    """Exercise the GPU 'valid' convolution against the scipy reference over
    a broad battery of shapes, strides and subsampling.

    Bug fix: a leftover debugging line (`shapes = shapes[425:426]`)
    restricted the whole run to a single shape; it is now commented out
    (matching the commented-out debug slice in test_full) so the complete
    battery runs again.
    """
    # img shape, kern shape, subsample shape
    shapes = get_basic_shapes()
    shapes += get_shapes2()
    # test image stride
    shapes += get_shapes2(scales_img=(2,2), img_stride=(1,2))
    shapes += get_shapes2(scales_img=(2,2), img_stride=(2,1))
    shapes += get_shapes2(scales_img=(2,2), img_stride=(2,2))
    shapes += get_shapes2(scales_img=(2,2), img_stride=(-1,-1))
    shapes += get_shapes2(scales_img=(2,2), kern_stride=(-1,-1))
    # test subsample
    shapes += get_shapes2(scales_img=(2,2), subsample=(2,2))
    shapes += [
        # other tests
          ((2, 1, 2, 2), (1, 1, 2, 2), (1,1), (1,1), (1,1))
        , ((3, 2, 4, 4), (4, 2, 4, 4), (1,1), (1,1), (1,1))
        , ((4, 1, 10, 10), (1, 1, 2, 2), (1,1), (1,1), (1,1))
        , ((1, 1, 4, 4), (1, 1, 2, 3), (1,1), (1,1), (1,1))
        , ((4, 1, 10, 10), (1, 1, 2, 3), (1,1), (1,1), (1,1))
        , ((4, 1, 10, 10), (1, 1, 2, 10), (1,1), (1,1), (1,1))
        , ((4, 1, 20, 10), (1, 1, 2, 10), (1,1), (1,1), (1,1))
        , ((3, 2, 8, 8), (4, 2, 4, 4), (1,1), (1,1), (1,1))  # stack, nkern, bsize
        , ((3, 2, 8, 6), (4, 2, 4, 4), (1,1), (1,1), (1,1))  # + non-square image
        , ((3, 2, 8, 6), (4, 2, 4, 3), (1,1), (1,1), (1,1))  # + non-square kern
        , ((3, 2, 8, 6), (4, 2, 4, 6), (1,1), (1,1), (1,1))  # kernsize==imgsize on one dim
        , ((16, 5, 64, 64), (8, 5, 8, 8), (1,1), (1,1), (1,1))  # a big one
        , ((16, 1, 28, 28), (20, 1, 5, 5), (1,1), (1,1), (1,1))  # MNIST LeNET layer 1
        , ((20, 16, 32, 32), (1, 16, 28, 28), (1,1), (1,1), (1,1))  # layer 1 backprop to weights
        , ((60, 20, 28, 28), (10, 20, 5, 5), (1,1), (2,2), (1,1))  # failing case from test_nnet.py.test_conv_nnet2
        , ((10, 5, 28, 28), (10, 5, 5, 5), (1,1), (2,2), (1,1))  # reduced version that triggered the error
        ]
    shapes += [
          ((60,1,28,28), (20,1,5,5), (1,1), (1,1), (1,1))  # test_lenet_28 1 layers
        , ((60,20,12,12), (30,20,5,5), (1,1), (1,1), (1,1))  # test_lenet_28 2 layers
        , ((60,30,8,8), (20,30,5,5), (1,1), (1,1), (1,1))  # test_lenet_28 bprop 1 full
        , ((20,60,12,12), (30,60,8,8), (1,1), (1,1), (1,1))  # test_lenet_28 bprop 2 valid
        , ((1,60,28,28), (20,60,24,24), (1,1), (1,1), (1,1))  # test_lenet_28 bprop 2 valid
        , ((10,1,64,64), (20,1,7,7), (1,1), (1,1), (1,1))  # test_lenet_64 1 layers
        , ((10,20,29,29), (30,20,7,7), (1,1), (1,1), (1,1))  # test_lenet_64 2 layers
        , ((10,30,23,23), (20,30,7,7), (1,1), (1,1), (1,1))  # test_lenet_64 full
        , ((20,10,29,29), (30,10,23,23), (1,1), (1,1), (1,1))  # test_lenet_64 bprop 1
        , ((1,10,64,64), (20,10,58,58), (1,1), (1,1), (1,1))  # test_lenet_64 bprop 2
        ]
#    shapes = shapes[425:426]  # debug leftover: uncomment to isolate one case
    # -1 in case we forget to add a version to the test;
    # -2 tests the reference version.
    version = [-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13]
    verbose = 0
#    version = [1]
    random = True
    print_ = True
    ones = False
    if ones:
        random = False
    exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5)
def test_full():
    """Exercise the GPU 'full' convolution against the scipy reference over
    shapes, strides and subsampling (including kernels larger than the
    image, which 'full' mode allows)."""
    shapes = get_basic_shapes()
    shapes +=get_shapes2()
    #test image stride
    shapes += get_shapes2(scales_img=(2,2),img_stride=(1,2))
    shapes += get_shapes2(scales_img=(2,2),img_stride=(2,1))
    shapes += get_shapes2(scales_img=(2,2),img_stride=(2,2))
    shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
    shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
    #test subsample
    shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
    shapes += [
        #other test
              ((2, 1, 2, 2), (1, 1, 2, 2), (1,1), (1,1), (1,1))
            , ((3, 2, 4, 4), (4, 2, 4, 4), (1,1), (1,1), (1,1))
            , ((4, 1, 10, 10), (1, 1, 2, 2), (1,1), (1,1), (1,1))
            , ((1, 1, 4, 4), (1, 1, 2, 3), (1,1), (1,1), (1,1))
            , ((4, 1, 10, 10), (1, 1, 2, 3), (1,1), (1,1), (1,1))
            , ((4, 1, 10, 10), (1, 1, 2, 10), (1,1), (1,1), (1,1))
            , ((4, 1, 20, 10), (1, 1, 2, 10), (1,1), (1,1), (1,1))
            , ((3, 2, 8, 8), (4, 2, 4, 4), (1,1), (1,1), (1,1)) #stack, nkern, bsize
            , ((3, 2, 8, 6), (4, 2, 4, 4), (1,1), (1,1), (1,1)) #stack, nkern, bsize, non-square image
            , ((3, 2, 8, 6), (4, 2, 4, 3), (1,1), (1,1), (1,1)) #stack, nkern, bsize, non-square image, non-square kern
            , ((3, 2, 8, 6), (4, 2, 4, 6), (1,1), (1,1), (1,1)) #stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
            , ((16, 5, 64, 64), (8, 5, 8, 8), (1,1), (1,1), (1,1)) # a big one
            , ((16, 1, 28, 28), (20, 1, 5, 5), (1,1), (1,1), (1,1)) # MNIST LeNET layer 1
            , ((20, 16, 32, 32), (1, 16, 28, 28), (1,1), (1,1), (1,1)) # layer 1 backprop to weights
        #other test
            , ((3, 1, 1, 1), (2, 1, 5, 3), (1,1), (1,1), (1,1))#kernel bigger then image
            , ((3, 2, 1, 1), (4, 2, 1, 1), (1,1), (1,1), (1,1))
            , ((3, 2, 4, 4), (4, 2, 2, 6), (1,1), (1,1), (1,1))
            , ((3, 2, 4, 4), (4, 2, 8, 6), (1,1), (1,1), (1,1))#kernel bigger then image
            , ((4, 2, 10, 10), (3, 2, 2, 12), (1,1), (1,1), (1,1))
            ]
    # Only the cases not commented out below are run; the rest of the
    # test_lenet_* battery is kept for manual reactivation.
    shapes += [
#            ((60,1,28,28),(20,1,5,5), (1,1), (1,1), (1,1))#test_lenet_28 1 layers
#            , ((60,20,12,12),(30,20,5,5), (1,1), (1,1), (1,1))#test_lenet_28 2 layers
            ((60,30,8,8),(20,30,5,5), (1,1), (1,1), (1,1))#test_lenet_28 bprop 1 full
#            , ((20,60,12,12),(30,60,8,8), (1,1), (1,1), (1,1))#test_lenet_28 bprop 2 valid
#            , ((1,60,28,28),(20,60,24,24), (1,1), (1,1), (1,1))#test_lenet_28 bprop 2 valid
#            , ((10,1,64,64),(20,1,7,7), (1,1), (1,1), (1,1))#test_lenet_64 1 layers
#            , ((10,20,29,29),(30,20,7,7), (1,1), (1,1), (1,1))#test_lenet_64 2 layers
            , ((10,30,23,23),(20,30,7,7), (1,1), (1,1), (1,1))#test_lenet_64 full
#            , ((20,10,29,29),(30,10,23,23), (1,1), (1,1), (1,1))#test_lenet_64 bprop 1
#            , ((1,10,64,64),(20,10,58,58), (1,1), (1,1), (1,1))#test_lenet_64 bprop 2
            ]
#    shapes=shapes[:277]
    version=[-2,-1,0,1,2,3,4,5]
    verbose=0
#    version=[4]
    random=True
    exec_conv(version, shapes, verbose, random, 'full')
def test_subsample():
    """Check subsampled (strided-output) convolution against the scipy
    reference in both 'full' and 'valid' modes."""
    cases = [
        ((1, 1, 1, 1), (1, 1, 1, 1), (1, 1)),
        ((1, 1, 1, 1), (1, 1, 1, 1), (2, 2)),
        ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3)),
        ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3)),
        ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1)),
    ]
    all_good = True
    _params_allgood_header()
    # Every case is run in both modes before asserting, so a single failure
    # does not hide the others.
    for ishape, kshape, ds in cases:
        for mode in ('full', 'valid'):
            if not _params_allgood(ishape, kshape, mode, subsample=ds):
                all_good = False
    assert all_good
def test_logical_shapes():
    # Placeholder: logical-shape convolution parameters (imshp_logical,
    # kshp_logical, kshp_logical_top_aligned) are not covered yet; this
    # only emits a reminder on stderr.
    print >> sys.stderr, "INFO: test_logical_shapes not implemented (i.e. imshp_logical, kshp_logical, kshp_logical_top_aligned)"
def _test_dummy():
    # Manual debugging helper: run one small 'valid' convolution on the GPU
    # and print both the GPU and the scipy reference results for eyeballing.
    # No assertion is made.
    ishape = (1, 1, 5, 5)
    kshape = (1, 1, 3, 3)
    mode = 'valid'
    subsample = (1,1)
    npy_img = numpy.asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = numpy.asarray(numpy.random.rand(*kshape), dtype='float32')
    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)
    #print >> sys.stdout, '_params_allgood trying ', ishape, kshape, mode
    t2 = None
    rval = True
    t0 = time.time()
    cpuval = py_conv_scipy(npy_img, npy_kern, mode, subsample)
    t1 = time.time()
    gpuval = cuda_ndarray.conv(img, kern, mode, subsample)
    t2 = time.time()
    gpuval = numpy.asarray(gpuval)
    print gpuval
    print cpuval
def benchmark():
    """Timing-oriented run over the test_lenet_* model shapes ('valid' then
    'full'), with verbose=1 and a looser rtol (1e-3) for the big 'valid'
    shapes.  Relies on the timing report printed by exec_conv."""
    shapes_valid = [
        #test_lenet_28 shape
        ((20, 60,12,12), (30,60,8,8), (1,1), (1,1), (1,1))#valid
        ,((60, 20,12,12), (30,20,5,5), (1,1), (1,1), (1,1))#valid
        ,((60, 1,28,28), (20,1,5,5), (1,1), (1,1), (1,1))#valid
        ,((1, 60,28,28), (20,60,24,24), (1,1), (1,1), (1,1))#valid
        #test_lenet_32 shape
        ,((20, 60,14,14), (30,60,10,10), (1,1), (1,1), (1,1))#valid
        ,((60, 20,14,14), (30,20,5,5), (1,1), (1,1), (1,1))#valid
        ,((60, 1,32,32), (20,1,5,5), (1,1), (1,1), (1,1))#valid
        ,((1, 60,32,32), (20,60,28,28), (1,1), (1,1), (1,1))#valid
        #test_lenet_64 shape
        ,((10, 20,29,29), (30,20,7,7), (1,1), (1,1), (1,1))#valid
        ,((20, 10,29,29), (30,10,23,23), (1,1), (1,1), (1,1))#valid
        ,((10, 1,64,64), (20,1,7,7), (1,1), (1,1), (1,1))#valid
        ,((1, 10,64,64), (20,10,58,58), (1,1), (1,1), (1,1))#valid
        #test_lenet_108 shape
        ,((10, 20,51,51), (30,20,7,7), (1,1), (1,1), (1,1))#valid
        ,((20, 10,51,51), (30,10,45,45), (1,1), (1,1), (1,1))#valid
        ,((10, 1,108,108), (20,1,7,7), (1,1), (1,1), (1,1))#valid
        ,((1, 10,108,108), (20,10,102,102), (1,1), (1,1), (1,1))#valid
        #test_lenet_256 shape
        ,((2, 20,124,124), (30,20,9,9), (1,1), (1,1), (1,1))#valid
        ,((20, 2,124,124), (30,2,116,116), (1,1), (1,1), (1,1))#valid
        ,((2, 1,256,256), (20,1,9,9), (1,1), (1,1), (1,1))#valid
        ,((1, 2,256,256), (20,2,248,248), (1,1), (1,1), (1,1))#valid
        ]
    shapes_full = [
        #test_lenet_28 shape
        ((60, 30,8,8), (20, 30, 5, 5), (1,1), (1,1), (1,1))#full
        #test_lenet_32 shape
        ,((60, 30,10,10), (20, 30, 5, 5), (1,1), (1,1), (1,1))#full conv_full_patch_stack_padded' N=1
        #test_lenet_64 shape
        ,((10, 30,23,23), (20, 30, 7, 7), (1,1), (1,1), (1,1))#full conv_full_patch_stack_padded' N=3
        #test_lenet_108 shape
        ,((10, 30,45,45), (20, 30, 7, 7), (1,1), (1,1), (1,1))#full 'conv_full_patch_stack_padded' N=9
        #test_lenet_256 shape
        ,((2, 30,116,116), (20, 30, 9,9), (1,1), (1,1), (1,1))#full conv_reference_full
        ]
    # Uncomment to benchmark only the largest shape of each list.
#    shapes_valid=shapes_valid[-1:]
#    shapes_full=shapes_full[-1:]
    version=[-1]
    verbose=1
    random=True
    exec_conv(version, shapes_valid, verbose, random, 'valid', print_=None, rtol=1e-3)
    exec_conv(version, shapes_full, verbose, random, 'full')
import time, copy, sys
import theano.sandbox.cuda as cuda_ndarray
import numpy
def test_host_to_device():
    # Round-trip host -> device -> host must be lossless for 0d..4d arrays.
    print >>sys.stderr, 'starting test_host_to_dev'
    for shape in ((), (3,), (2,3), (3,4,5,6)):
        a = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
        b = cuda_ndarray.CudaNdarray(a)
        c = numpy.asarray(b)
        assert numpy.all(a == c)
def test_add():
for shape in ((), (3,), (2,3), (1,10000000),(10,1000000), (100,100000),(1000,10000),(10000,1000)):
a0 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
a1 = a0.copy()
b0 = cuda_ndarray.CudaNdarray(a0)
b1 = cuda_ndarray.CudaNdarray(a1)
t0 = time.time()
bsum = b0 + b1
bsum = b0 + b1
t1 = time.time()
gpu_dt = t1 - t0
t0 = time.time()
asum = a0 + a1
asum = a0 + a1
t1 = time.time()
cpu_dt = t1 - t0
print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', cpu_dt / gpu_dt
assert numpy.allclose(asum, numpy.asarray(bsum))
if len(shape)==2:
#test not contiguous version.
#should raise not implemented.
_b = b0[::, ::-1]
b = numpy.asarray(_b)
ones = numpy.ones(shape, dtype='float32')
_ones = cuda_ndarray.CudaNdarray(ones)
t = False
try:
_c = _b+_ones
except:
t = True
assert t
def test_exp():
    """Elementwise exp on the GPU must match numpy.exp for arrays of many
    shapes; also prints a rough CPU/GPU timing comparison."""
    print >>sys.stderr, 'starting test_exp'
    for shape in ((), (3,), (2,3), (1,10000000),(10,1000000), (100,100000),(1000,10000),(10000,1000)):
        a0 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
        a1 = a0.copy()
        b0 = cuda_ndarray.CudaNdarray(a0)
        b1 = cuda_ndarray.CudaNdarray(a1)
        t0 = time.time()
        bsum = b0.exp()
        t1 = time.time()
        gpu_dt = t1 - t0
        t0 = time.time()
        asum = numpy.exp(a1)
        t1 = time.time()
        cpu_dt = t1 - t0
        print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', cpu_dt / gpu_dt
        #c = numpy.asarray(b0+b1)
        # 0-d results are skipped: asum.shape is () which is falsy.
        if asum.shape:
            assert numpy.allclose(asum, numpy.asarray(bsum))
def test_copy():
    # copy.copy and copy.deepcopy of a CudaNdarray must both preserve the
    # original data; the stderr prints trace which step crashes, if any.
    print >>sys.stderr, 'starting test_copy'
    shape = (5,)
    a = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
    print >>sys.stderr, '.. creating device object'
    b = cuda_ndarray.CudaNdarray(a)
    print >>sys.stderr, '.. copy'
    c = copy.copy(b)
    print >>sys.stderr, '.. deepcopy'
    d = copy.deepcopy(b)
    print >>sys.stderr, '.. comparisons'
    assert numpy.allclose(a, numpy.asarray(b))
    assert numpy.allclose(a, numpy.asarray(c))
    assert numpy.allclose(a, numpy.asarray(d))
def test_dot():
    # GPU dot of a (4,7) x (7,6) pair must match numpy.dot.  Only the
    # contiguous/contiguous case is covered here (see the warning below).
    print >>sys.stderr, 'starting test_dot'
    a0 = numpy.asarray(numpy.random.rand(4, 7), dtype='float32')
    a1 = numpy.asarray(numpy.random.rand(7, 6), dtype='float32')
    b0 = cuda_ndarray.CudaNdarray(a0)
    b1 = cuda_ndarray.CudaNdarray(a1)
    assert numpy.allclose(numpy.dot(a0, a1), cuda_ndarray.dot(b0, b1))
    print >> sys.stderr, 'WARNING test_dot: not testing all 8 transpose cases of dot'
def test_sum():
    """reduce_sum with a 0/1 mask (1 = sum over that axis) must match
    numpy's axis sums, for small, large, and high-rank arrays."""
    shape = (2,3)
    a0 = numpy.asarray(numpy.arange(shape[0]*shape[1]).reshape(shape), dtype='float32')
    b0 = cuda_ndarray.CudaNdarray(a0)
    # [1,1]: full reduction to a scalar.
    assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1,1])))
    a0sum = a0.sum(axis=0)
    b0sum = b0.reduce_sum([1,0])
    print 'asum\n',a0sum
    print 'bsum\n',numpy.asarray(b0sum)
    assert numpy.allclose(a0.sum(axis=0), numpy.asarray(b0.reduce_sum([1,0])))
    assert numpy.allclose(a0.sum(axis=1), numpy.asarray(b0.reduce_sum([0,1])))
    # [0,0]: no axis reduced, identity.
    assert numpy.allclose(a0, numpy.asarray(b0.reduce_sum([0,0])))
    # Rank-6 array, reducing axes 0, 3 and 5.
    shape = (3,4,5,6,7,8)
    a0 = numpy.asarray(numpy.arange(3*4*5*6*7*8).reshape(shape), dtype='float32')
    b0 = cuda_ndarray.CudaNdarray(a0)
    assert numpy.allclose(a0.sum(axis=5).sum(axis=3).sum(axis=0), numpy.asarray(b0.reduce_sum([1,0,0,1,0,1])))
    # Wide row reduction.
    shape = (16,2048)
    a0 = numpy.asarray(numpy.arange(16*2048).reshape(shape), dtype='float32')
    b0 = cuda_ndarray.CudaNdarray(a0)
    assert numpy.allclose(a0.sum(axis=0), numpy.asarray(b0.reduce_sum([1,0])))
    shape = (16,10)
    a0 = numpy.asarray(numpy.arange(160).reshape(shape), dtype='float32')
    b0 = cuda_ndarray.CudaNdarray(a0)
    assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1,1])))
def test_reshape():
    # (shape_before, shape_after) pairs with equal element counts; each
    # pair is exercised in both directions below.
    shapelist = [
            ((1,2,3), (1,2,3)),
            ((1,), (1,)),
            ((1,2,3), (3,2,1)),
            ((1,2,3), (6,)),
            ((1,2,3,2), (6,2)),
            ((2,3,2), (6,2))
            ]
    def subtest(shape_1, shape_2):
        # Reshaping on the GPU must give the same values as numpy reshape.
        #print >> sys.stderr, "INFO: shapes", shape_1, shape_2
        a = numpy.asarray(numpy.random.rand(*shape_1), dtype='float32')
        b = cuda_ndarray.CudaNdarray(a)
        aa = a.reshape(shape_2)
        bb = b.reshape(shape_2)
        n_bb = numpy.asarray(bb)
        #print n_bb
        assert numpy.all(aa == n_bb)
    # test working shapes
    for shape_1, shape_2 in shapelist:
        subtest(shape_1, shape_2)
        subtest(shape_2, shape_1)
    print >> sys.stderr, "WARN: TODO: test shape combinations that should give error"
def test_getshape():
    """The .shape of a CudaNdarray must match its numpy source's shape.

    Bug fix: subtest previously ignored its own parameter and read
    `shape_1` from the enclosing loop, so `subtest(shape_2)` silently
    re-tested shape_1; it now uses the `shape` argument it is given.
    """
    shapelist = [
            ((1,2,3), (1,2,3)),
            ((1,), (1,)),
            ((1,2,3), (3,2,1)),
            ((1,2,3), (6,)),
            ((1,2,3,2), (6,2)),
            ((2,3,2), (6,2))
            ]
    def subtest(shape):
        a = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
        b = cuda_ndarray.CudaNdarray(a)
        assert b.shape == a.shape
    for shape_1, shape_2 in shapelist:
        subtest(shape_1)
        subtest(shape_2)
def test_stride_manipulation():
    """Negating every stride and advancing the data pointer to the last
    element must yield the fully reversed array."""
    src = numpy.asarray([[0, 1, 2], [3, 4, 5]], dtype='float32')
    dev = cuda_ndarray.CudaNdarray(src)
    view = dev.view()
    view._dev_data += 0  # no-op pointer move: the view must read identically
    assert numpy.all(src == numpy.asarray(view))
    sizeof_float = 4
    strides = dev._strides
    offset = 0
    # Point at the last element and flip every stride.
    for dim, (length, stride) in enumerate(zip(dev.shape, strides)):
        offset += (length - 1) * stride
        view._set_stride(dim, -stride)
    view._dev_data += offset * sizeof_float
    flipped = numpy.asarray(view)
    assert numpy.all(flipped == [[5, 4, 3], [2, 1, 0]])
def test_copy_subtensor0():
    """deepcopy of a negatively-strided device view must materialize the
    reversed data correctly."""
    sizeof_float = 4
    host = numpy.asarray(numpy.random.rand(30, 20, 5, 5), dtype='float32')
    dev = cuda_ndarray.CudaNdarray(host)
    view = dev.view()
    view_strides = view._strides
    # Reverse the last two axes: flip both strides and point the data at
    # element (0, 0, 4, 4), i.e. 4*5 + 4 = 24 floats past the start.
    view._set_stride(2, -view_strides[2])
    view._set_stride(3, -view_strides[3])
    view._dev_data += 24 * sizeof_float
    dup = copy.deepcopy(view)
    assert numpy.all(host[:, :, ::-1, ::-1] == numpy.asarray(dup))
def test_mapping_getitem_ellipsis():
    """a[...] must be a full view: identical data pointer, strides, shape."""
    host = numpy.asarray(numpy.random.rand(5, 4, 3, 2), dtype='float32')
    dev = cuda_ndarray.CudaNdarray(host)
    full_view = dev[...]
    assert full_view._dev_data == dev._dev_data
    assert full_view._strides == dev._strides
    assert full_view.shape == dev.shape
def test_mapping_getitem_reverse_some_dims():
    """Negative-step slicing of the two innermost dimensions on the device
    must match the equivalent numpy slicing."""
    shape = (5, 4, 3, 2)
    host = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
    dev = cuda_ndarray.CudaNdarray(host)
    reversed_view = dev[:, :, ::-1, ::-1]
    assert numpy.all(numpy.asarray(reversed_view) == host[:, :, ::-1, ::-1])
def test_mapping_getitem_w_int():
    """Indexing with ints, slices, negative steps and Ellipsis must match
    numpy indexing on arrays of rank 0, 1 and 4."""
    def _cmp(x,y):
        # Shape and values must both agree; mismatching values are printed
        # (Python 2 print statements) before failing.
        assert x.shape == y.shape
        if not numpy.all(x == y):
            print x
            print y
        assert numpy.all(x == y)
    # rank 1
    dim =(2,)
    a = numpy.asarray(numpy.random.rand(*dim), dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)
    _cmp(numpy.asarray(_a[1]), a[1])
    _cmp(numpy.asarray(_a[::1]), a[::1])
    _cmp(numpy.asarray(_a[::-1]), a[::-1])
    _cmp(numpy.asarray(_a[...]), a[...])
    # rank 0
    dim =()
    a = numpy.asarray(numpy.random.rand(*dim), dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)
    _cmp(numpy.asarray(_a[...]), a[...])
    # rank 4, mixing ints, negative indices and strided slices
    dim =(5,4,3,2)
    a = numpy.asarray(numpy.random.rand(*dim), dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)
    _cmp(numpy.asarray(_a[:,:,::-1, ::-1]), a[:,:,::-1,::-1])
    _cmp(numpy.asarray(_a[:,:,1,-1]), a[:,:,1,-1])
    _cmp(numpy.asarray(_a[:,:,-1,:]), a[:,:,-1,:])
    _cmp(numpy.asarray(_a[:,::-2,-1,:]), a[:,::-2,-1,:])
    _cmp(numpy.asarray(_a[:,::-2,-1]), a[:,::-2,-1])
    _cmp(numpy.asarray(_a[0,::-2,-1]), a[0,::-2,-1])
    _cmp(numpy.asarray(_a[1]), a[1])
    _cmp(numpy.asarray(_a[...]), a[...])
def test_gemm_vector_vector():
    """GPU dot of (5,1)x(1,5) (outer product) and (1,5)x(5,1) (inner
    product) must match numpy.dot in both shape and values."""
    col = numpy.asarray(numpy.random.rand(5, 1), dtype='float32')
    dev_col = cuda_ndarray.CudaNdarray(col)
    row = numpy.asarray(numpy.random.rand(1, 5), dtype='float32')
    dev_row = cuda_ndarray.CudaNdarray(row)
    outer = cuda_ndarray.dot(dev_col, dev_row)
    assert outer.shape == (5, 5)
    assert numpy.allclose(outer, numpy.dot(col, row))
    inner = cuda_ndarray.dot(dev_row, dev_col)
    assert inner.shape == (1, 1)
    assert numpy.allclose(inner, numpy.dot(row, col))
...@@ -7,10 +7,9 @@ from theano import Op, Type, Apply, Variable, Constant ...@@ -7,10 +7,9 @@ from theano import Op, Type, Apply, Variable, Constant
from theano import tensor from theano import tensor
import theano.config as config import theano.config as config
import cuda_ndarray.cuda_ndarray as cuda
import cuda_ndarray import cuda_ndarray
from theano.sandbox.cuda import filter as type_support_filter
from theano.sandbox.cuda.nvcc_compiler import nvcc_module_compile_str from theano.sandbox.cuda.nvcc_compiler import nvcc_module_compile_str
class CudaNdarrayType(Type): class CudaNdarrayType(Type):
...@@ -51,7 +50,7 @@ class CudaNdarrayType(Type): ...@@ -51,7 +50,7 @@ class CudaNdarrayType(Type):
self.dtype_specs() # error checking is done there self.dtype_specs() # error checking is done there
def filter(self, data, strict=False): def filter(self, data, strict=False):
return type_support_filter(data, self.broadcastable, strict) return cuda.filter(data, self.broadcastable, strict)
@staticmethod @staticmethod
def values_eq(a, b): def values_eq(a, b):
...@@ -254,7 +253,7 @@ class CudaNdarrayType(Type): ...@@ -254,7 +253,7 @@ class CudaNdarrayType(Type):
return ret return ret
def c_libraries(self): def c_libraries(self):
return ['cuda_ndarray', 'cudart'] return ['cudart']
def c_support_code(cls): def c_support_code(cls):
return "" return ""
...@@ -285,5 +284,5 @@ copy_reg.constructor(CudaNdarray_unpickler) ...@@ -285,5 +284,5 @@ copy_reg.constructor(CudaNdarray_unpickler)
def CudaNdarray_pickler(cnda): def CudaNdarray_pickler(cnda):
return (CudaNdarray_unpickler, (numpy.asarray(cnda),)) return (CudaNdarray_unpickler, (numpy.asarray(cnda),))
copy_reg.pickle(cuda_ndarray.CudaNdarray, CudaNdarray_pickler, CudaNdarray_unpickler) copy_reg.pickle(cuda.CudaNdarray, CudaNdarray_pickler, CudaNdarray_unpickler)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论