提交 42927b2b 作者: James Bergstra

merge

...@@ -453,7 +453,6 @@ class GpuSum(Op): ...@@ -453,7 +453,6 @@ class GpuSum(Op):
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output"); PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
%(fail)s; %(fail)s;
} }
} }
""" %locals() """ %locals()
...@@ -472,12 +471,10 @@ class GpuSum(Op): ...@@ -472,12 +471,10 @@ class GpuSum(Op):
#TODO: check if we are ccontiguous when we un-dimshuffle #TODO: check if we are ccontiguous when we un-dimshuffle
#TODO: if only some dims are ccontiguous, call version with less dims. #TODO: if only some dims are ccontiguous, call version with less dims.
print >> sio, 'if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals() print >> sio, 'if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals()
self.c_code_reduce_ccontig(sio, node, name, x, z, fail) self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print >> sio, "}else{" print >> sio, "}else{"
getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
print >> sio, "}" print >> sio, "}"
else: else:
getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
...@@ -826,8 +823,16 @@ class GpuSum(Op): ...@@ -826,8 +823,16 @@ class GpuSum(Op):
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1,CudaNdarray_HOST_DIMS(%(x)s)[1]); dim3 n_blocks(1,
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n"); std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_BLOCKS));
if (verbose) {
fprintf(stderr,
"running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x,
n_blocks.y);
}
assert( CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]);
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1, 1,
...@@ -1175,9 +1180,7 @@ class GpuSum(Op): ...@@ -1175,9 +1180,7 @@ class GpuSum(Op):
""" %locals() """ %locals()
def c_code_cache_version(self): def c_code_cache_version(self):
#return () return (20,)
return (19,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
sio = StringIO.StringIO() sio = StringIO.StringIO()
......
...@@ -363,9 +363,10 @@ class GpuConv(Op): ...@@ -363,9 +363,10 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,8) return (0,9) # raise this whenever modifying any of the support_code_files
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\ return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\ open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read() open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read()
...@@ -405,8 +406,7 @@ class GpuConv(Op): ...@@ -405,8 +406,7 @@ class GpuConv(Op):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s, CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s,
mode, dx, dy, version, verbose); mode, dx, dy, version, verbose);
if(%(out)s && %(out)s==out2) Py_XDECREF(%(out)s);
Py_DECREF(out2);//CudaNdarray_Conv incremented the count to out
%(out)s = out2; %(out)s = out2;
"""%sub """%sub
......
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
//implement the valid convolution only //implement the valid convolution only
/* /*
...@@ -38,6 +40,8 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -38,6 +40,8 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
#define BS(i, j) Bs[i][j] #define BS(i, j) Bs[i][j]
#endif #endif
*/ */
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)<(b)?(a):(b))
const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
#define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float)) #define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float))
...@@ -46,7 +50,8 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -46,7 +50,8 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
if (nb_thread < 64) if (nb_thread < 64)
{ {
if(flipped) if(flipped)
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped. //TODO very slow on device before 1.3.
// make access to kern sequential and access to d_kern flipped.
for(int i=thread_id;i<N;i+=nb_thread) for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[N - 1 - i]; dst[i]=src[N - 1 - i];
//dst[N-1-i]=src[i]; //dst[N-1-i]=src[i];
...@@ -88,10 +93,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -88,10 +93,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
const bool flipped=false, const bool c_contiguous=true){ const bool flipped=false, const bool c_contiguous=true){
if(flipped && ! c_contiguous){ if(flipped && ! c_contiguous){
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread) for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
dst[nb_row*nb_col-1-i]=src[i/nb_col*stride_row+i%nb_col*stride_col]; dst[nb_row*nb_col-1-i]=src[(i/nb_col)*stride_row+(i%nb_col)*stride_col];
}else if(c_contiguous){ }else if(c_contiguous){
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped); load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
}else if(flipped){//c_contiguous==true }else if(flipped){//c_contiguous==true
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped. //TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
int N=nb_col*nb_row; int N=nb_col*nb_row;
...@@ -440,10 +444,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -440,10 +444,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; //int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len - kern_len + 1; //out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; //out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; const int out_wid = blockDim.x;
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[]; extern __shared__ float s_data[];
...@@ -458,9 +464,16 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -458,9 +464,16 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int out_row = ty;//output row int out_row = ty;//output row
const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx; const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID]; //d_img size [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]; float * d_img=&s_data[0];
float * d_reduce=&s_data[img_len*img_wid+(preload_full_kern?kern_len:blockDim.z)*kern_wid];
//d_kern size[(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]
float * d_kern=&s_data[img_len * img_wid];
//d_reduce size [n_threads]
//N.B. this overlaps with d_img and d_kern!
float * d_reduce=&s_data[0];
float sum = 0.0f; float sum = 0.0f;
kern+=kern_stride_nkern*blockIdx.y;//the good nkern kern+=kern_stride_nkern*blockIdx.y;//the good nkern
...@@ -471,30 +484,31 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -471,30 +484,31 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
__syncthreads(); __syncthreads();
load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len, load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
img_stride_col, img_stride_row, false, c_contiguous); img_stride_col, img_stride_row, false, c_contiguous);
if(!(split && ! preload_full_kern))
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
if(split && ! preload_full_kern){ if(split && ! preload_full_kern){
for(int first_row=0, row=tz;first_row<kern_len;row+=blockDim.z, first_row+=blockDim.z){ for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
int idx3; //N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
//TODO: test/check for flipped_kern // (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
if(flipped_kern) // details, but that seemed really weird. tricky bug to find too.
idx3=(kern_len-(first_row)-blockDim.z);//the current last row flipped int idx3 = flipped_kern
else ? max((kern_len - (int)blockDim.z - first_row),0)
idx3=first_row; : first_row;
int len3 = min(blockDim.z, kern_len - first_row);
__syncthreads(); __syncthreads();
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, blockDim.z, load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
const float* idx_kern=&d_kern[tz*kern_stride_row]; const float* idx_kern=&d_kern[tz*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0; float sum2 = 0;
if(row<kern_len) if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2; sum+=sum2;
} }
}else if(split){ }else if(split){
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for(int row=tz;row<kern_len;row+=blockDim.z){ for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
...@@ -504,18 +518,21 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -504,18 +518,21 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int row = tz;//The row of the kernel. int row = tz;//The row of the kernel.
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
//reduce //reduce no sync because previous loop ends with sync
d_reduce[thread_id]=sum; d_reduce[thread_id]=sum;
__syncthreads(); __syncthreads();
if(thread_id<out_len*out_wid){ if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
sum=0; //sum=0;
for(int i=0;i<blockDim.z;i++){ for(int i=1;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*blockDim.x*blockDim.y]; sum+=d_reduce[thread_id+i*out_wid*out_len];
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image out_wid*out_len*blockIdx.y+//the output image
......
...@@ -134,7 +134,9 @@ CudaNdarray_uninit(CudaNdarray*self) ...@@ -134,7 +134,9 @@ CudaNdarray_uninit(CudaNdarray*self)
assert(self->devdata); assert(self->devdata);
if (device_free(self->devdata)) if (device_free(self->devdata))
{ {
std::cerr << "!!!! error freeing device memory\n"; fprintf(stderr,
"!!!! error freeing device memory %p (self=%p)\n",
self->devdata, self);
rval = -1; rval = -1;
} }
self->devdata = NULL; self->devdata = NULL;
...@@ -144,7 +146,9 @@ CudaNdarray_uninit(CudaNdarray*self) ...@@ -144,7 +146,9 @@ CudaNdarray_uninit(CudaNdarray*self)
{ {
if (device_free(self->dev_structure)) if (device_free(self->dev_structure))
{ {
std::cerr << "!!!! error freeing device memory\n"; fprintf(stderr,
"!!!! error freeing dev_structure memory %p (self=%p)\n",
self->dev_structure, self);
rval = -1; rval = -1;
} }
self->dev_structure = NULL; self->dev_structure = NULL;
...@@ -1848,6 +1852,8 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args) ...@@ -1848,6 +1852,8 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
} }
get_gpu_ptr_size<<<1,1>>>(gpu_data); get_gpu_ptr_size<<<1,1>>>(gpu_data);
if (cudaSuccess != cublasGetError()){ if (cudaSuccess != cublasGetError()){
device_free(gpu_data);
return PyErr_Format(PyExc_RuntimeError, return PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_ptr_int_size: error when calling the gpu code."); "CudaNdarray_ptr_int_size: error when calling the gpu code.");
} }
......
...@@ -403,6 +403,11 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype ...@@ -403,6 +403,11 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
self->devdata = 0; self->devdata = 0;
return -1; return -1;
} }
if (0)
fprintf(stderr,
"Allocated devdata %p (self=%p)\n",
self->devdata,
self);
self->data_allocated = size; self->data_allocated = size;
} }
return 0; return 0;
......
...@@ -84,14 +84,36 @@ def py_conv_scipy(img, kern, mode, subsample): ...@@ -84,14 +84,36 @@ def py_conv_scipy(img, kern, mode, subsample):
def _params_allgood_header(): def _params_allgood_header():
print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup" print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), kern_stride=(1,1), version=-1, verbose=0, random=True, print_=None, id=None, rtol=1e-5, atol = 1e-8, nb_iter=0, ones=False): def test_example():
# Test a specific configuration that was failing in one of the big unit-tests
# This configuration information was read from one of the 'FAIL' lines printed by
# _params_allgood during a nosetest run
#
# now it can be tested directly by nosetests test_conv_cuda_ndarray.py:test_example
assert _params_allgood(
(1,1,4,4),
(1,1,3,2),
'valid',
version=13,
random=False)
def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
kern_stride=(1,1), version=-1, verbose=0, random=True, print_=None,
id=None, rtol=1e-5, atol = 1e-8, nb_iter=0, ones=False):
#
# This function is the core of several of the big unit-test drivers,
# but it can also be used very directly on its own to test a specific
# kind of convolution.
#
# See `test_example` (above) for an example of how to use this directly.
#
if ones: if ones:
assert not random assert not random
npy_img = theano._asarray(numpy.ones(ishape), dtype='float32') npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32') npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
elif random: elif random:
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') npy_img = theano._asarray(numpy.random.rand(*ishape)+1, dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') npy_kern = theano._asarray(numpy.random.rand(*kshape)-2, dtype='float32')
else: else:
npy_img = theano._asarray(numpy.arange(numpy.prod(ishape)).reshape(ishape), dtype='float32')+1 npy_img = theano._asarray(numpy.arange(numpy.prod(ishape)).reshape(ishape), dtype='float32')+1
npy_kern = -(theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape), dtype='float32')+1) npy_kern = -(theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape), dtype='float32')+1)
...@@ -155,8 +177,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker ...@@ -155,8 +177,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker
print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs) print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs)
print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size
print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff) print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff)
print rval
if not rval and print_!=False: if not rval and print_!=False:
if npy_img.shape[0]>5: if npy_img.shape[0]>5:
print "img",npy_img[0] print "img",npy_img[0]
...@@ -185,9 +205,19 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on ...@@ -185,9 +205,19 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
ret=False ret=False
try: try:
ret = _params_allgood(ishape, kshape, mode, ret = _params_allgood(ishape,
subsample=subshape, img_stride=istride, kern_stride=kstride, kshape,
version=ver, verbose=verbose, random=random, id=id,print_=print_,rtol=rtol,ones=ones) mode,
subsample=subshape,
img_stride=istride,
kern_stride=kstride,
version=ver,
verbose=verbose,
random=random,
id=id,
print_=print_,
rtol=rtol,
ones=ones)
except Exception, e: except Exception, e:
print ver, id,(ishape, kshape, subshape, istride, kstride) print ver, id,(ishape, kshape, subshape, istride, kstride)
print e print e
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论