提交 5fc89c03 作者: Frederic

New GpuConv compiles, but gives the wrong version in some cases!

上级 baf12f54
import copy
import os
import theano import theano
from theano import gof from theano import config, gof
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox.gpuarray.type import GpuArrayType
class GpuConv(gof.Op): class GpuConv(gof.Op):
...@@ -114,6 +119,9 @@ class GpuConv(gof.Op): ...@@ -114,6 +119,9 @@ class GpuConv(gof.Op):
str(self.kshp)) str(self.kshp))
def make_node(self, img, kern): def make_node(self, img, kern):
if img.dtype != "float32" or kern.dtype != "float32":
raise NotImplementedError("GpuConv currently only work"
" with float32 dtype")
if img.type.ndim != 4: if img.type.ndim != 4:
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4: if kern.type.ndim != 4:
...@@ -121,7 +129,8 @@ class GpuConv(gof.Op): ...@@ -121,7 +129,8 @@ class GpuConv(gof.Op):
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0], broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False] False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()]) out = GpuArrayType(img.dtype, broadcastable)()
return gof.Apply(self, [img, kern], [out])
def flops(self, inputs, outputs): def flops(self, inputs, outputs):
""" Useful with the hack in profilemode to print the MFlops""" """ Useful with the hack in profilemode to print the MFlops"""
...@@ -145,6 +154,8 @@ class GpuConv(gof.Op): ...@@ -145,6 +154,8 @@ class GpuConv(gof.Op):
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling):
node_ = copy.copy(node) node_ = copy.copy(node)
assert node.op is node_.op assert node.op is node_.op
if config.gpuarray.sync:
raise NotImplementedError("GpuConv do not implement gpuarray.sync Theano flag")
if node_.op.max_threads_dim0 is None: if node_.op.max_threads_dim0 is None:
cuda = theano.sandbox.cuda cuda = theano.sandbox.cuda
device_id = cuda.use.device_number device_id = cuda.use.device_number
...@@ -169,20 +180,30 @@ class GpuConv(gof.Op): ...@@ -169,20 +180,30 @@ class GpuConv(gof.Op):
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G'] return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
def c_headers(self): def c_headers(self):
return ['cuda_ndarray.cuh', '<stdio.h>'] return ['<stdio.h>', 'cuda.h',
'<compyte/extension.h>', '<compyte/numpy_compat.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 20) return (0, 20)
def c_init_code(self):
return ['cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
# these files # these files
files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu'] files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
codes = [open(os.path.join(os.path.split(__file__)[0], f)).read() codes = ["CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);",
for f in files] "float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}",
"const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}"]
codes += [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files]
return reduce(str.__add__, codes) return reduce(str.__add__, codes)
def c_compiler(self):
return NVCC_compiler
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp img, kern = inp
out, = out_ out, = out_
...@@ -226,7 +247,8 @@ class GpuConv(gof.Op): ...@@ -226,7 +247,8 @@ class GpuConv(gof.Op):
} }
// TODO, make out be decref before we alloc out2! // TODO, make out be decref before we alloc out2!
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, PyGpuArrayObject * out2 = (PyGpuArrayObject *)PyGpuArray_Conv(
%(img)s, %(kern)s,
%(out)s, mode, %(out)s, mode,
dx, dy, dx, dy,
version, verbose, version, verbose,
......
...@@ -4,7 +4,8 @@ ...@@ -4,7 +4,8 @@
//grid block size=batch_id //grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void __global__ void
conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img_wid, int kern_len, int kern_wid, int nb_split) conv_full_patch_split(const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
...@@ -60,7 +61,7 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img ...@@ -60,7 +61,7 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
//grid block size=batch_id, nkern //grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void __global__ void
conv_full_patch( float* img, float* kern, float* out, conv_full_patch( const float* img, const float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack) int kern_len, int kern_wid, int nkern, int nstack)
{ {
...@@ -122,7 +123,7 @@ conv_full_patch( float* img, float* kern, float* out, ...@@ -122,7 +123,7 @@ conv_full_patch( float* img, float* kern, float* out,
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d> template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void __global__ void
conv_full_patch_stack( float* img, float* kern, float* out, conv_full_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
...@@ -133,7 +134,7 @@ conv_full_patch_stack( float* img, float* kern, float* out, ...@@ -133,7 +134,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;//blockDim.z* nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
float __shared__ *kern_, *img_; const float __shared__ *kern_, *img_;
extern __shared__ float s_data[]; extern __shared__ float s_data[];
const int batch_id = blockIdx.x; const int batch_id = blockIdx.x;
...@@ -201,7 +202,7 @@ conv_full_patch_stack( float* img, float* kern, float* out, ...@@ -201,7 +202,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
*/ */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem > template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void __global__ void
conv_full_patch_stack_padded( float* img, float* kern, float* out, conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int img_len, const int img_wid,
const int kern_len, const int kern_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
...@@ -365,7 +366,7 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co ...@@ -365,7 +366,7 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
} }
template<int NSTACK> template<int NSTACK>
__global__ void __global__ void
conv_full_load_everything( float* img, float* kern, float* out, conv_full_load_everything( const float* img, const float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
......
...@@ -221,7 +221,7 @@ __device__ void store_or_accumulate(float& dst,const float value ){ ...@@ -221,7 +221,7 @@ __device__ void store_or_accumulate(float& dst,const float value ){
*/ */
template<bool flipped_kern, int KERN_WIDTH, bool split> template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void __global__ void
conv_patch( float* img, float* kern, float* out, conv_patch( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack) int nkern, int nstack)
{ {
...@@ -304,7 +304,7 @@ conv_patch( float* img, float* kern, float* out, ...@@ -304,7 +304,7 @@ conv_patch( float* img, float* kern, float* out,
*/ */
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample> template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void __global__ void
conv_patch_stack( float* img, float* kern, float* out, conv_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid, int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
...@@ -375,7 +375,7 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -375,7 +375,7 @@ conv_patch_stack( float* img, float* kern, float* out,
out_row*out_wid+out_col],sum); out_row*out_wid+out_col],sum);
}else{ }else{
float __shared__ *kern_, *img_; const float __shared__ *kern_, *img_;
int __shared__ out_len_max; int __shared__ out_len_max;
kern_=kern+kern_stride_nkern*kern_id;//the good nkern kern_=kern+kern_stride_nkern*kern_id;//the good nkern
...@@ -456,7 +456,7 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -456,7 +456,7 @@ conv_patch_stack( float* img, float* kern, float* out,
*/ */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern> template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void __global__ void
conv_patch_stack_reduce( float* img, float* kern, float* out, conv_patch_stack_reduce( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
...@@ -572,7 +572,7 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -572,7 +572,7 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
*/ */
template<int KERN_WIDTH, bool c_contiguous> template<int KERN_WIDTH, bool c_contiguous>
__global__ void __global__ void
conv_rows( float* img, float* kern, float* out, conv_rows( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
...@@ -633,7 +633,7 @@ conv_rows( float* img, float* kern, float* out, ...@@ -633,7 +633,7 @@ conv_rows( float* img, float* kern, float* out,
*/ */
template<int KERN_WIDTH, bool c_contiguous> template<int KERN_WIDTH, bool c_contiguous>
__global__ void __global__ void
conv_rows_stack( float* img, float* kern, float* out, conv_rows_stack( const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
...@@ -731,7 +731,7 @@ conv_rows_stack( float* img, float* kern, float* out, ...@@ -731,7 +731,7 @@ conv_rows_stack( float* img, float* kern, float* out,
*/ */
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern> template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void __global__ void
conv_rows_stack2( float* img, float* kern, float* out, conv_rows_stack2(const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
...@@ -831,8 +831,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -831,8 +831,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C , float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols, int subsample_rows, int subsample_cols,
const int initial_reduce_boundary) const int initial_reduce_boundary)
...@@ -859,8 +859,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -859,8 +859,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
float sum = 0.0f; float sum = 0.0f;
if(stack_loop){ if(stack_loop){
for (; ss < stacklen; ss+=blockDim.x){ for (; ss < stacklen; ss+=blockDim.x){
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R; const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C; const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc) for (int cc = 0; cc < kern_wid; ++cc)
{ {
sum += kk_0[0] * ii_0[0]; sum += kk_0[0] * ii_0[0];
...@@ -869,8 +869,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -869,8 +869,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
} }
} }
}else{ }else{
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R; const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C; const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc) for (int cc = 0; cc < kern_wid; ++cc)
{ {
sum += kk_0[0] * ii_0[0]; sum += kk_0[0] * ii_0[0];
...@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen, ...@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C , float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols) int subsample_rows, int subsample_cols)
{ {
...@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen, ...@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions int out_len, int out_wid, //physical dimensions
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols) int subsample_rows, int subsample_cols)
{ {
......
...@@ -25,6 +25,7 @@ from theano.tests.unittest_tools import seed_rng ...@@ -25,6 +25,7 @@ from theano.tests.unittest_tools import seed_rng
from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu, from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
mode_without_gpu) mode_without_gpu)
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.conv import GpuConv
import pygpu import pygpu
gftensor4 = GpuArrayType('float32', [False] * 4) gftensor4 = GpuArrayType('float32', [False] * 4)
...@@ -159,11 +160,11 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -159,11 +160,11 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
t1 = time.time() t1 = time.time()
i = gftensor4() i = gftensor4()
k = gftensor4() k = gftensor4()
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode, op = GpuConv(border_mode=mode,
subsample=subsample, subsample=subsample,
version=version, version=version,
verbose=verbose, verbose=verbose,
kshp=compile_kshp)(i, k) kshp=compile_kshp)(i, k)
f = theano.function([i, k], op, mode=mode_with_gpu) f = theano.function([i, k], op, mode=mode_with_gpu)
gpuval = f(img, kern) gpuval = f(img, kern)
t2 = time.time() t2 = time.time()
...@@ -731,7 +732,7 @@ class TestConv2DGPU(unittest.TestCase): ...@@ -731,7 +732,7 @@ class TestConv2DGPU(unittest.TestCase):
func = theano.function([a, A], image_estimate, mode=mode_with_gpu) func = theano.function([a, A], image_estimate, mode=mode_with_gpu)
#theano.printing.debugprint(func,) #theano.printing.debugprint(func,)
assert any([isinstance(node.op, theano.sandbox.cuda.blas.GpuConv) assert any([isinstance(node.op, GpuConv)
for node in func.maker.fgraph.toposort()]) for node in func.maker.fgraph.toposort()])
a_in = numpy.random.randn(*featshp).astype("float32") a_in = numpy.random.randn(*featshp).astype("float32")
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论