提交 5fc89c03 authored 作者: Frederic's avatar Frederic

new GpuConv compiles, but gives the wrong version in some cases!

上级 baf12f54
import copy
import os
import theano
from theano import gof
from theano import config, gof
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox.gpuarray.type import GpuArrayType
class GpuConv(gof.Op):
......@@ -114,6 +119,9 @@ class GpuConv(gof.Op):
str(self.kshp))
def make_node(self, img, kern):
if img.dtype != "float32" or kern.dtype != "float32":
raise NotImplementedError("GpuConv currently only work"
" with float32 dtype")
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
......@@ -121,7 +129,8 @@ class GpuConv(gof.Op):
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
out = GpuArrayType(img.dtype, broadcastable)()
return gof.Apply(self, [img, kern], [out])
def flops(self, inputs, outputs):
""" Useful with the hack in profilemode to print the MFlops"""
......@@ -145,6 +154,8 @@ class GpuConv(gof.Op):
def make_thunk(self, node, storage_map, compute_map, no_recycling):
node_ = copy.copy(node)
assert node.op is node_.op
if config.gpuarray.sync:
raise NotImplementedError("GpuConv do not implement gpuarray.sync Theano flag")
if node_.op.max_threads_dim0 is None:
cuda = theano.sandbox.cuda
device_id = cuda.use.device_number
......@@ -169,20 +180,30 @@ class GpuConv(gof.Op):
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
def c_headers(self):
    """Return the C headers needed by the code generated for this op.

    ``cuda.h`` and the compyte compatibility headers are required by the
    gpuarray backend; ``<stdio.h>`` is used by the verbose/debug printfs
    in the kernel support code.
    """
    # The stale header list from the old CudaNdarray backend
    # ('cuda_ndarray.cuh') was left above the updated return by a bad
    # merge, shadowing it; only the gpuarray headers are returned now.
    return ['<stdio.h>', 'cuda.h',
            '<compyte/extension.h>', '<compyte/numpy_compat.h>']
def c_code_cache_version(self):
    """Version tag for Theano's compiled-code cache.

    Bump this tuple whenever any of the support code files is modified,
    so that previously compiled modules are invalidated.
    """
    version = (0, 20)
    return version
def c_init_code(self):
    """C statements executed once at module initialization.

    Fetches the ``cuda_get_ptr`` extension from compyte so the support
    code can translate a ``gpudata`` handle into a raw CUDA device
    pointer.
    """
    init_stmt = ('cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))'
                 'compyte_get_extension("cuda_get_ptr");')
    return [init_stmt]
def c_support_code_apply(self, node, nodename):
    """Return the C support code (pointer helpers + CUDA kernels).

    REMEMBER TO RAISE c_code_cache_version when changing any of
    the support code files read below.
    """
    # Helpers that turn a PyGpuArrayObject into a raw float*, using the
    # compyte "cuda_get_ptr" extension fetched in c_init_code().
    codes = [
        "CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);",
        "float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}",
        "const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}",
    ]
    # Kernel sources shipped alongside this module.  A merge artifact
    # previously read these files twice (into a `codes` list that was
    # immediately overwritten); they are now read exactly once, and the
    # file handles are closed deterministically.
    here = os.path.split(__file__)[0]
    for fname in ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']:
        with open(os.path.join(here, fname)) as src:
            codes.append(src.read())
    # ''.join is linear and portable; reduce(str.__add__, ...) was
    # quadratic and relied on the Python-2-only `reduce` builtin.
    return ''.join(codes)
def c_compiler(self):
    """Select the nvcc-based compiler, required to build the CUDA
    kernels embedded in the support code."""
    compiler = NVCC_compiler
    return compiler
def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp
out, = out_
......@@ -226,7 +247,8 @@ class GpuConv(gof.Op):
}
// TODO, make out be decref before we alloc out2!
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
PyGpuArrayObject * out2 = (PyGpuArrayObject *)PyGpuArray_Conv(
%(img)s, %(kern)s,
%(out)s, mode,
dx, dy,
version, verbose,
......
......@@ -4,7 +4,8 @@
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
conv_full_patch_split(const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{
int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1;
......@@ -60,7 +61,7 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch( float* img, float* kern, float* out,
conv_full_patch( const float* img, const float* kern, float* out,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack)
{
......@@ -122,7 +123,7 @@ conv_full_patch( float* img, float* kern, float* out,
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void
conv_full_patch_stack( float* img, float* kern, float* out,
conv_full_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -133,7 +134,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
float __shared__ *kern_, *img_;
const float __shared__ *kern_, *img_;
extern __shared__ float s_data[];
const int batch_id = blockIdx.x;
......@@ -201,7 +202,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
conv_full_patch_stack_padded( float* img, float* kern, float* out,
conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
const int nkern, const int nstack,
......@@ -365,7 +366,7 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
}
template<int NSTACK>
__global__ void
conv_full_load_everything( float* img, float* kern, float* out,
conv_full_load_everything( const float* img, const float* kern, float* out,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
......
......@@ -221,7 +221,7 @@ __device__ void store_or_accumulate(float& dst,const float value ){
*/
template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void
conv_patch( float* img, float* kern, float* out,
conv_patch( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack)
{
......@@ -304,7 +304,7 @@ conv_patch( float* img, float* kern, float* out,
*/
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void
conv_patch_stack( float* img, float* kern, float* out,
conv_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
......@@ -375,7 +375,7 @@ conv_patch_stack( float* img, float* kern, float* out,
out_row*out_wid+out_col],sum);
}else{
float __shared__ *kern_, *img_;
const float __shared__ *kern_, *img_;
int __shared__ out_len_max;
kern_=kern+kern_stride_nkern*kern_id;//the good nkern
......@@ -456,7 +456,7 @@ conv_patch_stack( float* img, float* kern, float* out,
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void
conv_patch_stack_reduce( float* img, float* kern, float* out,
conv_patch_stack_reduce( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch,
......@@ -572,7 +572,7 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
*/
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows( float* img, float* kern, float* out,
conv_rows( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -633,7 +633,7 @@ conv_rows( float* img, float* kern, float* out,
*/
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows_stack( float* img, float* kern, float* out,
conv_rows_stack( const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
......@@ -731,7 +731,7 @@ conv_rows_stack( float* img, float* kern, float* out,
*/
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void
conv_rows_stack2( float* img, float* kern, float* out,
conv_rows_stack2(const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
......@@ -831,8 +831,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
......@@ -859,8 +859,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
float sum = 0.0f;
if(stack_loop){
for (; ss < stacklen; ss+=blockDim.x){
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
......@@ -869,8 +869,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
}
}
}else{
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
......@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols)
{
......@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols)
{
......
......@@ -25,6 +25,7 @@ from theano.tests.unittest_tools import seed_rng
from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.conv import GpuConv
import pygpu
gftensor4 = GpuArrayType('float32', [False] * 4)
......@@ -159,11 +160,11 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
t1 = time.time()
i = gftensor4()
k = gftensor4()
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
subsample=subsample,
version=version,
verbose=verbose,
kshp=compile_kshp)(i, k)
op = GpuConv(border_mode=mode,
subsample=subsample,
version=version,
verbose=verbose,
kshp=compile_kshp)(i, k)
f = theano.function([i, k], op, mode=mode_with_gpu)
gpuval = f(img, kern)
t2 = time.time()
......@@ -731,7 +732,7 @@ class TestConv2DGPU(unittest.TestCase):
func = theano.function([a, A], image_estimate, mode=mode_with_gpu)
#theano.printing.debugprint(func,)
assert any([isinstance(node.op, theano.sandbox.cuda.blas.GpuConv)
assert any([isinstance(node.op, GpuConv)
for node in func.maker.fgraph.toposort()])
a_in = numpy.random.randn(*featshp).astype("float32")
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论