Commit c032ac66, authored by Arnaud Bergeron

Attempt at fixing convolution. It does not crash but gives wrong results.

Parent 4f1c2697
...@@ -164,7 +164,7 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -164,7 +164,7 @@ class GpuConv(GpuKernelBase, gof.Op):
node_ = copy.copy(node) node_ = copy.copy(node)
assert node.op is node_.op assert node.op is node_.op
if node_.op.max_threads_dim0 is None: if node_.op.max_threads_dim0 is None:
node_.op.max_threads_dim0 = node._inputs[0].type.context.maxlsize node_.op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
return super(GpuConv, node_.op).make_thunk(node_, storage_map, return super(GpuConv, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling) compute_map, no_recycling)
...@@ -179,7 +179,7 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -179,7 +179,7 @@ class GpuConv(GpuKernelBase, gof.Op):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 22) return (0, 23)
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
if node.inputs[0].type.context.kind != "cuda": if node.inputs[0].type.context.kind != "cuda":
...@@ -251,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -251,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
""" % locals() """ % locals()
code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read() code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in ["conv_kernel.cu", "conv_full_kernel.cu"]]) for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
kname = "conv_full_load_everything"
gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags) gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
bin = gk._binary bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin) bcode = ','.join(hex(ord(c)) for c in bin)
...@@ -262,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -262,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
static const char conv_bcode[] = {%(bcode)s}; static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s"; static const char *conv_code = "%(code)s";
""" % locals() """ % locals()
for k in kernels: return mod
mod += "static GpuKernel " + k.name + '_' + name + ";\n"
mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read() def c_support_code_struct(self, node, name):
mod = GpuKernelBase.c_support_code_struct(self, node, name)
with open(os.path.join(os.path.split(__file__)[0], "conv.cu")) as f:
mod += f.read()
return mod return mod
@utils.memoize @utils.memoize
......
...@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64 //Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit. //bit, it is 32 bit.
const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){ __device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64) if (nb_thread < 64)
...@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
if (thread_id < nb_thread) if (thread_id < nb_thread)
{ {
const float * my_src_ptr = (const float *)( const float * my_src_ptr = (const float *)(
((uintptr_t)src) & COALESCED_ALIGN); ((size_t)src) & COALESCED_ALIGN);
my_src_ptr += thread_id; my_src_ptr += thread_id;
while (my_src_ptr < src + N) while (my_src_ptr < src + N)
{ {
......
...@@ -837,8 +837,7 @@ def local_gpu_conv(node, context_name): ...@@ -837,8 +837,7 @@ def local_gpu_conv(node, context_name):
return return
out = gpu_conv(GpuFromHost(context_name)(img), out = gpu_conv(GpuFromHost(context_name)(img),
GpuFromHost(context_name)(kern)) GpuFromHost(context_name)(kern))
# op_lifter want the output on the GPU. assert isinstance(out.type, GpuArrayType)
out = GpuFromHost(context_name)(out)
out.values_eq_approx = values_eq_approx out.values_eq_approx = values_eq_approx
return [out] return [out]
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment