提交 c1ebebce authored 作者: Frederic's avatar Frederic

Fix GPU crash/slowness on 64-bit Windows for the convolution.

We were casting a 64-bit pointer to a 32-bit integer. This was done with much help from Aaron Defazio. Fixes gh-1547.
上级 bac100b4
...@@ -29,6 +29,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -29,6 +29,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
*/ */
#ifndef CONV_KERNEL_CU #ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU #define CONV_KERNEL_CU
#include <stdint.h>
/* /*
#define CHECK_BANK_CONFLICTS 0 #define CHECK_BANK_CONFLICTS 0
...@@ -44,7 +45,9 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -44,7 +45,9 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
#define MIN(a, b) ((a) < (b) ? (a) : (b) ) #define MIN(a, b) ((a) < (b) ? (a) : (b) )
#define MAX(a, b) ((a) < (b) ? (b) : (a) ) #define MAX(a, b) ((a) < (b) ? (b) : (a) )
const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers //Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit.
const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){ __device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64) if (nb_thread < 64)
...@@ -73,7 +76,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -73,7 +76,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
if (thread_id < nb_thread) if (thread_id < nb_thread)
{ {
const float * my_src_ptr = (const float *)( const float * my_src_ptr = (const float *)(
((unsigned long int)src) & COALESCED_ALIGN); ((uintptr_t)src) & COALESCED_ALIGN);
my_src_ptr += thread_id; my_src_ptr += thread_id;
while (my_src_ptr < src + N) while (my_src_ptr < src + N)
{ {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论