提交 6657d35b，作者：goodfeli

Merge pull request #947 from nouiz/gpu_reduce

Gpu reduce small change.
...@@ -373,7 +373,12 @@ def use(device, ...@@ -373,7 +373,12 @@ def use(device,
if test_driver: if test_driver:
import theano.sandbox.cuda.tests.test_driver import theano.sandbox.cuda.tests.test_driver
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1() theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if device_properties(use.device_number)["warpSize"] != 32:
raise ValueError("Your GPU has a warpSize != 32. Currently"
" we have code that depends on warpSize"
" being 32. Email the Theano mailing list"
" to tell us about this new GPU, as we"
" don't know of any with these properties")
if move_shared_float32_to_gpu: if move_shared_float32_to_gpu:
handle_shared_float32(True) handle_shared_float32(True)
......
...@@ -624,8 +624,8 @@ class GpuCAReduce(GpuOp): ...@@ -624,8 +624,8 @@ class GpuCAReduce(GpuOp):
# but tensor.elemwise.CAReduce has this exact same check so I guess # but tensor.elemwise.CAReduce has this exact same check so I guess
# this is OK to do # this is OK to do
if self.scalar_op in [scal.minimum, scal.maximum]: if self.scalar_op in [scal.minimum, scal.maximum]:
for i in xrange(nd_in):
conds = [] conds = []
for i in xrange(nd_in):
if self.reduce_mask[i]: if self.reduce_mask[i]:
conds.append("(CudaNdarray_HOST_DIMS(%(x)s)[%(i)s] == 0)" % locals()) conds.append("(CudaNdarray_HOST_DIMS(%(x)s)[%(i)s] == 0)" % locals())
assert len(conds) > 0 assert len(conds) > 0
...@@ -723,7 +723,7 @@ class GpuCAReduce(GpuOp): ...@@ -723,7 +723,7 @@ class GpuCAReduce(GpuOp):
if (verbose) if (verbose)
printf("running kernel_reduce_10_%(name)s\\n"); printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_%(name)s<<<n_blocks, n_threads, kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>( n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
...@@ -862,11 +862,10 @@ class GpuCAReduce(GpuOp): ...@@ -862,11 +862,10 @@ class GpuCAReduce(GpuOp):
extern __shared__ float buf[]; extern __shared__ float buf[];
float myresult = 0.0f; float myresult = 0.0f;
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
if (warpSize != 32) if (warpSize != 32)
{ {
// TODO: set error code
// 2012-09-20 IG: as of today, Fred says he will check
// this elsewhere, in a different PR
Z[0] = -666; Z[0] = -666;
return; return;
} }
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论