提交 57f60f1f authored 作者: Frederic Bastien's avatar Frederic Bastien

Make the pycuda example work for larger matrix sizes.

上级 171e716c
...@@ -106,21 +106,23 @@ class PycudaElemwiseSourceModuleOp(Op): ...@@ -106,21 +106,23 @@ class PycudaElemwiseSourceModuleOp(Op):
otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim) otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim)
assert self.nout == 1 assert self.nout == 1
#TODO change the scalar op with the good c_code!
fct_name = "pycuda_elemwise_%s"%str(self.scalar_op) fct_name = "pycuda_elemwise_%s"%str(self.scalar_op)
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)]) out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i"+str(id) for id in range(len(inputs))] in_name = ["i"+str(id) for id in range(len(inputs))]
out_name = ["o"+str(id) for id in range(self.nout)] out_name = ["o"+str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {}) c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]) c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]+["int size"])
mod = SourceModule(""" mod = SourceModule("""
#include<Python.h> #include<Python.h>
#include <numpy/arrayobject.h> #include <numpy/arrayobject.h>
__global__ void %s(%s) __global__ void %s(%s)
{ {
int i = threadIdx.x + threadIdx.y*blockDim.x; int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
i += threadIdx.x + threadIdx.y*blockDim.x;
if(i<size){
%s %s
} }
}
"""%(fct_name,c_code_param,c_code)) """%(fct_name,c_code_param,c_code))
self.pycuda_fct = mod.get_function(fct_name) self.pycuda_fct = mod.get_function(fct_name)
return out_node return out_node
...@@ -131,7 +133,16 @@ class PycudaElemwiseSourceModuleOp(Op): ...@@ -131,7 +133,16 @@ class PycudaElemwiseSourceModuleOp(Op):
z, = out z, = out
if z[0] is None or z[0].shape!=inputs[0].shape: if z[0] is None or z[0].shape!=inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape) z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
self.pycuda_fct(inputs[0],inputs[1],z[0], block=(inputs[0].shape[0],inputs[0].shape[1],1)) if inputs[0].shape != inputs[1].shape:
raise TypeError("PycudaElemwiseSourceModuleOp: inputs don't have the same shape!")
if inputs[0].size > 512:
grid = (int(numpy.ceil(inputs[0].size / 512.)),1)
block = (512,1,1)
else:
grid = (1,1)
block = (inputs[0].shape[0],inputs[0].shape[1],1)
self.pycuda_fct(inputs[0], inputs[1], z[0], numpy.intc(inputs[1].size), block=block, grid=grid)
class PycudaElemwiseKernelOp(Op): class PycudaElemwiseKernelOp(Op):
......
...@@ -24,23 +24,27 @@ else: ...@@ -24,23 +24,27 @@ else:
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_pycuda_elemwise_source_module(): def test_pycuda_elemwise_source_module():
for shape in [(5,5), (10,49), (50,49),(500,501),(5000,5001)]:
for op in [theano.scalar.basic.mul, theano.scalar.basic.add]:
x=T.fmatrix('x') x=T.fmatrix('x')
y=T.fmatrix('y') y=T.fmatrix('y')
f=theano.function([x,y],x*y, mode=mode_with_gpu) pycuda_op = PycudaElemwiseSourceModuleOp(op)
print f.maker.env.toposort() elemwise_op = theano.tensor.Elemwise(op)
f2 = theano.function([x,y],x*y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise")) f=theano.function([x,y], elemwise_op(x,y), mode=mode_with_gpu)
print f2.maker.env.toposort() f2 = theano.function([x,y], theano.sandbox.cuda.host_from_gpu(pycuda_op(x,y)))
f3 = theano.function([x,y], elemwise_op(x,y),
mode=mode_with_gpu.including("local_pycuda_gpu_elemwise"))
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()]) assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f2.maker.env.toposort()]) assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f2.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f3.maker.env.toposort()])
val1 = numpy.asarray(numpy.random.rand(5,5), dtype='float32') val1 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
val2 = numpy.asarray(numpy.random.rand(5,5), dtype='float32') val2 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
#val1 = numpy.ones((5,5))
#val2 = numpy.arange(25).reshape(5,5)
assert (f(val1,val2) == f2(val1,val2)).all() assert (f(val1,val2) == f2(val1,val2)).all()
print f(val1,val2) assert (f(val1,val2) == f3(val1,val2)).all()
print f2(val1,val2) #print f(val1,val2)
#print f2(val1,val2)
def test_pycuda_elemwise_kernel(): def test_pycuda_elemwise_kernel():
x=T.fmatrix('x') x=T.fmatrix('x')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论