提交 33da7e20 authored 作者: Frederic's avatar Frederic

pep8

上级 b0e55935
"""
This file shows how we can use PyCUDA-compiled functions in a Theano
Op. Do not use these ops in production code. See the TODO.

You can use them as a guide for using your PyCUDA code in a Theano op.

The PycudaElemwiseSourceModuleOp is a Theano op that uses PyCUDA code
generated with pycuda.compiler.SourceModule.

The PycudaElemwiseKernelOp op uses PyCUDA code generated with
pycuda.elementwise.ElementwiseKernel. It must be wrapped by
TheanoElementwiseKernel.

There is a test in test_pycuda.py.

This doesn't work with broadcasting or non-contiguous memory, as PyCUDA
doesn't support them, but we make sure we don't introduce problems.
    If the memory is non-contiguous, we create a new copy that is contiguous.
    If there are broadcasted dimensions, we raise an error.
"""
import numpy
......@@ -19,7 +24,8 @@ import numpy
import theano
from theano.gof import Op, Apply, local_optimizer, EquilibriumDB
from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, gpu_contiguous
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.opt import gpu_seqopt
import pycuda_init
......@@ -30,30 +36,36 @@ import pycuda
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
from pycuda.tools import VectorArg
import pycuda.gpuarray
def theano_parse_c_arg(c_arg):
    """Parse a C argument declaration written with NumPy type names.

    Every ``npy_*`` type name listed below is textually replaced by the
    plain C type next to it, in the order given, and the resulting string
    is handed to ``pycuda.tools.parse_c_arg``, whose result is returned.

    :param c_arg: a C argument declaration as a string.
    """
    npy_to_c = (
        ('npy_float32', 'float'),
        ('npy_float64', 'double'),
        ('npy_int32', 'int'),
        ('npy_int8', 'char'),
        ('npy_ucs4', 'unsigned int'),
        ('npy_uint32', 'unsigned int'),
        ('npy_uint16', 'unsigned short'),
        ('npy_uint8', 'unsigned char'),
    )
    for npy_name, c_name in npy_to_c:
        c_arg = c_arg.replace(npy_name, c_name)
    return pycuda.tools.parse_c_arg(c_arg)
class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
def __init__(self, arguments, operation,
             name="kernel", keep=False, options=None, **kwargs):
    """Build the kernel, accepting NumPy type names in ``arguments``.

    :param arguments: either a comma-separated string of C argument
        declarations (each piece is parsed with ``theano_parse_c_arg``)
        or any other value, which is forwarded unchanged.
    :param operation: forwarded unchanged to
        ``pycuda.elementwise.ElementwiseKernel.__init__``.
    :param name: forwarded unchanged (defaults to ``"kernel"``).
    :param keep: forwarded unchanged (defaults to ``False``).
    :param options: forwarded to the base class; ``None`` (the default)
        is replaced by a fresh empty list.
    :param kwargs: forwarded unchanged to the base class.
    """
    # A fresh list per call: a ``[]`` default in the signature would be a
    # single list object shared by every instance.
    if options is None:
        options = []
    if isinstance(arguments, basestring):
        arguments = [theano_parse_c_arg(arg)
                     for arg in arguments.split(",")]
    pycuda.elementwise.ElementwiseKernel.__init__(self, arguments,
                                                  operation, name, keep,
                                                  options, **kwargs)
def __call__(self, *args):
vectors = []
invocation_args = []
for arg, arg_descr in zip(args, self.arguments):
for arg, arg_descr in zip(args, self.gen_kwargs["arguments"]):
if isinstance(arg_descr, VectorArg):
vectors.append(arg)
invocation_args.append(arg.gpudata)
......@@ -62,7 +74,7 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
repr_vec = vectors[0]
invocation_args.append(repr_vec.mem_size)
if hasattr(repr_vec,"_block") and hasattr(repr_vec,"_grid"):
if hasattr(repr_vec, "_block") and hasattr(repr_vec, "_grid"):
self.func.set_block_shape(*repr_vec._block)
self.func.prepared_call(repr_vec._grid, *invocation_args)
else:
......@@ -75,19 +87,20 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
# Read-only properties: the input and output counts are taken from the
# wrapped scalar op (``self.scalar_op.nin`` / ``self.scalar_op.nout``).
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern=None, name=None):
    """Remember the scalar op to apply and an optional display name.

    :param scalar_op: the scalar op wrapped by this op; stored as
        ``self.scalar_op``.
    :param inplace_pattern: accepted but not used: the attribute
        ``self.inplace_pattern`` is always set to ``None``.
    :param name: optional name; when not ``None`` it is what ``__str__``
        returns.
    """
    self.name = name
    self.scalar_op = scalar_op
    # NOTE(review): the caller's inplace_pattern is dropped here,
    # presumably because this op does not work in place -- confirm
    # before storing it.
    self.inplace_pattern = None
def __str__(self):
    """Return ``self.name`` when set, else a generated description.

    The generated form is the class name followed by the scalar op in
    braces and, when ``self.inplace_pattern`` is non-empty, the sorted
    list of its items.
    """
    if self.name is not None:
        return self.name
    cls_name = self.__class__.__name__
    if not self.inplace_pattern:
        return cls_name + "{%s}" % (self.scalar_op)
    pattern_items = sorted(self.inplace_pattern.items())
    return cls_name + "{%s}%s" % (self.scalar_op, str(pattern_items))
......@@ -101,17 +114,23 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove
assert len(inputs) == 2 # TODO remove
otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim)
otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
assert self.nout == 1
fct_name = "pycuda_elemwise_%s"%str(self.scalar_op)
fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i"+str(id) for id in range(len(inputs))]
out_name = ["o"+str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]+["int size"])
in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name",
tuple([n + "[i]" for n in in_name]),
tuple(n + "[i]" for n in out_name), {})
c_code_param = ", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs,
out_name))] +
["int size"])
mod = SourceModule("""
#include<Python.h>
#include <numpy/arrayobject.h>
......@@ -123,7 +142,7 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
%s
}
}
"""%(fct_name,c_code_param,c_code))
""" % (fct_name, c_code_param, c_code))
self.pycuda_fct = mod.get_function(fct_name)
return out_node
......@@ -131,37 +150,40 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
#TODO support broadcast!
#TODO assert all input have the same shape
z, = out
if z[0] is None or z[0].shape!=inputs[0].shape:
if z[0] is None or z[0].shape != inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
if inputs[0].shape != inputs[1].shape:
raise TypeError("PycudaElemwiseSourceModuleOp: inputs don't have the same shape!")
raise TypeError("PycudaElemwiseSourceModuleOp:"
" inputs don't have the same shape!")
if inputs[0].size > 512:
grid = (int(numpy.ceil(inputs[0].size / 512.)),1)
block = (512,1,1)
grid = (int(numpy.ceil(inputs[0].size / 512.)), 1)
block = (512, 1, 1)
else:
grid = (1,1)
block = (inputs[0].shape[0],inputs[0].shape[1],1)
self.pycuda_fct(inputs[0], inputs[1], z[0], numpy.intc(inputs[1].size), block=block, grid=grid)
grid = (1, 1)
block = (inputs[0].shape[0], inputs[0].shape[1], 1)
self.pycuda_fct(inputs[0], inputs[1], z[0],
numpy.intc(inputs[1].size), block=block, grid=grid)
class PycudaElemwiseKernelOp(GpuOp):
# Read-only properties: the input and output counts are taken from the
# wrapped scalar op (``self.scalar_op.nin`` / ``self.scalar_op.nout``).
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern=None, name=None):
    """Remember the scalar op to apply and an optional display name.

    :param scalar_op: the scalar op wrapped by this op; stored as
        ``self.scalar_op``.
    :param inplace_pattern: accepted but not used: the attribute
        ``self.inplace_pattern`` is always set to ``None``.
    :param name: optional name; when not ``None`` it is what ``__str__``
        returns.
    """
    self.name = name
    self.scalar_op = scalar_op
    # NOTE(review): the caller's inplace_pattern is dropped here,
    # presumably because this op does not work in place -- confirm
    # before storing it.
    self.inplace_pattern = None
def __str__(self):
    """Describe the op: its explicit name, or class name plus scalar op.

    When no name was given, the text is the class name followed by the
    scalar op in braces; a non-empty ``self.inplace_pattern`` appends the
    sorted list of its items.
    """
    if self.name is None:
        prefix = self.__class__.__name__
        if self.inplace_pattern:
            ordered = sorted(self.inplace_pattern.items())
            return prefix + "{%s}%s" % (self.scalar_op, str(ordered))
        return prefix + "{%s}" % (self.scalar_op)
    return self.name
......@@ -175,9 +197,10 @@ class PycudaElemwiseKernelOp(GpuOp):
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove
assert len(inputs) == 2 # TODO remove
# output is broadcastable only along dimensions where all inputs are broadcastable
# output is broadcastable only along dimensions where all inputs are
# broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
......@@ -192,14 +215,18 @@ class PycudaElemwiseKernelOp(GpuOp):
assert self.nout == 1
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i"+str(id) for id in range(len(inputs))]
out_name = ["o"+str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name",
tuple([n + "[i]"for n in in_name]),
tuple(n + "[i]"for n in out_name), {})
self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]),
", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs, out_name))]),
c_code,
"pycuda_elemwise_kernel_%s"%str(self.scalar_op),
"pycuda_elemwise_kernel_%s" % str(self.scalar_op),
preamble="""#include<Python.h>
#include <numpy/arrayobject.h>""")
return out_node
......@@ -207,7 +234,7 @@ class PycudaElemwiseKernelOp(GpuOp):
def perform(self, node, inputs, out):
    """Run the compiled element-wise kernel on ``inputs``.

    :param node: the Apply node being executed (not used here).
    :param inputs: list of input arrays; all must have the same shape.
    :param out: one-element sequence holding the output storage cell
        ``z``; ``z[0]`` is reused when it already has the shape of the
        first input and is otherwise replaced by a new zero-filled
        ``CudaNdarray`` of that shape.
    :raises TypeError: if the inputs do not all have the same shape.
    """
    z, = out
    ref_shape = inputs[0].shape
    # The generated kernel code indexes every input and the output with
    # the same index, so mismatched shapes could make it access memory
    # outside the smaller buffers. Fail loudly instead.
    if any(inp.shape != ref_shape for inp in inputs[1:]):
        raise TypeError("PycudaElemwiseKernelOp:"
                        " inputs don't have the same shape!")
    if z[0] is None or z[0].shape != ref_shape:
        z[0] = theano.sandbox.cuda.CudaNdarray.zeros(ref_shape)
    # The kernel takes the inputs followed by the output buffer.
    i = inputs + z
    self.pycuda_fct(*i)
......@@ -215,17 +242,23 @@ class PycudaElemwiseKernelOp(GpuOp):
# Optimizer database into which the PyCUDA rewrites of this file are
# registered.
pycuda_optimizer = EquilibriumDB()
# Add that database to the GPU optimization sequence with the "fast_run"
# tag. NOTE(review): 1.5 is presumably its position relative to the other
# gpu_seqopt entries -- confirm against them.
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")
@local_optimizer([])
def local_pycuda_gpu_elemwise(node):
    """Rewrite GpuElemwise -> PycudaElemwiseSourceModuleOp.

    The rewrite only applies when the node's op is a ``GpuElemwise``, no
    input has a broadcastable dimension and every input has at most two
    dimensions. In that case a one-element list with the replacement
    output is returned; otherwise ``None`` is returned.
    """
    if not isinstance(node.op, GpuElemwise):
        return None
    if any(any(inp.type.broadcastable) for inp in node.inputs):
        return None
    if not all(inp.ndim <= 2 for inp in node.inputs):
        return None
    pycuda_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op,
                                             node.op.inplace_pattern)
    return [pycuda_op(*node.inputs)]


pycuda_optimizer.register("local_pycuda_gpu_elemwise",
                          local_pycuda_gpu_elemwise)
@local_optimizer([])
def local_pycuda_gpu_elemwise_kernel(node):
    """Rewrite GpuElemwise -> PycudaElemwiseKernelOp.

    Applies when the node's op is a ``GpuElemwise`` and none of its
    inputs has a broadcastable dimension; the result is then a
    one-element list with the replacement output. In every other case
    ``None`` is returned.
    """
    is_gpu_elemwise = isinstance(node.op, GpuElemwise)
    if is_gpu_elemwise and not any(any(inp.type.broadcastable)
                                   for inp in node.inputs):
        kernel_op = PycudaElemwiseKernelOp(node.op.scalar_op,
                                           node.op.inplace_pattern)
        replacement = kernel_op(*node.inputs)
        return [replacement]
    return None


pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",
                          local_pycuda_gpu_elemwise_kernel, 1.5)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论