提交 33da7e20 authored 作者: Frederic's avatar Frederic

pep8

上级 b0e55935
""" """This file show how we can use Pycuda compiled fct in a Theano
Op. Do not use these ops in production code. See the TODO.
You can use them as a guide for using your pycuda code in a Theano op.
The PycudaElemwiseSourceModuleOp is a Theano op that uses pycuda code
generated with pycuda.compiler.SourceModule.
The PycudaElemwiseKernelOp op uses pycuda code generated with
pycuda.elementwise.ElementwiseKernel. It must be wrapped by
TheanoElementwiseKernel.
There is a test in test_pycuda.py.
This doesn't work with broadcasting and non-contiguous memory, as pycuda
doesn't support them, but we make sure we don't introduce problems.
If the memory is non-contiguous, we create a new copy that is contiguous.
If there are broadcasted dimensions, we raise an error.
""" """
import numpy import numpy
...@@ -19,7 +24,8 @@ import numpy ...@@ -19,7 +24,8 @@ import numpy
import theano import theano
from theano.gof import Op, Apply, local_optimizer, EquilibriumDB from theano.gof import Op, Apply, local_optimizer, EquilibriumDB
from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, GpuOp from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, gpu_contiguous from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.opt import gpu_seqopt from theano.sandbox.cuda.opt import gpu_seqopt
import pycuda_init import pycuda_init
...@@ -30,30 +36,36 @@ import pycuda ...@@ -30,30 +36,36 @@ import pycuda
from pycuda.elementwise import ElementwiseKernel from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule from pycuda.compiler import SourceModule
from pycuda.tools import VectorArg from pycuda.tools import VectorArg
import pycuda.gpuarray
def theano_parse_c_arg(c_arg):
    """Rewrite NumPy C type names in `c_arg`, then parse it with pycuda.

    Every ``npy_*`` name listed in the table below is replaced by its
    plain C spelling. The rewritten declaration string is passed to
    ``pycuda.tools.parse_c_arg`` and that call's result is returned.
    """
    # The substitutions are applied one after another, in this order.
    npy_to_c = (
        ('npy_float32', 'float'),
        ('npy_float64', 'double'),
        ('npy_int32', 'int'),
        ('npy_int8', 'char'),
        ('npy_ucs4', 'unsigned int'),
        ('npy_uint32', 'unsigned int'),
        ('npy_uint16', 'unsigned short'),
        ('npy_uint8', 'unsigned char'),
    )
    for npy_name, c_name in npy_to_c:
        c_arg = c_arg.replace(npy_name, c_name)
    return pycuda.tools.parse_c_arg(c_arg)
class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel): class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
def __init__(self, arguments, operation,
             name="kernel", keep=False, options=None, **kwargs):
    """Build the kernel, accepting Theano-style argument declarations.

    :param arguments: either a comma-separated string of C argument
        declarations, or an already-parsed argument list. A string is
        split on "," and each piece is parsed by `theano_parse_c_arg`.
    :param operation: forwarded unchanged to
        ``pycuda.elementwise.ElementwiseKernel.__init__``.
    :param name: forwarded unchanged to the parent constructor.
    :param keep: forwarded unchanged to the parent constructor.
    :param options: forwarded to the parent constructor. ``None``
        (the default) is replaced by a fresh empty list, so no list
        object is shared between calls.
    :param kwargs: extra keyword arguments, forwarded unchanged.
    """
    if options is None:
        # Avoid a mutable default argument shared by every instance.
        options = []
    if isinstance(arguments, basestring):
        arguments = [theano_parse_c_arg(arg)
                     for arg in arguments.split(",")]
    pycuda.elementwise.ElementwiseKernel.__init__(self, arguments,
                                                  operation, name, keep,
                                                  options, **kwargs)
def __call__(self, *args): def __call__(self, *args):
vectors = [] vectors = []
invocation_args = [] invocation_args = []
for arg, arg_descr in zip(args, self.arguments): for arg, arg_descr in zip(args, self.gen_kwargs["arguments"]):
if isinstance(arg_descr, VectorArg): if isinstance(arg_descr, VectorArg):
vectors.append(arg) vectors.append(arg)
invocation_args.append(arg.gpudata) invocation_args.append(arg.gpudata)
...@@ -62,7 +74,7 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel): ...@@ -62,7 +74,7 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
repr_vec = vectors[0] repr_vec = vectors[0]
invocation_args.append(repr_vec.mem_size) invocation_args.append(repr_vec.mem_size)
if hasattr(repr_vec,"_block") and hasattr(repr_vec,"_grid"): if hasattr(repr_vec, "_block") and hasattr(repr_vec, "_grid"):
self.func.set_block_shape(*repr_vec._block) self.func.set_block_shape(*repr_vec._block)
self.func.prepared_call(repr_vec._grid, *invocation_args) self.func.prepared_call(repr_vec._grid, *invocation_args)
else: else:
...@@ -75,19 +87,20 @@ class PycudaElemwiseSourceModuleOp(GpuOp): ...@@ -75,19 +87,20 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern=None, name=None):
    """Store the scalar op and the optional display name.

    :param scalar_op: the scalar operation this op applies; stored as
        ``self.scalar_op``.
    :param inplace_pattern: accepted for call compatibility but not
        stored: ``self.inplace_pattern`` is always set to ``None``.
        NOTE(review): presumably because this op does not work
        in place -- confirm.
    :param name: optional display name; stored as ``self.name``.
    """
    # The default used to be a mutable ``{}``. The argument is never
    # read, so ``None`` is an equivalent and safer default.
    self.name = name
    self.scalar_op = scalar_op
    self.inplace_pattern = None
def __str__(self):
    """Return the display name of this op.

    An explicit ``self.name`` is returned as is. Otherwise the result is
    the class name followed by ``{scalar_op}``, plus the sorted
    ``inplace_pattern`` items when that pattern is non-empty.
    """
    if self.name is not None:
        return self.name
    prefix = self.__class__.__name__
    if not self.inplace_pattern:
        return prefix + "{%s}" % (self.scalar_op)
    pairs = sorted(self.inplace_pattern.items())
    return prefix + "{%s}%s" % (self.scalar_op, str(pairs))
...@@ -101,17 +114,23 @@ class PycudaElemwiseSourceModuleOp(GpuOp): ...@@ -101,17 +114,23 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
if any([any(i.type.broadcastable) for i in inputs]): if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions") raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove assert len(inputs) == 2 # TODO remove
otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim) otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
assert self.nout == 1 assert self.nout == 1
fct_name = "pycuda_elemwise_%s"%str(self.scalar_op) fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)]) out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i"+str(id) for id in range(len(inputs))] in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o"+str(id) for id in range(self.nout)] out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {}) c_code = self.scalar_op.c_code(out_node, "some_name",
c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]+["int size"]) tuple([n + "[i]" for n in in_name]),
tuple(n + "[i]" for n in out_name), {})
c_code_param = ", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs,
out_name))] +
["int size"])
mod = SourceModule(""" mod = SourceModule("""
#include<Python.h> #include<Python.h>
#include <numpy/arrayobject.h> #include <numpy/arrayobject.h>
...@@ -123,7 +142,7 @@ class PycudaElemwiseSourceModuleOp(GpuOp): ...@@ -123,7 +142,7 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
%s %s
} }
} }
"""%(fct_name,c_code_param,c_code)) """ % (fct_name, c_code_param, c_code))
self.pycuda_fct = mod.get_function(fct_name) self.pycuda_fct = mod.get_function(fct_name)
return out_node return out_node
...@@ -131,37 +150,40 @@ class PycudaElemwiseSourceModuleOp(GpuOp): ...@@ -131,37 +150,40 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
#TODO support broadcast! #TODO support broadcast!
#TODO assert all input have the same shape #TODO assert all input have the same shape
z, = out z, = out
if z[0] is None or z[0].shape!=inputs[0].shape: if z[0] is None or z[0].shape != inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape) z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
if inputs[0].shape != inputs[1].shape: if inputs[0].shape != inputs[1].shape:
raise TypeError("PycudaElemwiseSourceModuleOp: inputs don't have the same shape!") raise TypeError("PycudaElemwiseSourceModuleOp:"
" inputs don't have the same shape!")
if inputs[0].size > 512: if inputs[0].size > 512:
grid = (int(numpy.ceil(inputs[0].size / 512.)),1) grid = (int(numpy.ceil(inputs[0].size / 512.)), 1)
block = (512,1,1) block = (512, 1, 1)
else: else:
grid = (1,1) grid = (1, 1)
block = (inputs[0].shape[0],inputs[0].shape[1],1) block = (inputs[0].shape[0], inputs[0].shape[1], 1)
self.pycuda_fct(inputs[0], inputs[1], z[0], numpy.intc(inputs[1].size), block=block, grid=grid) self.pycuda_fct(inputs[0], inputs[1], z[0],
numpy.intc(inputs[1].size), block=block, grid=grid)
class PycudaElemwiseKernelOp(GpuOp): class PycudaElemwiseKernelOp(GpuOp):
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern=None, name=None):
    """Store the scalar op and the optional display name.

    :param scalar_op: the scalar operation this op applies; stored as
        ``self.scalar_op``.
    :param inplace_pattern: accepted for call compatibility but not
        stored: ``self.inplace_pattern`` is always set to ``None``.
        NOTE(review): presumably because this op does not work
        in place -- confirm.
    :param name: optional display name; stored as ``self.name``.
    """
    # The default used to be a mutable ``{}``. The argument is never
    # read, so ``None`` is an equivalent and safer default.
    self.name = name
    self.scalar_op = scalar_op
    self.inplace_pattern = None
def __str__(self):
    """Return the display name of this op.

    An explicit ``self.name`` is returned as is. Otherwise the result is
    the class name followed by ``{scalar_op}``, plus the sorted
    ``inplace_pattern`` items when that pattern is non-empty.
    """
    if self.name is not None:
        return self.name
    prefix = self.__class__.__name__
    if not self.inplace_pattern:
        return prefix + "{%s}" % (self.scalar_op)
    pairs = sorted(self.inplace_pattern.items())
    return prefix + "{%s}%s" % (self.scalar_op, str(pairs))
...@@ -175,9 +197,10 @@ class PycudaElemwiseKernelOp(GpuOp): ...@@ -175,9 +197,10 @@ class PycudaElemwiseKernelOp(GpuOp):
if any([any(i.type.broadcastable) for i in inputs]): if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions") raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove assert len(inputs) == 2 # TODO remove
# output is broadcastable only along dimensions where all inputs are broadcastable # output is broadcastable only along dimensions where all inputs are
# broadcastable
broadcastable = [] broadcastable = []
for d in xrange(_inputs[0].type.ndim): for d in xrange(_inputs[0].type.ndim):
bcast_d = True bcast_d = True
...@@ -192,14 +215,18 @@ class PycudaElemwiseKernelOp(GpuOp): ...@@ -192,14 +215,18 @@ class PycudaElemwiseKernelOp(GpuOp):
assert self.nout == 1 assert self.nout == 1
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)]) out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i"+str(id) for id in range(len(inputs))] in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o"+str(id) for id in range(self.nout)] out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {}) c_code = self.scalar_op.c_code(out_node, "some_name",
tuple([n + "[i]"for n in in_name]),
tuple(n + "[i]"for n in out_name), {})
self.pycuda_fct = TheanoElementwiseKernel( self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]), ", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs, out_name))]),
c_code, c_code,
"pycuda_elemwise_kernel_%s"%str(self.scalar_op), "pycuda_elemwise_kernel_%s" % str(self.scalar_op),
preamble="""#include<Python.h> preamble="""#include<Python.h>
#include <numpy/arrayobject.h>""") #include <numpy/arrayobject.h>""")
return out_node return out_node
...@@ -207,7 +234,7 @@ class PycudaElemwiseKernelOp(GpuOp): ...@@ -207,7 +234,7 @@ class PycudaElemwiseKernelOp(GpuOp):
def perform(self, node, inputs, out):
    """Run the compiled pycuda kernel on `inputs`, writing into `out`.

    `out` must hold exactly one output storage cell. The cell's buffer is
    reused when it already has the shape of the first input; otherwise a
    zero-filled CudaNdarray of that shape is allocated. The kernel stored
    in ``self.pycuda_fct`` is then called with the inputs followed by the
    output buffer.
    """
    #TODO assert all input have the same shape
    (storage,) = out
    wanted_shape = inputs[0].shape
    buf = storage[0]
    if buf is None or buf.shape != wanted_shape:
        storage[0] = theano.sandbox.cuda.CudaNdarray.zeros(wanted_shape)
    # `storage` is a one-element list, so this appends the output buffer.
    kernel_args = inputs + storage
    self.pycuda_fct(*kernel_args)
...@@ -215,17 +242,23 @@ class PycudaElemwiseKernelOp(GpuOp): ...@@ -215,17 +242,23 @@ class PycudaElemwiseKernelOp(GpuOp):
pycuda_optimizer = EquilibriumDB() pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run") gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")
@local_optimizer([])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp

    The replacement is made only when no input has a broadcastable
    dimension and every input has at most 2 dimensions. In every other
    case nothing is returned.
    """
    if not isinstance(node.op, GpuElemwise):
        return
    if any([any(inp.type.broadcastable) for inp in node.inputs]):
        return
    if not all([inp.ndim <= 2 for inp in node.inputs]):
        return
    pycuda_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op,
                                             node.op.inplace_pattern)
    return [pycuda_op(*node.inputs)]
pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise) pycuda_optimizer.register("local_pycuda_gpu_elemwise",
local_pycuda_gpu_elemwise)
@local_optimizer([]) @local_optimizer([])
def local_pycuda_gpu_elemwise_kernel(node): def local_pycuda_gpu_elemwise_kernel(node):
...@@ -233,8 +266,11 @@ def local_pycuda_gpu_elemwise_kernel(node): ...@@ -233,8 +266,11 @@ def local_pycuda_gpu_elemwise_kernel(node):
GpuElemwise -> PycudaElemwiseKernelOp GpuElemwise -> PycudaElemwiseKernelOp
""" """
if isinstance(node.op, GpuElemwise): if isinstance(node.op, GpuElemwise):
if not any([ any(i.type.broadcastable) for i in node.inputs]): if not any([any(i.type.broadcastable) for i in node.inputs]):
new_op = PycudaElemwiseKernelOp(node.op.scalar_op, node.op.inplace_pattern)(*node.inputs) new_op = PycudaElemwiseKernelOp(node.op.scalar_op,
node.op.inplace_pattern)(
*node.inputs)
return [new_op] return [new_op]
pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel", local_pycuda_gpu_elemwise_kernel, 1.5) pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",
local_pycuda_gpu_elemwise_kernel, 1.5)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论