提交 3f5e0726 authored 作者: Ian Goodfellow's avatar Ian Goodfellow

merged in changes

"""
This file show how we can use Pycuda compiled fct in a Theano Op. Do no use them in production code. See the TODO.
This file shows how we can use PyCUDA-compiled functions in a Theano Op. Do not use these ops in production code. See the TODO.
You can use them as a guide to use your pycuda code into a Theano op.
The PycudaElemwiseSourceModule op use pycuda code generated with pycuda.compiler.SourceModule
The PycudaElemwiseSourceModuleOp is a Theano op use pycuda code generated with pycuda.compiler.SourceModule
The PycudaElemwiseKernel op use pycuda code generated with pycuda.elementwise.ElementwiseKernel
The PycudaElemwiseKernelOp op use pycuda code generated with pycuda.elementwise.ElementwiseKernel. It must be wrapper by TheanoElementwiseKernel.
There is a test in test_pycuda.py.
This don't work with broadcast and non-contiguous memory as pycuda don't support that, but we make sure we don't introduce problem.
This doesn't work with broadcasting or non-contiguous memory, as PyCUDA doesn't support them, but we make sure we don't introduce problems.
If the memory is non-contiguous, we create a new copy that is contiguous.
If there are broadcasted dimensions, we raise an error.
"""
import numpy
......@@ -25,10 +26,51 @@ from theano.sandbox.cuda.opt import gpu_seqopt
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
from pycuda.gpuarray import splay
from pycuda.tools import VectorArg
import pycuda.autoinit
class PycudaElemwiseSourceModule(Op):
def theano_parse_c_arg(c_arg):
    """Parse a C argument declaration that may use NumPy dtype names.

    Rewrites Theano/NumPy C type names (``npy_float32``, ``npy_uint8``,
    ...) into plain C type names, then delegates the actual parsing to
    ``pycuda.tools.parse_c_arg``.
    """
    # Ordered translation table from npy_* names to C type names.
    _NPY_TO_C = (
        ('npy_float32', 'float'),
        ('npy_float64', 'double'),
        ('npy_int32', 'int'),
        ('npy_int8', 'char'),
        ('npy_ucs4', 'unsigned int'),
        ('npy_uint32', 'unsigned int'),
        ('npy_uint16', 'unsigned short'),
        ('npy_uint8', 'unsigned char'),
    )
    for npy_name, c_name in _NPY_TO_C:
        c_arg = c_arg.replace(npy_name, c_name)
    return pycuda.tools.parse_c_arg(c_arg)
class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
    """ElementwiseKernel that understands Theano/NumPy C type names.

    When ``arguments`` is given as a string, each declaration is run
    through ``theano_parse_c_arg`` so dtypes such as ``npy_float32`` can
    appear in the signature before pycuda parses them.
    """

    def __init__(self, arguments, operation,
                 name="kernel", keep=False, options=None, **kwargs):
        # Fix: avoid a mutable default argument ([]); a shared list could
        # be mutated across instances.  None keeps the call backward
        # compatible and is converted to a fresh list here.
        if options is None:
            options = []
        if isinstance(arguments, str):
            # Translate npy_* dtype names to C before pycuda parses them.
            arguments = [theano_parse_c_arg(arg)
                         for arg in arguments.split(",")]
        pycuda.elementwise.ElementwiseKernel.__init__(
            self, arguments, operation, name, keep, options, **kwargs)

    def __call__(self, *args):
        """Launch the kernel.

        Reuses the ``_block``/``_grid`` launch configuration attached to
        the first vector argument when present (set by the Theano op);
        otherwise falls back to pycuda's ``splay`` heuristic.
        """
        vectors = []
        invocation_args = []
        for arg, arg_descr in zip(args, self.arguments):
            if isinstance(arg_descr, VectorArg):
                # Vector (array) arguments are passed by device pointer.
                vectors.append(arg)
                invocation_args.append(arg.gpudata)
            else:
                invocation_args.append(arg)
        # NOTE(review): assumes at least one vector argument; raises
        # IndexError otherwise — confirm callers always pass one.
        repr_vec = vectors[0]
        invocation_args.append(repr_vec.mem_size)
        if hasattr(repr_vec, "_block") and hasattr(repr_vec, "_grid"):
            self.func.set_block_shape(*repr_vec._block)
            self.func.prepared_call(repr_vec._grid, *invocation_args)
        else:
            _grid, _block = pycuda.gpuarray.splay(repr_vec.mem_size)
            self.func.set_block_shape(*_block)
            self.func.prepared_call(_grid, *invocation_args)
class PycudaElemwiseSourceModuleOp(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
......@@ -42,9 +84,9 @@ class PycudaElemwiseSourceModule(Op):
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return "PycudaElemwiseSourceModule{%s}%s" % (self.scalar_op, str(items))
return self.__class__.__name__+"{%s}%s" % (self.scalar_op, str(items))
else:
return "PycudaElemwiseSourceModule{%s}" % (self.scalar_op)
return self.__class__.__name__+"{%s}" % (self.scalar_op)
else:
return self.name
......@@ -56,7 +98,8 @@ class PycudaElemwiseSourceModule(Op):
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
assert not any([any(i.type.broadcastable) for i in inputs])
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove
otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim)
......@@ -89,7 +132,7 @@ class PycudaElemwiseSourceModule(Op):
self.pycuda_fct(inputs[0],inputs[1],z[0], block=(inputs[0].shape[0],inputs[0].shape[1],1))
class PycudaElemwiseKernel(Op):
class PycudaElemwiseKernelOp(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
......@@ -103,9 +146,9 @@ class PycudaElemwiseKernel(Op):
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return "PycudaElemwiseKernel{%s}%s" % (self.scalar_op, str(items))
return self.__class__.__name__+"{%s}%s" % (self.scalar_op, str(items))
else:
return "PycudaElemwiseKernel{%s}" % (self.scalar_op)
return self.__class__.__name__+"{%s}" % (self.scalar_op)
else:
return self.name
......@@ -117,7 +160,8 @@ class PycudaElemwiseKernel(Op):
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
assert not any([any(i.type.broadcastable) for i in inputs])
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove
# output is broadcastable only along dimensions where all inputs are broadcastable
......@@ -139,7 +183,7 @@ class PycudaElemwiseKernel(Op):
out_name = ["o"+str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
self.pycuda_fct = ElementwiseKernel(
self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]),
c_code,
"pycuda_elemwise_kernel_%s"%str(self.scalar_op),
......@@ -152,8 +196,7 @@ class PycudaElemwiseKernel(Op):
if z[0] is None or z[0].shape!=inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
i = inputs + z
sp = splay(i[0].mem_size)
self.pycuda_fct(*i)#, grid=sp[0], block=sp[1])
self.pycuda_fct(*i)
# Optimizer database holding the pycuda rewrites; registered inside the
# GPU optimization sequence so it runs after the GPU transfer opts.
pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")
......@@ -161,11 +204,11 @@ gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")
@local_optimizer([])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp

    Replace a GpuElemwise node with the pycuda SourceModule-based op when
    no input has a broadcastable dimension and every input is at most 2d
    (the pycuda op supports neither broadcasting nor higher ranks).
    """
    # Fix: the merged diff left the stale pre-rename line instantiating
    # the removed name PycudaElemwiseSourceModule interleaved with the
    # new one; keep only the renamed PycudaElemwiseSourceModuleOp call.
    if isinstance(node.op, GpuElemwise):
        if (not any([any(i.type.broadcastable) for i in node.inputs])
                and all([i.ndim <= 2 for i in node.inputs])):
            new_op = PycudaElemwiseSourceModuleOp(
                node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
            return [new_op]
pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise)
......@@ -173,11 +216,11 @@ pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise
@local_optimizer([])
def local_pycuda_gpu_elemwise_kernel(node):
    """
    GpuElemwise -> PycudaElemwiseKernelOp

    Replace a GpuElemwise node with the pycuda ElementwiseKernel-based op
    when no input has a broadcastable dimension (pycuda does not support
    broadcasting).
    """
    # Fix: the merged diff left the stale pre-rename line instantiating
    # the removed name PycudaElemwiseKernel interleaved with the new
    # one; keep only the renamed PycudaElemwiseKernelOp call.
    if isinstance(node.op, GpuElemwise):
        if not any([any(i.type.broadcastable) for i in node.inputs]):
            new_op = PycudaElemwiseKernelOp(
                node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
            return [new_op]
pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel", local_pycuda_gpu_elemwise_kernel, 1.5)
......@@ -6,22 +6,36 @@ except ImportError:
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda code.")
import theano.sandbox.cuda as cuda_ndarray
# Skip this whole test module when no CUDA device is available.
# Fix: idiomatic truthiness test instead of comparing `== False`.
if not cuda_ndarray.cuda_available:
    from nose.plugins.skip import SkipTest
    raise SkipTest('Optional package cuda disabled')
import theano
import theano.tensor as T
from theano.misc.pycuda_example import PycudaElemwiseSourceModule, PycudaElemwiseKernel
from theano.misc.pycuda_example import PycudaElemwiseSourceModuleOp, PycudaElemwiseKernelOp
from theano.sandbox.cuda import GpuContiguous
import theano.misc.pycuda_example
import theano.sandbox.cuda as cuda_ndarray
# Build GPU and non-GPU compilation modes for the tests.  FAST_COMPILE
# skips too many optimizations for the pycuda rewrites to trigger, so
# fall back to FAST_RUN in that case.
if theano.config.mode=='FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_pycuda_elemwise_source_module():
x=T.fmatrix('x')
y=T.fmatrix('y')
f=theano.function([x,y],x*y)
f=theano.function([x,y],x*y, mode=mode_with_gpu)
print f.maker.env.toposort()
f2 = theano.function([x,y],x*y, mode=theano.compile.mode.get_default_mode().including("local_pycuda_gpu_elemwise"))
f2 = theano.function([x,y],x*y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise"))
print f2.maker.env.toposort()
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModule) for node in f2.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f2.maker.env.toposort()])
val1 = numpy.random.rand(5,5)
val2 = numpy.random.rand(5,5)
......@@ -34,13 +48,13 @@ def test_pycuda_elemwise_source_module():
def test_pycuda_elemwise_kernel():
x=T.fmatrix('x')
y=T.fmatrix('y')
f=theano.function([x,y],x+y)
f=theano.function([x,y],x+y, mode=mode_with_gpu)
print f.maker.env.toposort()
f2 = theano.function([x,y],x+y, mode=theano.compile.mode.get_default_mode().including("local_pycuda_gpu_elemwise_kernel"))
f2 = theano.function([x,y],x+y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
print f2.maker.env.toposort()
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseKernel) for node in f2.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f2.maker.env.toposort()])
val1 = numpy.random.rand(5,5)
val2 = numpy.random.rand(5,5)
......@@ -55,9 +69,9 @@ def test_pycuda_elemwise_kernel():
y3=T.ftensor3('y')
z3=T.ftensor3('y')
f4 = theano.function([x3,y3,z3],x3*y3+z3, mode=theano.compile.mode.get_default_mode().including("local_pycuda_gpu_elemwise_kernel"))
f4 = theano.function([x3,y3,z3],x3*y3+z3, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
print f4.maker.env.toposort()
assert any([ isinstance(node.op, PycudaElemwiseKernel) for node in f4.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f4.maker.env.toposort()])
val1 = numpy.random.rand(2,2,2)
print val1
......
......@@ -333,14 +333,14 @@ class PPrinter:
use_ascii = True
if use_ascii:
special = dict(middle_dot = "\dot",
big_sigma = "\Sigma")
greek = dict(alpha = "\alpha",
beta = "\beta",
gamma = "\gamma",
delta = "\delta",
epsilon = "\epsilon")
special = dict(middle_dot = "\\dot",
big_sigma = "\\Sigma")
greek = dict(alpha = "\\alpha",
beta = "\\beta",
gamma = "\\gamma",
delta = "\\delta",
epsilon = "\\epsilon")
else:
special = dict(middle_dot = u"\u00B7",
......
......@@ -296,7 +296,8 @@ class GpuConv(Op):
and self.logical_kern_hw == other.logical_kern_hw \
and self.logical_kern_align_top == other.logical_kern_align_top \
and self.version == other.version \
and self.verbose == other.verbose
and self.verbose == other.verbose \
and self.kshp == other.kshp
def __hash__(self):
# don't use hash(self.version) as hash(-1)==-2 and hash(-2)==-2 in python!
......@@ -307,7 +308,8 @@ class GpuConv(Op):
^ hash(self.logical_kern_hw) \
^ hash(self.logical_kern_align_top) \
^ self.version \
^ self.verbose
^ hash(self.verbose) \
^ hash(self.kshp)
def __str__(self):
return '%s{%s, %s, %s, %s, %s}' %(self.__class__.__name__,
......@@ -336,7 +338,7 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self):
    """Version tag for Theano's C-code cache.

    Bumped (0,6) -> (0,7) because the generated C code changed, forcing
    cached binaries to be recompiled.
    """
    # Fix: the merged diff left both the stale `return (0,6)` and the
    # new `return (0,7)`; the stale first return would have won.  Keep
    # only the new version tuple.
    return (0, 7)
def c_support_code_apply(self, node, nodename):
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
......
......@@ -1052,8 +1052,13 @@ def local_argmax_pushdown(node):
(softmax, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
softmax_with_bias):
if theano.config.warn.argmax_pushdown_bug:
logging.getLogger('theano.tensor.nnet.nnet').warn("WARNING: their was a bug in Theano fixed the 27 may 2010 in this case. I.E. when we take the max of a softplus, softmax, exp, log, tanh, sigmoid, softmax_with_bias op, we where doing the max of the parent of the input. To remove this warning set the Theano flags 'warn.argmax_pushdown_bug' to False")
logging.getLogger('theano.tensor.nnet.nnet').warn("WARNING: there "
"was a bug in Theano fixed on May 27th, 2010 in this case."
" I.E. when we take the max of a softplus, softmax, exp, "
"log, tanh, sigmoid, softmax_with_bias op, we were doing "
"the max of the parent of the input. To remove this "
"warning set the Theano flags 'warn.argmax_pushdown_bug' "
"to False")
if node.op == tensor._max_and_argmax and node.inputs[0].owner and len(node.outputs[0].clients)==0:
x_max, x_argmax = node.outputs
......
......@@ -1569,6 +1569,7 @@ def test_constant_get_stabilized():
f2 = theano.function([x2],y2)
assert len(f2.maker.env.toposort())==1
assert f2.maker.env.toposort()[0].op==theano.tensor.nnet.sigm.softplus
raise KnownFailureTest("Theano optimize constant before stabilization! This break stabilization optimization is some case!")
assert f2(800)==800
x = T.as_tensor_variable(800)
......@@ -1576,7 +1577,6 @@ def test_constant_get_stabilized():
f = theano.function([],y)
assert len(f.maker.env.toposort())==0
assert numpy.isinf(f())
raise KnownFailureTest("Theano optimize constant before stabilization! This break stabilization optimization is some case!")
#When this error is fixed, the following line should be ok.
assert f()==800,f()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论