提交 c1366d70 作者: lamblin

Merge pull request #590 from nouiz/test_fix

Test fix
...@@ -59,7 +59,7 @@ echo "Number of elements in the compiledir:" ...@@ -59,7 +59,7 @@ echo "Number of elements in the compiledir:"
ls ${COMPILEDIR}|wc -l ls ${COMPILEDIR}|wc -l
echo "Executing nosetests with mode=FAST_RUN" echo "Executing nosetests with mode=FAST_RUN"
THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} ${PROFILING} ${ARGS} THEANO_FLAGS=cmodule.warn_no_version=True,${FLAGS},mode=FAST_RUN ${NOSETESTS} ${PROFILING} ${ARGS}
echo "Number of elements in the compiledir:" echo "Number of elements in the compiledir:"
ls ${COMPILEDIR}|wc -l ls ${COMPILEDIR}|wc -l
......
""" """This file show how we can use Pycuda compiled fct in a Theano
This file show how we can use Pycuda compiled fct in a Theano Op. Do no use those op in production code. See the TODO. Op. Do no use those op in production code. See the TODO.
You can use them as a guide to use your pycuda code into a Theano op. You can use them as a guide to use your pycuda code into a Theano op.
The PycudaElemwiseSourceModuleOp is a Theano op use pycuda code generated with pycuda.compiler.SourceModule The PycudaElemwiseSourceModuleOp is a Theano op use pycuda code
generated with pycuda.compiler.SourceModule
The PycudaElemwiseKernelOp op use pycuda code generated with pycuda.elementwise.ElementwiseKernel. It must be wrapper by TheanoElementwiseKernel.
Their is a test in test_pycuda.py. Their is a test in test_pycuda.py.
This don't work with broadcast and non-contiguous memory as pycuda don't support that, but we make sure we don't introduce problem. This don't work with broadcast and non-contiguous memory as pycuda
don't support that, but we make sure we don't introduce problem.
If the memory is non-contiguous, we create a new copy that is contiguous. If the memory is non-contiguous, we create a new copy that is contiguous.
If their is broadcasted dimensions, we raise an error. If their is broadcasted dimensions, we raise an error.
# The following is commented out, as it works only with old pycuda versions:
The PycudaElemwiseKernelOp op uses pycuda code generated with
pycuda.elementwise.ElementwiseKernel. It must be wrapped by
TheanoElementwiseKernel.
""" """
import numpy import numpy
...@@ -19,7 +25,8 @@ import numpy ...@@ -19,7 +25,8 @@ import numpy
import theano import theano
from theano.gof import Op, Apply, local_optimizer, EquilibriumDB from theano.gof import Op, Apply, local_optimizer, EquilibriumDB
from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, GpuOp from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, gpu_contiguous from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.opt import gpu_seqopt from theano.sandbox.cuda.opt import gpu_seqopt
import pycuda_init import pycuda_init
...@@ -30,30 +37,36 @@ import pycuda ...@@ -30,30 +37,36 @@ import pycuda
from pycuda.elementwise import ElementwiseKernel from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule from pycuda.compiler import SourceModule
from pycuda.tools import VectorArg from pycuda.tools import VectorArg
import pycuda.gpuarray
def theano_parse_c_arg(c_arg): def theano_parse_c_arg(c_arg):
c_arg = c_arg.replace('npy_float32','float') c_arg = c_arg.replace('npy_float32', 'float')
c_arg = c_arg.replace('npy_float64','double') c_arg = c_arg.replace('npy_float64', 'double')
c_arg = c_arg.replace('npy_int32','int') c_arg = c_arg.replace('npy_int32', 'int')
c_arg = c_arg.replace('npy_int8','char') c_arg = c_arg.replace('npy_int8', 'char')
c_arg = c_arg.replace('npy_ucs4','unsigned int') c_arg = c_arg.replace('npy_ucs4', 'unsigned int')
c_arg = c_arg.replace('npy_uint32','unsigned int') c_arg = c_arg.replace('npy_uint32', 'unsigned int')
c_arg = c_arg.replace('npy_uint16','unsigned short') c_arg = c_arg.replace('npy_uint16', 'unsigned short')
c_arg = c_arg.replace('npy_uint8','unsigned char') c_arg = c_arg.replace('npy_uint8', 'unsigned char')
return pycuda.tools.parse_c_arg(c_arg) return pycuda.tools.parse_c_arg(c_arg)
"""
class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel): class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
def __init__(self, arguments, operation, def __init__(self, arguments, operation,
name="kernel", keep=False, options=[], **kwargs): name="kernel", keep=False, options=[], **kwargs):
if isinstance(arguments, basestring): if isinstance(arguments, basestring):
arguments = [theano_parse_c_arg(arg) for arg in arguments.split(",")] arguments = [theano_parse_c_arg(arg)
pycuda.elementwise.ElementwiseKernel.__init__(self, arguments, operation, name, keep, options, **kwargs) for arg in arguments.split(",")]
pycuda.elementwise.ElementwiseKernel.__init__(self, arguments,
operation, name, keep,
options, **kwargs)
def __call__(self, *args): def __call__(self, *args):
vectors = [] vectors = []
invocation_args = [] invocation_args = []
for arg, arg_descr in zip(args, self.arguments): for arg, arg_descr in zip(args, self.gen_kwargs["arguments"]):
if isinstance(arg_descr, VectorArg): if isinstance(arg_descr, VectorArg):
vectors.append(arg) vectors.append(arg)
invocation_args.append(arg.gpudata) invocation_args.append(arg.gpudata)
...@@ -62,7 +75,7 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel): ...@@ -62,7 +75,7 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
repr_vec = vectors[0] repr_vec = vectors[0]
invocation_args.append(repr_vec.mem_size) invocation_args.append(repr_vec.mem_size)
if hasattr(repr_vec,"_block") and hasattr(repr_vec,"_grid"): if hasattr(repr_vec, "_block") and hasattr(repr_vec, "_grid"):
self.func.set_block_shape(*repr_vec._block) self.func.set_block_shape(*repr_vec._block)
self.func.prepared_call(repr_vec._grid, *invocation_args) self.func.prepared_call(repr_vec._grid, *invocation_args)
else: else:
...@@ -71,26 +84,120 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel): ...@@ -71,26 +84,120 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
self.func.prepared_call(_grid, *invocation_args) self.func.prepared_call(_grid, *invocation_args)
class PycudaElemwiseKernelOp(GpuOp):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern={}, name=None):
self.name = name
self.scalar_op = scalar_op
self.inplace_pattern = None
def __str__(self):
if self.name is None:
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
str(items))
else:
return self.__class__.__name__ + "{%s}" % (self.scalar_op)
else:
return self.name
def __eq__(self, other):
return (type(self) == type(other) and
self.scalar_op == other.scalar_op and
self.inplace_pattern == other.inplace_pattern)
def __hash__(self):
return (hash(type(self)) ^ hash(self.scalar_op) ^
hash(self.inplace_pattern))
def make_node(self, *inputs):
_inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs) == 2 # TODO remove
# output is broadcastable only along dimensions where all inputs are
# broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
otype = CudaNdarrayType(broadcastable=broadcastable)
assert self.nout == 1
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name",
tuple([n + "[i]"for n in in_name]),
tuple(n + "[i]"for n in out_name), {})
self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs, out_name))]),
c_code,
"pycuda_elemwise_kernel_%s" % str(self.scalar_op),
preamble=("#include<Python.h>\n"
"#include <numpy/arrayobject.h>"))
return out_node
def perform(self, node, inputs, out):
#TODO assert all input have the same shape
z, = out
if z[0] is None or z[0].shape != inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
i = inputs + z
self.pycuda_fct(*i)
"""
class PycudaElemwiseSourceModuleOp(GpuOp): class PycudaElemwiseSourceModuleOp(GpuOp):
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern = {}, name = None): def __init__(self, scalar_op, inplace_pattern={}, name=None):
self.name = name self.name = name
self.scalar_op = scalar_op self.scalar_op = scalar_op
self.inplace_pattern=None self.inplace_pattern = None
def __str__(self): def __str__(self):
if self.name is None: if self.name is None:
if self.inplace_pattern: if self.inplace_pattern:
items = self.inplace_pattern.items() items = self.inplace_pattern.items()
items.sort() items.sort()
return self.__class__.__name__+"{%s}%s" % (self.scalar_op, str(items)) return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
str(items))
else: else:
return self.__class__.__name__+"{%s}" % (self.scalar_op) return self.__class__.__name__ + "{%s}" % (self.scalar_op)
else: else:
return self.name return self.name
def __eq__(self, other):
return (type(self) == type(other) and
self.scalar_op == other.scalar_op and
self.inplace_pattern == other.inplace_pattern)
def __hash__(self):
return (hash(type(self)) ^ hash(self.scalar_op) ^
hash(self.inplace_pattern))
def make_node(self, *inputs): def make_node(self, *inputs):
_inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs] _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin: if self.nin > 0 and len(_inputs) != self.nin:
...@@ -101,17 +208,23 @@ class PycudaElemwiseSourceModuleOp(GpuOp): ...@@ -101,17 +208,23 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
if any([any(i.type.broadcastable) for i in inputs]): if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions") raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove assert len(inputs) == 2 # TODO remove
otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim) otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
assert self.nout == 1 assert self.nout == 1
fct_name = "pycuda_elemwise_%s"%str(self.scalar_op) fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)]) out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i"+str(id) for id in range(len(inputs))] in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o"+str(id) for id in range(self.nout)] out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {}) c_code = self.scalar_op.c_code(out_node, "some_name",
c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]+["int size"]) tuple([n + "[i]" for n in in_name]),
tuple(n + "[i]" for n in out_name), {})
c_code_param = ", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs,
out_name))] +
["int size"])
mod = SourceModule(""" mod = SourceModule("""
#include<Python.h> #include<Python.h>
#include <numpy/arrayobject.h> #include <numpy/arrayobject.h>
...@@ -123,7 +236,7 @@ class PycudaElemwiseSourceModuleOp(GpuOp): ...@@ -123,7 +236,7 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
%s %s
} }
} }
"""%(fct_name,c_code_param,c_code)) """ % (fct_name, c_code_param, c_code))
self.pycuda_fct = mod.get_function(fct_name) self.pycuda_fct = mod.get_function(fct_name)
return out_node return out_node
...@@ -131,41 +244,46 @@ class PycudaElemwiseSourceModuleOp(GpuOp): ...@@ -131,41 +244,46 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
#TODO support broadcast! #TODO support broadcast!
#TODO assert all input have the same shape #TODO assert all input have the same shape
z, = out z, = out
if z[0] is None or z[0].shape!=inputs[0].shape: if z[0] is None or z[0].shape != inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape) z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
if inputs[0].shape != inputs[1].shape: if inputs[0].shape != inputs[1].shape:
raise TypeError("PycudaElemwiseSourceModuleOp: inputs don't have the same shape!") raise TypeError("PycudaElemwiseSourceModuleOp:"
" inputs don't have the same shape!")
if inputs[0].size > 512: if inputs[0].size > 512:
grid = (int(numpy.ceil(inputs[0].size / 512.)),1) grid = (int(numpy.ceil(inputs[0].size / 512.)), 1)
block = (512,1,1) block = (512, 1, 1)
else: else:
grid = (1,1) grid = (1, 1)
block = (inputs[0].shape[0],inputs[0].shape[1],1) block = (inputs[0].shape[0], inputs[0].shape[1], 1)
self.pycuda_fct(inputs[0], inputs[1], z[0], numpy.intc(inputs[1].size), block=block, grid=grid) self.pycuda_fct(inputs[0], inputs[1], z[0],
numpy.intc(inputs[1].size), block=block, grid=grid)
class PycudaElemwiseKernelOp(GpuOp): class PycudaElemwiseSourceModuleMakeThunkOp(Op):
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern = {}, name = None): def __init__(self, scalar_op, inplace_pattern={}, name=None):
self.name = name self.name = name
self.scalar_op = scalar_op self.scalar_op = scalar_op
self.inplace_pattern=None self.inplace_pattern = None
def __str__(self): def __str__(self):
if self.name is None: if self.name is None:
if self.inplace_pattern: if self.inplace_pattern:
items = self.inplace_pattern.items() items = self.inplace_pattern.items()
items.sort() items.sort()
return self.__class__.__name__+"{%s}%s" % (self.scalar_op, str(items)) return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
str(items))
else: else:
return self.__class__.__name__+"{%s}" % (self.scalar_op) return self.__class__.__name__ + "{%s}" % (self.scalar_op)
else: else:
return self.name return self.name
def make_node(self, *inputs): def make_node(self, *inputs):
assert self.nout == 1
assert len(inputs) == 2 # TODO remove
_inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs] _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin: if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs))) raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
...@@ -175,57 +293,86 @@ class PycudaElemwiseKernelOp(GpuOp): ...@@ -175,57 +293,86 @@ class PycudaElemwiseKernelOp(GpuOp):
if any([any(i.type.broadcastable) for i in inputs]): if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions") raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove
# output is broadcastable only along dimensions where all inputs are broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
otype = CudaNdarrayType(broadcastable=broadcastable)
assert self.nout == 1
otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)]) out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i"+str(id) for id in range(len(inputs))]
out_name = ["o"+str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]),
c_code,
"pycuda_elemwise_kernel_%s"%str(self.scalar_op),
preamble="""#include<Python.h>
#include <numpy/arrayobject.h>""")
return out_node return out_node
def perform(self, node, inputs, out): def make_thunk(self, node, storage_map, _, _2):
#TODO support broadcast!
#TODO assert all input have the same shape #TODO assert all input have the same shape
z, = out fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
if z[0] is None or z[0].shape!=inputs[0].shape: in_name = ["i" + str(id) for id in range(len(node.inputs))]
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape) out_name = ["o" + str(id) for id in range(self.nout)]
i = inputs + z
self.pycuda_fct(*i) c_code = self.scalar_op.c_code(node, "some_name",
tuple([n + "[i]" for n in in_name]),
tuple(n + "[i]" for n in out_name), {})
c_code_param = ", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in
zip(node.inputs, in_name) +
zip(node.outputs, out_name)] + ["int size"])
mod = SourceModule("""
#include<Python.h>
#include <numpy/arrayobject.h>
__global__ void %s(%s)
{
int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
i += threadIdx.x + threadIdx.y*blockDim.x;
if(i<size){
%s
}
}
""" % (fct_name, c_code_param, c_code))
pycuda_fct = mod.get_function(fct_name)
inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs]
def thunk():
z = outputs[0]
if z[0] is None or z[0].shape != inputs[0][0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(
inputs[0][0].shape)
if inputs[0][0].shape != inputs[1][0].shape:
raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
" inputs don't have the same shape!")
if inputs[0][0].size > 512:
grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
block = (512, 1, 1)
else:
grid = (1, 1)
block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
out = pycuda_fct(inputs[0][0], inputs[1][0], z[0],
numpy.intc(inputs[1][0].size), block=block,
grid=grid)
thunk.inputs = inputs
thunk.outputs = outputs
thunk.lazy = False
return thunk
pycuda_optimizer = EquilibriumDB() pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run") gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")
@local_optimizer([]) @local_optimizer([])
def local_pycuda_gpu_elemwise(node): def local_pycuda_gpu_elemwise(node):
""" """
GpuElemwise -> PycudaElemwiseSourceModuleOp GpuElemwise -> PycudaElemwiseSourceModuleOp
""" """
if isinstance(node.op, GpuElemwise): if isinstance(node.op, GpuElemwise):
if not any([ any(i.type.broadcastable) for i in node.inputs]) and all([i.ndim<=2 for i in node.inputs]): if (not any([any(i.type.broadcastable) for i in node.inputs]) and
new_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op, node.op.inplace_pattern)(*node.inputs) all([i.ndim <= 2 for i in node.inputs])):
new_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op,
node.op.inplace_pattern)(
*node.inputs)
return [new_op] return [new_op]
pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise) pycuda_optimizer.register("local_pycuda_gpu_elemwise",
local_pycuda_gpu_elemwise)
@local_optimizer([]) @local_optimizer([])
def local_pycuda_gpu_elemwise_kernel(node): def local_pycuda_gpu_elemwise_kernel(node):
...@@ -233,8 +380,11 @@ def local_pycuda_gpu_elemwise_kernel(node): ...@@ -233,8 +380,11 @@ def local_pycuda_gpu_elemwise_kernel(node):
GpuElemwise -> PycudaElemwiseKernelOp GpuElemwise -> PycudaElemwiseKernelOp
""" """
if isinstance(node.op, GpuElemwise): if isinstance(node.op, GpuElemwise):
if not any([ any(i.type.broadcastable) for i in node.inputs]): if not any([any(i.type.broadcastable) for i in node.inputs]):
new_op = PycudaElemwiseKernelOp(node.op.scalar_op, node.op.inplace_pattern)(*node.inputs) new_op = PycudaElemwiseKernelOp(node.op.scalar_op,
node.op.inplace_pattern)(
*node.inputs)
return [new_op] return [new_op]
pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel", local_pycuda_gpu_elemwise_kernel, 1.5) pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",
local_pycuda_gpu_elemwise_kernel, 1.5)
...@@ -5,7 +5,8 @@ import theano.misc.pycuda_init ...@@ -5,7 +5,8 @@ import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available: if not theano.misc.pycuda_init.pycuda_available:
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda code.") raise SkipTest("Pycuda not installed. Skip test of theano op"
" with pycuda code.")
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available == False:
...@@ -14,71 +15,93 @@ if cuda_ndarray.cuda_available == False: ...@@ -14,71 +15,93 @@ if cuda_ndarray.cuda_available == False:
import theano import theano
import theano.tensor as T import theano.tensor as T
from theano.misc.pycuda_example import PycudaElemwiseSourceModuleOp, PycudaElemwiseKernelOp, PycudaElemwiseSourceModuleMakeThunkOp from theano.misc.pycuda_example import (PycudaElemwiseSourceModuleOp,
# PycudaElemwiseKernelOp,
PycudaElemwiseSourceModuleMakeThunkOp)
if theano.config.mode=='FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu') mode_without_gpu = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu')
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_pycuda_elemwise_source_module(): def test_pycuda_elemwise_source_module():
for shape in [(5,5), (10,49), (50,49),(500,501),(5000,5001)]: for shape in [(5, 5), (10, 49), (50, 49), (500, 501)]:
for op in [theano.scalar.basic.mul, theano.scalar.basic.add]: for op in [theano.scalar.basic.mul, theano.scalar.basic.add]:
x=T.fmatrix('x') x = T.fmatrix('x')
y=T.fmatrix('y') y = T.fmatrix('y')
elemwise_op = theano.tensor.Elemwise(op) elemwise_op = theano.tensor.Elemwise(op)
pycuda_op = PycudaElemwiseSourceModuleOp(op) pycuda_op = PycudaElemwiseSourceModuleOp(op)
pycuda_op_thunk = PycudaElemwiseSourceModuleMakeThunkOp(op) pycuda_op_thunk = PycudaElemwiseSourceModuleMakeThunkOp(op)
f=theano.function([x,y], elemwise_op(x,y), mode=mode_with_gpu) f = theano.function([x, y], elemwise_op(x, y), mode=mode_with_gpu)
f2 = theano.function([x,y], theano.sandbox.cuda.host_from_gpu(pycuda_op(x,y))) f2 = theano.function([x, y],
f3 = theano.function([x,y], elemwise_op(x,y), theano.sandbox.cuda.host_from_gpu(
mode=mode_with_gpu.including("local_pycuda_gpu_elemwise")) pycuda_op(x, y)),
f4 = theano.function([x,y], theano.sandbox.cuda.host_from_gpu(pycuda_op_thunk(x,y))) mode=mode_with_gpu)
mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise")
f3 = theano.function([x, y], elemwise_op(x, y),
mode=mode_pycuda)
f4 = theano.function([x, y],
theano.sandbox.cuda.host_from_gpu(
pycuda_op_thunk(x, y)),
mode=mode_with_gpu)
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()]) assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f2.maker.env.toposort()]) for node in f.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f3.maker.env.toposort()]) assert any([isinstance(node.op, PycudaElemwiseSourceModuleOp)
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleMakeThunkOp) for node in f4.maker.env.toposort()]) for node in f2.maker.env.toposort()])
assert any([isinstance(node.op, PycudaElemwiseSourceModuleOp)
for node in f3.maker.env.toposort()])
assert any([isinstance(node.op,
PycudaElemwiseSourceModuleMakeThunkOp)
for node in f4.maker.env.toposort()])
val1 = numpy.asarray(numpy.random.rand(*shape), dtype='float32') val1 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
val2 = numpy.asarray(numpy.random.rand(*shape), dtype='float32') val2 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
assert (f(val1,val2) == f2(val1,val2)).all() assert (f(val1, val2) == f2(val1, val2)).all()
assert (f(val1,val2) == f3(val1,val2)).all() assert (f(val1, val2) == f3(val1, val2)).all()
assert (f(val1,val2) == f4(val1,val2)).all() assert (f(val1, val2) == f4(val1, val2)).all()
#print f(val1,val2) #print f(val1,val2)
#print f2(val1,val2) #print f2(val1,val2)
"""
# Commented out, as it works only with old pycuda versions.
def test_pycuda_elemwise_kernel(): def test_pycuda_elemwise_kernel():
x=T.fmatrix('x') x = T.fmatrix('x')
y=T.fmatrix('y') y = T.fmatrix('y')
f=theano.function([x,y],x+y, mode=mode_with_gpu) f = theano.function([x, y], x + y, mode=mode_with_gpu)
print f.maker.env.toposort() print f.maker.env.toposort()
f2 = theano.function([x,y],x+y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")) mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")
f2 = theano.function([x, y], x + y, mode=mode_pycuda)
print f2.maker.env.toposort() print f2.maker.env.toposort()
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()]) assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f2.maker.env.toposort()]) for node in f.maker.env.toposort()])
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f2.maker.env.toposort()])
val1 = numpy.asarray(numpy.random.rand(5,5), dtype='float32') val1 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
val2 = numpy.asarray(numpy.random.rand(5,5), dtype='float32') val2 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
#val1 = numpy.ones((5,5)) #val1 = numpy.ones((5,5))
#val2 = numpy.arange(25).reshape(5,5) #val2 = numpy.arange(25).reshape(5,5)
assert (f(val1,val2) == f2(val1,val2)).all() assert (f(val1, val2) == f2(val1, val2)).all()
print f(val1,val2) print f(val1, val2)
print f2(val1,val2) print f2(val1, val2)
x3=T.ftensor3('x') x3 = T.ftensor3('x')
y3=T.ftensor3('y') y3 = T.ftensor3('y')
z3=T.ftensor3('y') z3 = T.ftensor3('y')
f4 = theano.function([x3,y3,z3],x3*y3+z3, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")) f4 = theano.function([x3, y3, z3], x3 * y3 + z3, mode=mode_pycuda)
print f4.maker.env.toposort() print f4.maker.env.toposort()
assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f4.maker.env.toposort()]) assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f4.maker.env.toposort()])
val1 = numpy.random.rand(2,2,2) val1 = numpy.random.rand(2, 2, 2)
print val1 print val1
print f4(val1,val1,val1) print f4(val1, val1, val1)
assert numpy.allclose(f4(val1,val1,val1),val1*val1+val1) assert numpy.allclose(f4(val1, val1, val1), val1 * val1 + val1)
"""
...@@ -78,7 +78,10 @@ __global__ void multiply_them(float *dest, float *a, float *b) ...@@ -78,7 +78,10 @@ __global__ void multiply_them(float *dest, float *a, float *b)
def test_pycuda_memory_to_theano(): def test_pycuda_memory_to_theano():
#Test that we can use the GpuArray memory space in pycuda in a CudaNdarray #Test that we can use the GpuArray memory space in pycuda in a CudaNdarray
y = pycuda.gpuarray.zeros((3, 4, 5), 'float32') y = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
print numpy.asarray(y) print sys.getrefcount(y)
# This increases the ref count with newer pycuda. Does pycuda also
# cache the ndarray?
# print y.get()
print "gpuarray ref count before creating a CudaNdarray", print "gpuarray ref count before creating a CudaNdarray",
print sys.getrefcount(y) print sys.getrefcount(y)
assert sys.getrefcount(y) == 2 assert sys.getrefcount(y) == 2
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论