提交 c1366d70 authored 作者: lamblin's avatar lamblin

Merge pull request #590 from nouiz/test_fix

Test fix
......@@ -59,7 +59,7 @@ echo "Number of elements in the compiledir:"
ls ${COMPILEDIR}|wc -l
echo "Executing nosetests with mode=FAST_RUN"
THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} ${PROFILING} ${ARGS}
THEANO_FLAGS=cmodule.warn_no_version=True,${FLAGS},mode=FAST_RUN ${NOSETESTS} ${PROFILING} ${ARGS}
echo "Number of elements in the compiledir:"
ls ${COMPILEDIR}|wc -l
......
"""
This file shows how we can use PyCUDA-compiled functions in a Theano Op. Do not use these ops in production code. See the TODO.
"""This file shows how we can use PyCUDA-compiled functions in a Theano
Op. Do not use these ops in production code. See the TODO.
You can use them as a guide for using your own pycuda code in a Theano op.
The PycudaElemwiseSourceModuleOp is a Theano op that uses pycuda code generated with pycuda.compiler.SourceModule.
The PycudaElemwiseKernelOp op uses pycuda code generated with pycuda.elementwise.ElementwiseKernel. It must be wrapped by TheanoElementwiseKernel.
The PycudaElemwiseSourceModuleOp is a Theano op that uses pycuda code
generated with pycuda.compiler.SourceModule.
There is a test in test_pycuda.py.
This doesn't work with broadcasting or non-contiguous memory, as pycuda doesn't support them, but we make sure we don't introduce problems.
This doesn't work with broadcasting or non-contiguous memory, as pycuda
doesn't support them, but we make sure we don't introduce problems.
If the memory is non-contiguous, we create a new copy that is contiguous.
If there are broadcasted dimensions, we raise an error.
#The following is commented out as it works only with old pycuda versions
The PycudaElemwiseKernelOp op uses pycuda code generated with
pycuda.elementwise.ElementwiseKernel. It must be wrapped by
TheanoElementwiseKernel.
"""
import numpy
......@@ -19,7 +25,8 @@ import numpy
import theano
from theano.gof import Op, Apply, local_optimizer, EquilibriumDB
from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, gpu_contiguous
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.opt import gpu_seqopt
import pycuda_init
......@@ -30,30 +37,36 @@ import pycuda
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
from pycuda.tools import VectorArg
import pycuda.gpuarray
def theano_parse_c_arg(c_arg):
    """Parse a C argument declaration written with NumPy typedef names.

    Every ``npy_*`` name listed below is substituted with the plain C type
    name next to it (simple substring replacement, applied in the listed
    order). The rewritten declaration is then handed to
    ``pycuda.tools.parse_c_arg`` and its result is returned.
    """
    substitutions = (
        ('npy_float32', 'float'),
        ('npy_float64', 'double'),
        ('npy_int32', 'int'),
        ('npy_int8', 'char'),
        ('npy_ucs4', 'unsigned int'),
        ('npy_uint32', 'unsigned int'),
        ('npy_uint16', 'unsigned short'),
        ('npy_uint8', 'unsigned char'),
    )
    for numpy_name, c_name in substitutions:
        c_arg = c_arg.replace(numpy_name, c_name)
    return pycuda.tools.parse_c_arg(c_arg)
"""
class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
def __init__(self, arguments, operation,
name="kernel", keep=False, options=[], **kwargs):
if isinstance(arguments, basestring):
arguments = [theano_parse_c_arg(arg) for arg in arguments.split(",")]
pycuda.elementwise.ElementwiseKernel.__init__(self, arguments, operation, name, keep, options, **kwargs)
arguments = [theano_parse_c_arg(arg)
for arg in arguments.split(",")]
pycuda.elementwise.ElementwiseKernel.__init__(self, arguments,
operation, name, keep,
options, **kwargs)
def __call__(self, *args):
vectors = []
invocation_args = []
for arg, arg_descr in zip(args, self.arguments):
for arg, arg_descr in zip(args, self.gen_kwargs["arguments"]):
if isinstance(arg_descr, VectorArg):
vectors.append(arg)
invocation_args.append(arg.gpudata)
......@@ -62,7 +75,7 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
repr_vec = vectors[0]
invocation_args.append(repr_vec.mem_size)
if hasattr(repr_vec,"_block") and hasattr(repr_vec,"_grid"):
if hasattr(repr_vec, "_block") and hasattr(repr_vec, "_grid"):
self.func.set_block_shape(*repr_vec._block)
self.func.prepared_call(repr_vec._grid, *invocation_args)
else:
......@@ -71,26 +84,120 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
self.func.prepared_call(_grid, *invocation_args)
class PycudaElemwiseKernelOp(GpuOp):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern={}, name=None):
self.name = name
self.scalar_op = scalar_op
self.inplace_pattern = None
def __str__(self):
if self.name is None:
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
str(items))
else:
return self.__class__.__name__ + "{%s}" % (self.scalar_op)
else:
return self.name
def __eq__(self, other):
return (type(self) == type(other) and
self.scalar_op == other.scalar_op and
self.inplace_pattern == other.inplace_pattern)
def __hash__(self):
return (hash(type(self)) ^ hash(self.scalar_op) ^
hash(self.inplace_pattern))
def make_node(self, *inputs):
_inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs) == 2 # TODO remove
# output is broadcastable only along dimensions where all inputs are
# broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
otype = CudaNdarrayType(broadcastable=broadcastable)
assert self.nout == 1
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name",
tuple([n + "[i]"for n in in_name]),
tuple(n + "[i]"for n in out_name), {})
self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs, out_name))]),
c_code,
"pycuda_elemwise_kernel_%s" % str(self.scalar_op),
preamble=("#include<Python.h>\n"
"#include <numpy/arrayobject.h>"))
return out_node
def perform(self, node, inputs, out):
#TODO assert all input have the same shape
z, = out
if z[0] is None or z[0].shape != inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
i = inputs + z
self.pycuda_fct(*i)
"""
class PycudaElemwiseSourceModuleOp(GpuOp):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern={}, name=None):
    """Remember the scalar op to apply elementwise and an optional name.

    ``inplace_pattern`` is accepted but not stored: the attribute of the
    same name is always set to ``None``. The ``{}`` default is never read
    or mutated here.
    """
    self.scalar_op = scalar_op
    self.name = name
    # The caller's pattern is deliberately dropped.
    self.inplace_pattern = None
def __str__(self):
    """Return ``self.name`` when it is set; otherwise build a label from
    the class name, the scalar op and, when the inplace pattern is
    non-empty, its sorted items."""
    if self.name is not None:
        return self.name
    cls_name = self.__class__.__name__
    if not self.inplace_pattern:
        return cls_name + "{%s}" % (self.scalar_op)
    pattern_items = sorted(self.inplace_pattern.items())
    return cls_name + "{%s}%s" % (self.scalar_op, str(pattern_items))
def __eq__(self, other):
    """Two ops compare equal when they are of exactly the same type and
    have equal ``scalar_op`` and ``inplace_pattern``."""
    if type(self) != type(other):
        return False
    return (self.scalar_op == other.scalar_op and
            self.inplace_pattern == other.inplace_pattern)
def __hash__(self):
    """Combine, with XOR, the hashes of the fields compared by
    ``__eq__``: the type, the scalar op and the inplace pattern."""
    combined = 0
    for part in (type(self), self.scalar_op, self.inplace_pattern):
        combined ^= hash(part)
    return combined
def make_node(self, *inputs):
_inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin:
......@@ -101,17 +208,23 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove
assert len(inputs) == 2 # TODO remove
otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim)
otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
assert self.nout == 1
fct_name = "pycuda_elemwise_%s"%str(self.scalar_op)
fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i"+str(id) for id in range(len(inputs))]
out_name = ["o"+str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]+["int size"])
in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name",
tuple([n + "[i]" for n in in_name]),
tuple(n + "[i]" for n in out_name), {})
c_code_param = ", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs,
out_name))] +
["int size"])
mod = SourceModule("""
#include<Python.h>
#include <numpy/arrayobject.h>
......@@ -123,7 +236,7 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
%s
}
}
"""%(fct_name,c_code_param,c_code))
""" % (fct_name, c_code_param, c_code))
self.pycuda_fct = mod.get_function(fct_name)
return out_node
......@@ -131,41 +244,46 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
#TODO support broadcast!
#TODO assert all input have the same shape
z, = out
if z[0] is None or z[0].shape!=inputs[0].shape:
if z[0] is None or z[0].shape != inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
if inputs[0].shape != inputs[1].shape:
raise TypeError("PycudaElemwiseSourceModuleOp: inputs don't have the same shape!")
raise TypeError("PycudaElemwiseSourceModuleOp:"
" inputs don't have the same shape!")
if inputs[0].size > 512:
grid = (int(numpy.ceil(inputs[0].size / 512.)),1)
block = (512,1,1)
grid = (int(numpy.ceil(inputs[0].size / 512.)), 1)
block = (512, 1, 1)
else:
grid = (1,1)
block = (inputs[0].shape[0],inputs[0].shape[1],1)
self.pycuda_fct(inputs[0], inputs[1], z[0], numpy.intc(inputs[1].size), block=block, grid=grid)
grid = (1, 1)
block = (inputs[0].shape[0], inputs[0].shape[1], 1)
self.pycuda_fct(inputs[0], inputs[1], z[0],
numpy.intc(inputs[1].size), block=block, grid=grid)
class PycudaElemwiseKernelOp(GpuOp):
class PycudaElemwiseSourceModuleMakeThunkOp(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern={}, name=None):
    """Remember the scalar op to apply elementwise and an optional name.

    ``inplace_pattern`` is accepted but not stored: the attribute of the
    same name is always set to ``None``. The ``{}`` default is never read
    or mutated here.
    """
    self.scalar_op = scalar_op
    self.name = name
    # The caller's pattern is deliberately dropped.
    self.inplace_pattern = None
def __str__(self):
    """Return ``self.name`` when it is set; otherwise build a label from
    the class name, the scalar op and, when the inplace pattern is
    non-empty, its sorted items."""
    if self.name is not None:
        return self.name
    cls_name = self.__class__.__name__
    if not self.inplace_pattern:
        return cls_name + "{%s}" % (self.scalar_op)
    pattern_items = sorted(self.inplace_pattern.items())
    return cls_name + "{%s}%s" % (self.scalar_op, str(pattern_items))
def make_node(self, *inputs):
assert self.nout == 1
assert len(inputs) == 2 # TODO remove
_inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
......@@ -175,57 +293,86 @@ class PycudaElemwiseKernelOp(GpuOp):
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove
# output is broadcastable only along dimensions where all inputs are broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
otype = CudaNdarrayType(broadcastable=broadcastable)
assert self.nout == 1
otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i"+str(id) for id in range(len(inputs))]
out_name = ["o"+str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]),
c_code,
"pycuda_elemwise_kernel_%s"%str(self.scalar_op),
preamble="""#include<Python.h>
#include <numpy/arrayobject.h>""")
return out_node
def make_thunk(self, node, storage_map, _, _2):
    """Compile the elementwise CUDA kernel for ``node`` and return a thunk.

    The kernel source is generated from ``self.scalar_op.c_code`` and
    compiled with ``SourceModule`` once, when this method is called. The
    returned callable reads its two inputs from, and writes its output
    to, the storage cells that ``storage_map`` holds for the node's
    variables.

    :param node: the Apply node to build the thunk for.
    :param storage_map: maps each variable of ``node`` to its one-element
        storage cell.
    :param _: unused here.
    :param _2: unused here.
    :return: the thunk, carrying ``inputs``, ``outputs`` and
        ``lazy = False`` attributes.
    """
    #TODO support broadcast!
    #TODO assert all input have the same shape
    fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    in_name = ["i" + str(idx) for idx in range(len(node.inputs))]
    out_name = ["o" + str(idx) for idx in range(self.nout)]
    c_code = self.scalar_op.c_code(node, "some_name",
                                   tuple([n + "[i]" for n in in_name]),
                                   tuple(n + "[i]" for n in out_name), {})
    # list() keeps the concatenation valid even when zip() is lazy.
    named_vars = (list(zip(node.inputs, in_name)) +
                  list(zip(node.outputs, out_name)))
    c_code_param = ", ".join([var.type.dtype_specs()[1] + " *" + name
                              for var, name in named_vars] +
                             ["int size"])
    mod = SourceModule("""
#include<Python.h>
#include <numpy/arrayobject.h>
__global__ void %s(%s)
{
int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
i += threadIdx.x + threadIdx.y*blockDim.x;
if(i<size){
%s
}
}
""" % (fct_name, c_code_param, c_code))
    pycuda_fct = mod.get_function(fct_name)
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        x = inputs[0][0]
        y = inputs[1][0]
        z = outputs[0]
        # (Re)allocate the output when it is missing or has a stale shape.
        if z[0] is None or z[0].shape != x.shape:
            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(x.shape)
        if x.shape != y.shape:
            raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
                            " inputs don't have the same shape!")
        size = int(x.size)
        if size == 0:
            # Nothing to compute; a block with zero threads cannot be
            # launched.
            return
        if size > 512:
            grid = (int(numpy.ceil(size / 512.)), 1)
            block = (512, 1, 1)
        else:
            grid = (1, 1)
            # One thread per element along x. The kernel indexes the data
            # linearly, so this covers every element whatever the number
            # of dimensions (the previous (shape[0], shape[1], 1) block
            # was only right for 2-d inputs).
            block = (size, 1, 1)
        pycuda_fct(x, y, z[0], numpy.intc(y.size), block=block, grid=grid)

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False
    return thunk
pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")
@local_optimizer([])
def local_pycuda_gpu_elemwise(node):
    """
       GpuElemwise -> PycudaElemwiseSourceModuleOp

    The replacement is only proposed when no input has a broadcastable
    dimension and every input has at most 2 dimensions; otherwise
    ``None`` is returned.
    """
    if not isinstance(node.op, GpuElemwise):
        return None
    if any(any(i.type.broadcastable) for i in node.inputs):
        return None
    if not all(i.ndim <= 2 for i in node.inputs):
        return None
    pycuda_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op,
                                             node.op.inplace_pattern)
    return [pycuda_op(*node.inputs)]
pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise)
pycuda_optimizer.register("local_pycuda_gpu_elemwise",
local_pycuda_gpu_elemwise)
@local_optimizer([])
def local_pycuda_gpu_elemwise_kernel(node):
......@@ -233,8 +380,11 @@ def local_pycuda_gpu_elemwise_kernel(node):
GpuElemwise -> PycudaElemwiseKernelOp
"""
if isinstance(node.op, GpuElemwise):
if not any([ any(i.type.broadcastable) for i in node.inputs]):
new_op = PycudaElemwiseKernelOp(node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
if not any([any(i.type.broadcastable) for i in node.inputs]):
new_op = PycudaElemwiseKernelOp(node.op.scalar_op,
node.op.inplace_pattern)(
*node.inputs)
return [new_op]
pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel", local_pycuda_gpu_elemwise_kernel, 1.5)
pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",
local_pycuda_gpu_elemwise_kernel, 1.5)
......@@ -5,7 +5,8 @@ import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available:
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda code.")
raise SkipTest("Pycuda not installed. Skip test of theano op"
" with pycuda code.")
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
......@@ -14,71 +15,93 @@ if cuda_ndarray.cuda_available == False:
import theano
import theano.tensor as T
from theano.misc.pycuda_example import PycudaElemwiseSourceModuleOp, PycudaElemwiseKernelOp, PycudaElemwiseSourceModuleMakeThunkOp
from theano.misc.pycuda_example import (PycudaElemwiseSourceModuleOp,
# PycudaElemwiseKernelOp,
PycudaElemwiseSourceModuleMakeThunkOp)
if theano.config.mode=='FAST_COMPILE':
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_pycuda_elemwise_source_module():
    """Compare the pycuda-backed elementwise ops against ``GpuElemwise``.

    For each shape and each scalar op (mul, add), four functions are
    compiled: the reference elementwise graph, the two pycuda ops applied
    directly, and the elementwise graph compiled with the
    ``local_pycuda_gpu_elemwise`` optimization included. The test checks
    that the expected op appears in each compiled graph and that all
    results are exactly equal to the reference on random float32 inputs.
    """
    for shape in [(5, 5), (10, 49), (50, 49), (500, 501)]:
        for op in [theano.scalar.basic.mul, theano.scalar.basic.add]:
            x = T.fmatrix('x')
            y = T.fmatrix('y')
            elemwise_op = theano.tensor.Elemwise(op)
            pycuda_op = PycudaElemwiseSourceModuleOp(op)
            pycuda_op_thunk = PycudaElemwiseSourceModuleMakeThunkOp(op)
            f = theano.function([x, y], elemwise_op(x, y),
                                mode=mode_with_gpu)
            f2 = theano.function([x, y],
                                 theano.sandbox.cuda.host_from_gpu(
                                     pycuda_op(x, y)),
                                 mode=mode_with_gpu)
            mode_pycuda = mode_with_gpu.including(
                "local_pycuda_gpu_elemwise")
            f3 = theano.function([x, y], elemwise_op(x, y),
                                 mode=mode_pycuda)
            f4 = theano.function([x, y],
                                 theano.sandbox.cuda.host_from_gpu(
                                     pycuda_op_thunk(x, y)),
                                 mode=mode_with_gpu)

            # Each compiled graph must contain the op it is meant to test.
            assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
                        for node in f.maker.env.toposort()])
            assert any([isinstance(node.op, PycudaElemwiseSourceModuleOp)
                        for node in f2.maker.env.toposort()])
            assert any([isinstance(node.op, PycudaElemwiseSourceModuleOp)
                        for node in f3.maker.env.toposort()])
            assert any([isinstance(node.op,
                                   PycudaElemwiseSourceModuleMakeThunkOp)
                        for node in f4.maker.env.toposort()])

            val1 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
            val2 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
            expected = f(val1, val2)
            assert (expected == f2(val1, val2)).all()
            assert (expected == f3(val1, val2)).all()
            assert (expected == f4(val1, val2)).all()
            #print f(val1,val2)
            #print f2(val1,val2)
"""
#commented out as it works only with old pycuda versions.
def test_pycuda_elemwise_kernel():
x=T.fmatrix('x')
y=T.fmatrix('y')
f=theano.function([x,y],x+y, mode=mode_with_gpu)
x = T.fmatrix('x')
y = T.fmatrix('y')
f = theano.function([x, y], x + y, mode=mode_with_gpu)
print f.maker.env.toposort()
f2 = theano.function([x,y],x+y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")
f2 = theano.function([x, y], x + y, mode=mode_pycuda)
print f2.maker.env.toposort()
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f2.maker.env.toposort()])
assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
for node in f.maker.env.toposort()])
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f2.maker.env.toposort()])
val1 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
val2 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
val1 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
val2 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
#val1 = numpy.ones((5,5))
#val2 = numpy.arange(25).reshape(5,5)
assert (f(val1,val2) == f2(val1,val2)).all()
print f(val1,val2)
print f2(val1,val2)
assert (f(val1, val2) == f2(val1, val2)).all()
print f(val1, val2)
print f2(val1, val2)
x3=T.ftensor3('x')
y3=T.ftensor3('y')
z3=T.ftensor3('y')
x3 = T.ftensor3('x')
y3 = T.ftensor3('y')
z3 = T.ftensor3('y')
f4 = theano.function([x3,y3,z3],x3*y3+z3, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
f4 = theano.function([x3, y3, z3], x3 * y3 + z3, mode=mode_pycuda)
print f4.maker.env.toposort()
assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f4.maker.env.toposort()])
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f4.maker.env.toposort()])
val1 = numpy.random.rand(2,2,2)
val1 = numpy.random.rand(2, 2, 2)
print val1
print f4(val1,val1,val1)
assert numpy.allclose(f4(val1,val1,val1),val1*val1+val1)
print f4(val1, val1, val1)
assert numpy.allclose(f4(val1, val1, val1), val1 * val1 + val1)
"""
......@@ -78,7 +78,10 @@ __global__ void multiply_them(float *dest, float *a, float *b)
def test_pycuda_memory_to_theano():
#Test that we can use the GpuArray memory space in pycuda in a CudaNdarray
y = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
print numpy.asarray(y)
print sys.getrefcount(y)
# This increases the ref count with newer pycuda. Does pycuda also
# cache ndarray?
# print y.get()
print "gpuarray ref count before creating a CudaNdarray",
print sys.getrefcount(y)
assert sys.getrefcount(y) == 2
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论