提交 23e43b1b authored 作者: Frederic Bastien's avatar Frederic Bastien

Change the prepare_node logic to make it safe to call make_py_thunk and make_c_thunk directly.

上级 179e4085
......@@ -99,7 +99,7 @@ possibilities you may encounter or need. For that refer to
pass
# Other implementations (pycuda, ...):
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, _2, impl=None):
pass
# optional:
......@@ -190,11 +190,12 @@ or :func:`make_thunk`.
valid, but shouldn't be required anymore for this call.
The returned function must ensure that it sets the computed
variables as computed in the `compute_map`.
- ``impl`` allows selecting between multiple implementations.
  It should have a default value of None.
:func:`make_thunk` is useful if you want to generate code and compile
it yourself. For example, this allows you to use PyCUDA to compile GPU
code.
code and keep state in the thunk.
If :func:`make_thunk()` is defined by an op, it will be used by Theano
to obtain the op's implementation.
......
......@@ -171,7 +171,7 @@ Optional methods or attributes
returned, unless it is of length 1, where the single element will be
returned by itself.
.. function:: make_thunk(node, storage_map, compute_map, no_recycling)
.. function:: make_thunk(node, storage_map, compute_map, no_recycling, impl=None)
This function must return a thunk, that is a zero-arguments
function that encapsulates the computation to be performed by this
......@@ -192,6 +192,8 @@ Optional methods or attributes
valid, but shouldn't be required anymore for this call.
:param no_recycling: WRITEME
WRITEME
:param impl: None, 'c' or 'py'
Which implementation to use.
The returned function must ensure that it sets the computed
variables as computed in the `compute_map`.
......
......@@ -92,7 +92,7 @@ You can use a GPU function compiled with PyCUDA in a Theano op:
cuda.basic_ops.as_cuda_ndarray_variable(inp))
assert inp.dtype == "float32"
return theano.Apply(self, [inp], [inp.type()])
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, _2, impl=None):
mod = SourceModule("""
__global__ void my_fct(float * i0, float * o0, int size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
......
......@@ -586,7 +586,7 @@ Modify and execute to work for a matrix of shape (20, 10).
assert inp.dtype == "float32"
return theano.Apply(self, [inp], [inp.type()])
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, _2, impl):
mod = SourceModule("""
__global__ void my_fct(float * i0, float * o0, int size) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
......
......@@ -124,8 +124,8 @@ class OpFromGraph(gof.Op):
list(inputs) + self.shared_inputs,
[type() for type in self.output_types])
def prepare_node(self, node, storage_map, compute_map):
if not hasattr(node.tag, "fn"):
def prepare_node(self, node, storage_map, compute_map, impl):
if not hasattr(node.tag, "fn") and impl == 'py':
node.tag.fn = orig_function(self.new_inputs,
self.new_outputs,
**self.kwargs)
......
......@@ -1837,8 +1837,6 @@ class _Linker(gof.link.LocalLinker):
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
thunk_other = thunk
else:
node.op.prepare_node(node, storage_map, compute_map)
debug = hasattr(node.op, 'debug_perform')
......@@ -1852,6 +1850,7 @@ class _Linker(gof.link.LocalLinker):
if not isinstance(node.op, gof.op.Op):
raise utils.MethodNotDefined()
node.op.prepare_node(node, storage_map, compute_map, 'c')
thunk = node.op.make_c_thunk(node, storage_map, compute_map,
no_recycling)
thunks_c.append(thunk)
......@@ -1864,6 +1863,7 @@ class _Linker(gof.link.LocalLinker):
if (((self.maker.mode.check_py_code or thunks_c[-1] is None) and
node.op.perform.__code__ != gof.op.PureOp.perform.__code__) or
debug):
node.op.prepare_node(node, storage_map, compute_map, 'py')
thunk = node.op.make_py_thunk(node, storage_map, compute_map,
no_recycling, debug=debug)
thunks_py.append(thunk)
......@@ -1873,6 +1873,7 @@ class _Linker(gof.link.LocalLinker):
if not self.maker.mode.check_c_code and thunks_py[-1] is None:
_logger.warn("Op %s doesn't have a perform, "
"forcing check of the C code" % node.op)
node.op.prepare_node(node, storage_map, compute_map, 'c')
thunk = node.op.make_c_thunk(node, storage_map, compute_map,
no_recycling)
thunks_c[-1] = thunk
......
......@@ -1584,7 +1584,7 @@ class CLinker(link.Linker):
else:
# Set compute_map as None as clinker do not support lazy evaluation
for node in self.node_order:
node.op.prepare_node(node, storage_map, None)
node.op.prepare_node(node, storage_map, None, 'c')
module = get_module_cache().module_from_key(
key=key, lnk=self, keep_lock=keep_lock)
......@@ -1787,17 +1787,8 @@ class OpWiseCLinker(link.LocalLinker):
thunks = []
for node in order:
# Make sure we use the C version of the code whenever
# possible
# There are ops that don't have _op_use_c_code property
# for example ifelse (or any ops that come with their own
# make_thunk
if theano.config.cxx:
thunks += [node.op.make_c_thunk(node,
storage_map,
compute_map,
no_recycling)]
else:
# make_thunk will try the C code by default; otherwise
# it falls back to Python.
thunks += [node.op.make_thunk(node,
storage_map,
compute_map,
......
......@@ -823,10 +823,11 @@ class PerformLinker(LocalLinker):
# the python version
# Note: ops that implement their own make_thunk don't usually
# have this attribute defined !!
thunks += [node.op.make_py_thunk(node,
thunks += [node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling)]
no_recycling,
'py')]
thunks[-1].inputs = [storage_map[v] for v in node.inputs]
thunks[-1].outputs = [storage_map[v] for v in node.outputs]
......
......@@ -792,19 +792,22 @@ class Op(utils.object2, PureOp, CLinkerOp):
def __init__(self, use_c_code=theano.config.cxx):
self._op_use_c_code = use_c_code
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
"""
Make any special modifications that the Op needs before doing
make_thunk().
This can modify the node inplace and should return nothing.
It can be called multiple times with different impl values. It is
the op's responsibility not to re-prepare the node when it isn't
appropriate to do so.
"""
pass
def make_c_thunk(self, node, storage_map, compute_map, no_recycling):
"""
Like make_thunk, but will only try to make a C thunk.
"""Like make_thunk, but will only try to make a C thunk.
"""
node_input_storage = [storage_map[r] for r in node.inputs]
......@@ -883,7 +886,8 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval.lazy = False
return rval
def make_thunk(self, node, storage_map, compute_map, no_recycling):
def make_thunk(self, node, storage_map, compute_map, no_recycling,
impl=None):
"""
This function must return a thunk, that is a zero-arguments
function that encapsulates the computation to be performed
......@@ -904,6 +908,9 @@ class Op(utils.object2, PureOp, CLinkerOp):
no_recycling
List of variables for which it is forbidden to reuse memory
allocated by a previous call.
impl
Currently, None, 'c' or 'py'. If 'c' or 'py' we will only try
that version of the code.
Notes
-----
......@@ -913,26 +920,26 @@ class Op(utils.object2, PureOp, CLinkerOp):
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
self.prepare_node(node, ...) is always called. If we try 'c' and it
fails and we then try 'py', prepare_node will be called twice.
"""
if impl is None or impl == 'c':
self.prepare_node(node, storage_map=storage_map,
compute_map=compute_map)
if not hasattr(self, '_op_use_c_code'):
warnings.warn(
"The __getstate__ method of '%s' is not implemented correctly."
" It should keep the attributes added by the base class."
" To implement it correctly, it should keep all attributes"
" and only remove those it does not want." % (self),
stacklevel=2)
if getattr(self, '_op_use_c_code', theano.config.cxx):
compute_map=compute_map, impl='c')
try:
return self.make_c_thunk(node, storage_map, compute_map,
no_recycling)
except (NotImplementedError, utils.MethodNotDefined):
# We requested the c code, so don't catch the error.
if impl == 'c':
raise
_logger.debug('Falling back on perform')
# condition: either there was no c_code, or it failed
# condition: either there was no c_code, or it failed or
# python code was requested.
self.prepare_node(node, storage_map=storage_map,
compute_map=compute_map, impl='py')
return self.make_py_thunk(node, storage_map, compute_map, no_recycling)
def make_node(self, *inputs):
......@@ -1195,8 +1202,8 @@ int main( int argc, const char* argv[] )
self.openmp = False
theano.config.openmp = False
def prepare_node(self, node, storage_map,
compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
if impl == 'c':
self.update_self_openmp()
......
......@@ -25,7 +25,7 @@ class IfElseIfElseIf(PureOp):
assert t3.type == f3.type
return Apply(self, [c1, t1, c2, t2, c3, t3, f3], [t1.type()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
def make_thunk(self, node, storage_map, compute_map, no_recycling, impl):
input_computed = [compute_map[v] for v in node.inputs]
output_computed = [compute_map[v] for v in node.outputs]
......@@ -93,7 +93,7 @@ class NotImplementedOp(PureOp):
def make_node(self, x):
return Apply(self, [x], [x.type()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
def make_thunk(self, node, storage_map, compute_map, no_recycling, impl):
def thunk():
raise self.E()
thunk.lazy = False
......
......@@ -1043,16 +1043,14 @@ class VM_Linker(link.LocalLinker):
t0 = time.time()
for node in order:
try:
impl = None
if self.c_thunks is False:
thunks.append(node.op.make_py_thunk(node,
storage_map,
compute_map,
no_recycling))
else:
impl = 'py'
thunks.append(node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling))
no_recycling,
impl=impl))
if not hasattr(thunks[-1], 'lazy'):
# We don't want every op maker to have to think about lazy Ops.
# So if they didn't specify whether it's lazy or not, it isn't.
......
......@@ -2640,7 +2640,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
def get_params(self, node):
return node.outputs[0].type.context
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
# cache the kernel object
self.get_kernel_cache(node)
......
......@@ -73,7 +73,7 @@ class CuRFFTOp(Op):
return theano.Apply(self, [inp, s], [self.output_type(inp)()])
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, _2, impl=None):
inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs]
......@@ -198,7 +198,7 @@ class CuIRFFTOp(Op):
return theano.Apply(self, [inp, s], [self.output_type(inp)()])
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, _2, impl=None):
inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs]
......
......@@ -235,7 +235,7 @@ class IfElse(Op):
if_true_op(*if_true, **dict(return_list=True)) +
if_false_op(*if_false, **dict(return_list=True)))
def make_py_thunk(self, node, storage_map, compute_map, no_recycling):
def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
cond = node.inputs[0]
ts = node.inputs[1:][:self.n_outs]
fs = node.inputs[1:][self.n_outs:]
......
......@@ -320,7 +320,7 @@ class PycudaElemwiseSourceModuleMakeThunkOp(Op):
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
return out_node
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, _2, impl=None):
# TODO support broadcast!
# TODO assert all input have the same shape
fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
......
......@@ -246,7 +246,7 @@ class GpuOp(theano.gof.Op):
"""
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
if use.device_number is None:
use("gpu",
force=True,
......
......@@ -2119,7 +2119,7 @@ class GpuConv(GpuOp):
images[2] * images[3] * 2)
return flops
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
super(GpuConv, self).prepare_node(node, storage_map, compute_map, impl)
if node.op.max_threads_dim0 is None:
......
......@@ -51,10 +51,7 @@ class GpuSolve(GpuOp):
assert inp2.ndim == 2
return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
def make_thunk(self,
node,
storage_map, _,
no_recycling=[]):
def make_thunk(self, node, storage_map, _, no_recycling, impl=None):
# Initialize CULA the first time it is needed
global cula_initialized
......
......@@ -1512,8 +1512,9 @@ class GpuDnnPool(DnnBase):
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def prepare_node(self, node, storage_map, compute_map):
super(GpuDnnPool, self).prepare_node(node, storage_map, compute_map)
def prepare_node(self, node, storage_map, compute_map, impl):
super(GpuDnnPool, self).prepare_node(
node, storage_map, compute_map, impl)
if len(node.inputs) == 2:
warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
......@@ -1752,7 +1753,7 @@ class GpuDnnPoolGrad(DnnBase):
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
if len(node.inputs) == 4:
warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
# Old interface
......
......@@ -49,20 +49,12 @@ class GpuCumsum(CumsumOp, GpuOp):
return theano.Apply(self, [x], [x.type()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
node_ = copy.copy(node)
assert node.op is node_.op
if node_.op.max_threads_dim0 is None or node_.op.max_grid_size1 is None or node_.op.max_grid_size2 is None:
cuda = theano.sandbox.cuda
device_id = cuda.use.device_number
if device_id is None:
cuda.use("gpu",
force=False,
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False,
test_driver=True)
device_id = cuda.use.device_number
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
prop = cuda_ndarray.device_properties(device_id)
node_.op.max_threads_dim0 = prop['maxThreadsDim0']
......@@ -70,7 +62,7 @@ class GpuCumsum(CumsumOp, GpuOp):
node_.op.max_grid_size2 = prop['maxGridSize2']
return super(GpuCumsum, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling)
compute_map, no_recycling, impl)
def __str__(self):
return "%s{%s}" % (self.__class__.__name__, self.axis)
......
......@@ -48,7 +48,7 @@ class ScikitsCudaOp(GpuOp):
return theano.Apply(self, [inp], [self.output_type(inp)()])
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, _2, impl=None):
if not scikits_cuda_available:
raise RuntimeError(
"scikits.cuda is needed for all GPU fft implementation,"
......@@ -61,7 +61,7 @@ class CuFFTOp(ScikitsCudaOp):
return CudaNdarrayType(
broadcastable=[False] * (inp.type.ndim + 1))
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, _2, impl=None):
super(CuFFTOp, self).make_thunk(node, storage_map, _, _2)
from theano.misc.pycuda_utils import to_gpuarray
......@@ -118,7 +118,7 @@ class CuIFFTOp(ScikitsCudaOp):
return CudaNdarrayType(
broadcastable=[False] * (inp.type.ndim - 1))
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, _2, impl=None):
super(CuIFFTOp, self).make_thunk(node, storage_map, _, _2)
from theano.misc.pycuda_utils import to_gpuarray
......@@ -314,7 +314,7 @@ class BatchedComplexDotOp(ScikitsCudaOp):
def output_type(self, inp):
return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, _2, impl=None):
super(BatchedComplexDotOp, self).make_thunk(node, storage_map, _, _2)
inputs = [storage_map[v] for v in node.inputs]
......
......@@ -3664,10 +3664,12 @@ class Composite(ScalarOp):
# self.init_name() # self.name
self.name = None
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
if impl == 'py':
self.init_py_impls() # self._impls
elif impl == 'c':
for n in theano.gof.graph.list_of_nodes(self.inputs, self.outputs):
n.op.prepare_node(n, None, None)
n.op.prepare_node(n, None, None, impl)
def output_types(self, input_types):
if tuple(input_types) != self.inputs_type:
......
......@@ -698,7 +698,7 @@ class Scan(PureOp):
scan_utils.hash_listsDictsTuples(self.info)))
def make_thunk(self, node, storage_map, compute_map, no_recycling,
python_exec=False):
impl=None):
"""
Parameters
......@@ -716,8 +716,8 @@ class Scan(PureOp):
no_recycling
List of variables for which it is forbidden to reuse memory
allocated by a previous call.
python_exec
If we want python execution.
impl
Use 'py' if we want python execution.
Notes
-----
If the thunk consults the storage_map on every call, it is safe
......@@ -866,7 +866,7 @@ class Scan(PureOp):
for out in self.fn.maker.fgraph.outputs]
try:
if python_exec is True:
if impl == 'py':
raise theano.gof.cmodule.MissingGXX
cython_mintaps = numpy.asarray(self.mintaps, dtype='int32')
cython_tap_array_len = \
......@@ -965,13 +965,6 @@ class Scan(PureOp):
rval.lazy = False
return rval
def make_py_thunk(self, node, storage_map, compute_map, no_recycling):
return self.make_thunk(node=node,
storage_map=storage_map,
compute_map=compute_map,
no_recycling=no_recycling,
python_exec=True)
def inner_seqs(self, list_inputs):
# Given the list of inner inputs this function grabs those
# corresponding to sequences
......
......@@ -297,9 +297,6 @@ class Ger(Op):
This interface to GER allows non-destructive operation on A via the
`destructive` argument to the constructor.
:TODO: Create better classes ScipyGer and CGer that inherit from this class
and override the make_thunk() method to use Scipy and C respectively.
"""
__props__ = ("destructive",)
......
......@@ -22,7 +22,7 @@ if have_fblas:
class ScipyGer(Ger):
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
if impl == 'py':
node.tag.local_ger = _blas_ger_fns[numpy.dtype(
node.inputs[0].type.dtype)]
......
......@@ -787,14 +787,15 @@ second dimension
return ret
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
# Postpone the ufunc building to the last minutes
# NumPy ufunc support only up to 31 inputs.
# But our c code support more.
if (len(node.inputs) < 32 and
(self.nfunc is None or
self.scalar_op.nin != len(node.inputs)) and
self.ufunc is None):
self.ufunc is None and
impl == 'py'):
ufunc = numpy.frompyfunc(self.scalar_op.impl,
len(node.inputs),
......@@ -830,7 +831,7 @@ second dimension
[get_scalar_type(dtype=output.type.dtype).make_variable()
for output in node.outputs])
self.scalar_op.prepare_node(node.tag.fake_node, None, None)
self.scalar_op.prepare_node(node.tag.fake_node, None, None, impl)
def perform(self, node, inputs, output_storage):
if len(node.inputs) >= 32:
......@@ -891,13 +892,6 @@ second dimension
if self.ufunc:
ufunc = self.ufunc
else:
if not hasattr(node.tag, 'ufunc'):
# It happen that make_thunk isn't called, like in
# get_scalar_constant_value
node.tag.ufunc = numpy.frompyfunc(self.scalar_op.impl,
len(node.inputs),
self.scalar_op.nout)
ufunc = node.tag.ufunc
nout = ufunc.nout
......@@ -977,7 +971,7 @@ second dimension
# To avoid requiring all of them to call prepare_node(), do it here.
# There is no harm if it gets called multiple times.
if not hasattr(node.tag, 'fake_node'):
self.prepare_node(node, None, None)
self.prepare_node(node, None, None, 'c')
_inames = inames
_onames = onames
......
......@@ -6295,15 +6295,12 @@ def constant_folding(node):
for o in node.outputs:
storage_map[o] = [None]
compute_map[o] = [False]
impl = None
if (hasattr(node.op, 'python_constant_folding') and
node.op.python_constant_folding(node)):
thunk = node.op.make_py_thunk(node,
storage_map,
compute_map,
[])
else:
impl = 'py'
thunk = node.op.make_thunk(node, storage_map, compute_map,
no_recycling=[])
no_recycling=[], impl=impl)
required = thunk()
assert not required # a node whose inputs are all provided should always
......
......@@ -241,7 +241,7 @@ class Pool(OpenMPOp):
" 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
self.mode = mode
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
if len(node.inputs) == 1:
# Old interface
self.mode = node.op.mode
......@@ -686,7 +686,7 @@ class PoolGrad(OpenMPOp):
self.mode = mode
super(PoolGrad, self).__init__(openmp=openmp)
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
if len(node.inputs) < 5: # 5 for AveragePoolGrad, 6 for MaxPoolGrad
# Old interface
self.mode = node.op.mode
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论