提交 4bc95151 作者: --global

Adapt Cython backend for mitmot prealloc

上级 589c9967
@@ -5808,7 +5808,7 @@ @@ -6667,7 +6667,7 @@
* cdef list stack * cdef list stack
* cdef int offset * cdef int offset
*/ */
- __pyx_t_4 = ((PyObject *)__pyx_v_self->descr); - __pyx_t_3 = ((PyObject *)__pyx_v_self->descr);
+ __pyx_t_4 = ((PyObject *)PyArray_DESCR(__pyx_v_self)); + __pyx_t_3 = ((PyObject *)PyArray_DESCR(__pyx_v_self));
__Pyx_INCREF(__pyx_t_4); __Pyx_INCREF(__pyx_t_3);
__pyx_v_descr = ((PyArray_Descr *)__pyx_t_4); __pyx_v_descr = ((PyArray_Descr *)__pyx_t_3);
__pyx_t_4 = 0; __pyx_t_3 = 0;
@@ -7337,7 +7337,7 @@ @@ -8237,7 +8237,7 @@
* arr.base = baseptr * arr.base = baseptr
* *
*/ */
- Py_XDECREF(__pyx_v_arr->base); - Py_XDECREF(__pyx_v_arr->base);
+ Py_XDECREF(PyArray_BASE(__pyx_v_arr)); + Py_XDECREF(PyArray_BASE(__pyx_v_arr));
/* "numpy.pxd":973 /* "numpy.pxd":973
* baseptr = <PyObject*>base * baseptr = <PyObject*>base
@@ -7346,7 +7346,11 @@ @@ -8246,7 +8246,11 @@
* *
* cdef inline object get_array_base(ndarray arr): * cdef inline object get_array_base(ndarray arr):
*/ */
- __pyx_v_arr->base = __pyx_v_baseptr; - __pyx_v_arr->base = __pyx_v_baseptr;
...@@ -26,19 +26,19 @@ ...@@ -26,19 +26,19 @@
+ #else + #else
+ PyArray_SetBaseObject(__pyx_v_arr, __pyx_v_baseptr); + PyArray_SetBaseObject(__pyx_v_arr, __pyx_v_baseptr);
+ #endif + #endif
__Pyx_RefNannyFinishContext(); __Pyx_RefNannyFinishContext();
} }
@@ -7376,7 +7376,7 @@ @@ -8285,7 +8285,7 @@
* return None * return None
* else: * else:
*/ */
- __pyx_t_1 = ((__pyx_v_arr->base == NULL) != 0); - __pyx_t_1 = ((__pyx_v_arr->base == NULL) != 0);
+ __pyx_t_1 = ((PyArray_BASE(__pyx_v_arr) == NULL) != 0); + __pyx_t_1 = ((PyArray_BASE(__pyx_v_arr) == NULL) != 0);
if (__pyx_t_1) { if (__pyx_t_1) {
/* "numpy.pxd":977 /* "numpy.pxd":977
@@ -7400,8 +7404,8 @@ @@ -8307,8 +8311,8 @@
* return <object>arr.base # <<<<<<<<<<<<<< * return <object>arr.base # <<<<<<<<<<<<<<
*/ */
__Pyx_XDECREF(__pyx_r); __Pyx_XDECREF(__pyx_r);
......
...@@ -879,6 +879,8 @@ class Scan(PureOp): ...@@ -879,6 +879,8 @@ class Scan(PureOp):
dtype='int32') dtype='int32')
cython_vector_outs = numpy.asarray(self.vector_outs, cython_vector_outs = numpy.asarray(self.vector_outs,
dtype='int32') dtype='int32')
cython_mitmots_preallocated = numpy.asarray(self.mitmots_preallocated,
dtype='int32')
if hasattr(self, 'destroy_map'): if hasattr(self, 'destroy_map'):
cython_destroy_map = [x in self.destroy_map cython_destroy_map = [x in self.destroy_map
...@@ -906,6 +908,7 @@ class Scan(PureOp): ...@@ -906,6 +908,7 @@ class Scan(PureOp):
cython_vector_outs, cython_vector_outs,
cython_mit_mot_out_slices, cython_mit_mot_out_slices,
cython_mit_mot_out_nslices, cython_mit_mot_out_nslices,
cython_mitmots_preallocated,
self.fn.fn, self.fn.fn,
self.fn, self.fn,
cython_destroy_map, cython_destroy_map,
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -62,7 +62,7 @@ import copy ...@@ -62,7 +62,7 @@ import copy
def get_version(): def get_version():
return 0.286 return 0.287
@cython.boundscheck(False) @cython.boundscheck(False)
def perform( def perform(
...@@ -82,6 +82,7 @@ def perform( ...@@ -82,6 +82,7 @@ def perform(
numpy.ndarray[numpy.int32_t,ndim=1] vector_outs, numpy.ndarray[numpy.int32_t,ndim=1] vector_outs,
numpy.ndarray[numpy.int32_t,ndim=2] mit_mot_out_slices, numpy.ndarray[numpy.int32_t,ndim=2] mit_mot_out_slices,
numpy.ndarray[numpy.int32_t,ndim=1] mit_mot_out_nslices, numpy.ndarray[numpy.int32_t,ndim=1] mit_mot_out_nslices,
numpy.ndarray[numpy.int32_t,ndim=1] mitmots_preallocated,
fn, fn,
fnct, fnct,
numpy.ndarray[numpy.int32_t,ndim=1] destroy_map, numpy.ndarray[numpy.int32_t,ndim=1] destroy_map,
...@@ -183,7 +184,7 @@ def perform( ...@@ -183,7 +184,7 @@ def perform(
cdef unsigned int idx cdef unsigned int idx
cdef unsigned int i cdef unsigned int i
cdef unsigned int j cdef unsigned int j
cdef unsigned int k cdef int k
cdef unsigned int kdx cdef unsigned int kdx
cdef unsigned int tdx cdef unsigned int tdx
cdef unsigned int pdx cdef unsigned int pdx
...@@ -194,6 +195,7 @@ def perform( ...@@ -194,6 +195,7 @@ def perform(
cdef unsigned int len_output_storage = (n_mit_mot_outs + n_mit_sot + cdef unsigned int len_output_storage = (n_mit_mot_outs + n_mit_sot +
n_sit_sot + n_nit_sot + n_sit_sot + n_nit_sot +
n_shared_outs) n_shared_outs)
cdef int input_reused[500] # max 500 inputs
cdef int output_reused[500] # max 500 outputs cdef int output_reused[500] # max 500 outputs
...@@ -254,6 +256,9 @@ def perform( ...@@ -254,6 +256,9 @@ def perform(
offset = nit_sot_arg_offset + n_nit_sot offset = nit_sot_arg_offset + n_nit_sot
other_args = args[offset:] other_args = args[offset:]
input_storage = fnct.input_storage input_storage = fnct.input_storage
len_input_storage = len(input_storage)
old_input_storage = [None] * len_input_storage
old_input_data = [None] * len_input_storage
output_storage = fnct.output_storage output_storage = fnct.output_storage
old_output_storage = [None] * len_output_storage old_output_storage = [None] * len_output_storage
old_output_data = [None] * len_output_storage old_output_data = [None] * len_output_storage
...@@ -312,11 +317,13 @@ def perform( ...@@ -312,11 +317,13 @@ def perform(
# 4. collecting slices where the output should be stored # 4. collecting slices where the output should be stored
# 4.1. Collect slices for mitmots # 4.1. Collect slices for mitmots
offset = 0
for idx in range(n_mit_mot_outs): for idx in range(n_mit_mot_outs):
output_storage[idx].storage[0] = None if not mitmots_preallocated[<unsigned int>idx]:
output_storage[<unsigned int>offset].storage[0] = None
offset += 1
# 4.2. Collect slices for mitsots, sitsots and nitsots # 4.2. Collect slices for mitsots, sitsots and nitsots
offset = n_mit_mot_outs
if i != 0: if i != 0:
for idx in range(n_outs + n_nit_sot - n_mit_mot): for idx in range(n_outs + n_nit_sot - n_mit_mot):
if ( store_steps[<unsigned int>(idx+n_mit_mot)] == 1 or if ( store_steps[<unsigned int>(idx+n_mit_mot)] == 1 or
...@@ -358,6 +365,23 @@ def perform( ...@@ -358,6 +365,23 @@ def perform(
else: else:
old_output_data[idx] = None old_output_data[idx] = None
# 4.6. Keep a reference to the variables (ndarrays, CudaNdarrays,
# etc) currently in the input_storage to be able to compare them
# with the content of the input_storage after the execution of the
# function. Also keep pointers to their data to be able to detect
# cases where outputs reuse the allocated object but alter the
# memory region they refer to.
for idx in xrange(len(input_storage)):
var = input_storage[idx].storage[0]
old_input_storage[idx] = var
if hasattr(var, 'gpudata'):
old_input_data[idx] = var.gpudata
elif hasattr(var, 'data'):
old_input_data[idx] = var.data
else:
old_input_data[idx] = None
# 5. compute outputs # 5. compute outputs
t0_fn = time.time() t0_fn = time.time()
...@@ -402,13 +426,57 @@ def perform( ...@@ -402,13 +426,57 @@ def perform(
else: else:
output_reused[idx] = False output_reused[idx] = False
# Check which of the input storage have been modified by the inner
# function
for idx in xrange(len(input_storage)):
# If the storage map does not contain the same object, then
# the pre-allocated output has not been reused
new_var = input_storage[idx].storage[0]
if old_input_storage[idx] is new_var:
# The pre-allocated output is only considered as having
# been reused if it still points to the same data as it
# did before the execution of the inner function
if old_input_data[idx] is None:
input_reused[idx] = False
else:
if hasattr(new_var, 'gpudata'):
input_reused[idx] = (new_var.gpudata ==
old_input_data[idx])
elif hasattr(new_var, 'data'):
input_reused[idx] = (new_var.data ==
old_input_data[idx])
else:
input_reused[idx] = False
offset_out = 0 offset_out = 0
# 5.1 Copy over the values for mit_mot outputs # 5.1 Copy over the values for mit_mot outputs
for j in range(n_mit_mot): mitmot_inp_offset = self.n_seqs
for kdx in range(mit_mot_out_nslices[j]): mitmot_out_idx = 0
k = mit_mot_out_slices[j,kdx] for j in xrange(self.n_mit_mot):
outs[j][0][<unsigned int>(k+pos[j])] = output_storage[offset_out].storage[0] for k in self.mit_mot_out_slices[j]:
offset_out += 1 if mitmots_preallocated[<unsigned int>mitmot_out_idx]:
# This output tap has been preallocated. If the
# corresponding input storage has been replaced,
# recover the value as usual. Otherwise, the input was
# modified inplace and nothing needs to be done.
inp_idx = (mitmot_inp_offset +
self.tap_array[j].index(k))
if not input_reused[inp_idx]:
outs[j][0][<unsigned int>(k + pos[j])] = \
input_storage[<unsigned int>inp_idx].storage[0]
else:
# This output tap has not been preallocated, recover
# its value as usual
outs[j][0][<unsigned int>(k + pos[j])] = \
output_storage[<unsigned int>offset_out].storage[0]
offset_out += 1
mitmot_out_idx += 1
mitmot_inp_offset += len(self.tap_array[j])
# 5.2 Copy over the values for mit_sot/sit_sot outputs # 5.2 Copy over the values for mit_sot/sit_sot outputs
begin = n_mit_mot begin = n_mit_mot
......
...@@ -17,7 +17,7 @@ from theano.gof import cmodule ...@@ -17,7 +17,7 @@ from theano.gof import cmodule
_logger = logging.getLogger('theano.scan_module.scan_perform') _logger = logging.getLogger('theano.scan_module.scan_perform')
version = 0.286 # must match constant returned in function get_version() version = 0.287 # must match constant returned in function get_version()
need_reload = False need_reload = False
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论