mirror of https://github.com/arsenetar/dupeguru.git synced 2025-03-10 05:34:36 +00:00

Catch MemoryError in PE's block matching algo

fixes #264 (hopefully)
Virgil Dupras 2014-10-05 17:13:36 -04:00
parent 44266273bf
commit 372a682610
3 changed files with 95 additions and 62 deletions

core_pe/matchblock.py

@@ -10,7 +10,7 @@ import logging
 import multiprocessing
 from itertools import combinations
-from hscommon.util import extract
+from hscommon.util import extract, iterconsume
 from hscommon.trans import tr
 from hscommon.jobprogress import job
@@ -175,6 +175,7 @@ def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nul
     comparisons_to_do = list(combinations(chunks + [None], 2))
     comparison_count = 0
     j.start_job(len(comparisons_to_do))
-    for ref_chunk, other_chunk in comparisons_to_do:
-        picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}
-        ref_ids = [p.cache_id for p in ref_chunk]
+    try:
+        for ref_chunk, other_chunk in comparisons_to_do:
+            picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}
+            ref_ids = [p.cache_id for p in ref_chunk]
@@ -187,13 +188,21 @@ def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nul
-        async_results.append(pool.apply_async(async_compare, args))
-        collect_results()
-    collect_results(collect_all=True)
+            async_results.append(pool.apply_async(async_compare, args))
+            collect_results()
+        collect_results(collect_all=True)
+    except MemoryError:
+        # Rare, but possible, even in 64bit situations (ref #264). What do we do now? We free up
+        # some wiggle room, log about the incident, and stop matching right here. We then process
+        # the matches we have. The rest of the process doesn't allocate much and we should be
+        # alright.
+        del matches[-1000:]  # some wiggle room to ensure we don't run out of memory again.
+        logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches) + 1000)
     pool.close()
     result = []
     myiter = j.iter_with_progress(
-        matches,
+        iterconsume(matches),
         tr("Verified %d/%d matches"),
-        every=10
+        every=10,
+        count=len(matches),
     )
     for ref_id, other_id, percentage in myiter:
         ref = id2picture[ref_id]
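
A minimal, self-contained sketch of the recovery pattern this hunk applies, with hypothetical ``pairs``/``compare`` inputs standing in for dupeGuru's chunked worker pool: accumulate results, catch MemoryError, trim the tail of the list to regain headroom, log the incident, and continue with what was collected.

    import logging

    def collect_matches(pairs, compare):
        # Accumulate comparison results; on MemoryError, keep what we already have.
        matches = []
        try:
            for a, b in pairs:
                matches.append((a, b, compare(a, b)))
        except MemoryError:
            # Drop the most recent results to free some memory, then continue
            # with the truncated list instead of crashing the whole scan.
            del matches[-1000:]
            logging.warning("Ran out of memory! Kept %d matches.", len(matches))
        return matches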

hscommon/jobprogress/job.py

@@ -80,21 +80,27 @@ class Job:
     def check_if_cancelled(self):
         self._do_update('')

-    def iter_with_progress(self, sequence, desc_format=None, every=1):
-        ''' Iterate through sequence while automatically adding progress.
-        '''
+    def iter_with_progress(self, iterable, desc_format=None, every=1, count=None):
+        """Iterate through ``iterable`` while automatically adding progress.
+
+        WARNING: We need our iterable's length. If ``iterable`` is not a sequence (that is,
+        something we can call ``len()`` on), you *have* to specify a count through the ``count``
+        argument. If ``count`` is ``None``, ``len(iterable)`` is used.
+        """
+        if count is None:
+            count = len(iterable)
         desc = ''
         if desc_format:
-            desc = desc_format % (0, len(sequence))
-        self.start_job(len(sequence), desc)
-        for i, element in enumerate(sequence, start=1):
+            desc = desc_format % (0, count)
+        self.start_job(count, desc)
+        for i, element in enumerate(iterable, start=1):
             yield element
             if i % every == 0:
                 if desc_format:
-                    desc = desc_format % (i, len(sequence))
+                    desc = desc_format % (i, count)
                 self.add_progress(progress=every, desc=desc)
         if desc_format:
-            desc = desc_format % (len(sequence), len(sequence))
+            desc = desc_format % (count, count)
         self.set_progress(100, desc)

     def start_job(self, max_progress=100, desc=''):
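
The point of the new ``count`` argument: a generator such as ``iterconsume(matches)`` has no ``len()``, so the caller must supply the length up front. A standalone sketch of the same pattern, with ``report`` as a hypothetical stand-in for the job's progress updates:

    def with_progress(iterable, count, report, every=1):
        # Yield elements while reporting progress every ``every`` items.
        for i, element in enumerate(iterable, start=1):
            yield element
            if i % every == 0:
                report(i, count)
        report(count, count)

    items = list(range(25))
    squares = (x * x for x in items)  # a generator: len(squares) would raise TypeError
    for _ in with_progress(squares, len(items), lambda i, n: print("%d/%d" % (i, n)), every=10):
        pass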

hscommon/util.py

@@ -117,6 +117,24 @@ def trailiter(iterable, skipfirst=False):
         yield prev, item
         prev = item

+def iterconsume(seq):
+    """Iterate over ``seq`` and discard yielded objects.
+
+    Right after the ``yield``, we replace the element we've just yielded by ``None`` in the
+    sequence.
+
+    This is useful in tight memory situations where you are looping over a sequence of objects
+    that are going to be discarded afterwards. If you're creating other objects during that
+    iteration, you might want to use this to avoid ``MemoryError``.
+
+    Note that this only works for sequences (index accessible), not all iterables.
+    """
+    # We don't use ``del``, because it would be disastrous performance-wise as the array would
+    # have to be constantly re-allocated.
+    for index, elem in enumerate(seq):
+        seq[index] = None
+        yield elem
+
 #--- String related
 def escape(s, to_escape, escape_with='\\'):
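
Why this helps in the first file: ``matches`` can be a very large list of tuples, and the verification loop builds new objects while walking it. Nulling each slot right after it is yielded lets Python reclaim the tuples during the loop instead of after it. A quick self-contained demonstration (sizes are illustrative):

    def iterconsume(seq):
        # Same technique as the diff above: null out each slot after yielding it.
        for index, elem in enumerate(seq):
            seq[index] = None
            yield elem

    data = [tuple(range(100)) for _ in range(5)]
    kept = [sum(item) for item in iterconsume(data)]
    print(data)  # [None, None, None, None, None]: originals freed as we went
    print(kept)  # [4950, 4950, 4950, 4950, 4950]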