mirror of https://github.com/arsenetar/dupeguru.git synced 2025-03-10 05:34:36 +00:00

Catch MemoryError in PE's block matching algo

fixes #264 (hopefully)
Virgil Dupras 2014-10-05 17:13:36 -04:00
parent 44266273bf
commit 372a682610
3 changed files with 95 additions and 62 deletions

core_pe/matchblock.py

@@ -10,7 +10,7 @@ import logging
 import multiprocessing
 from itertools import combinations
-from hscommon.util import extract
+from hscommon.util import extract, iterconsume
 from hscommon.trans import tr
 from hscommon.jobprogress import job
@@ -175,6 +175,7 @@ def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nul
     comparisons_to_do = list(combinations(chunks + [None], 2))
     comparison_count = 0
     j.start_job(len(comparisons_to_do))
-    for ref_chunk, other_chunk in comparisons_to_do:
-        picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}
-        ref_ids = [p.cache_id for p in ref_chunk]
+    try:
+        for ref_chunk, other_chunk in comparisons_to_do:
+            picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}
+            ref_ids = [p.cache_id for p in ref_chunk]
@@ -187,13 +188,21 @@ def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nul
-        async_results.append(pool.apply_async(async_compare, args))
-        collect_results()
-    collect_results(collect_all=True)
+            async_results.append(pool.apply_async(async_compare, args))
+            collect_results()
+        collect_results(collect_all=True)
+    except MemoryError:
+        # Rare, but possible, even in 64bit situations (ref #264). What do we do now? We free up
+        # some wiggle room, log about the incident, and stop matching right here. We then process
+        # the matches we have. The rest of the process doesn't allocate much and we should be
+        # alright.
+        del matches[-1000:]  # some wiggle room to ensure we don't run out of memory again.
+        logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches) + 1000)
     pool.close()
     result = []
     myiter = j.iter_with_progress(
-        matches,
+        iterconsume(matches),
         tr("Verified %d/%d matches"),
-        every=10
+        every=10,
+        count=len(matches),
     )
     for ref_id, other_id, percentage in myiter:
         ref = id2picture[ref_id]
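
A minimal, self-contained sketch of the recovery pattern this hunk applies, with hypothetical ``pairs``/``compare`` inputs standing in for dupeGuru's chunked worker pool: accumulate results, catch MemoryError, trim the tail of the list to regain headroom, log the incident, and continue with what was collected.

    import logging

    def collect_matches(pairs, compare):
        # Accumulate comparison results; on MemoryError, keep what we already have.
        matches = []
        try:
            for a, b in pairs:
                matches.append((a, b, compare(a, b)))
        except MemoryError:
            # Drop the most recent results to free some memory, then continue
            # with the truncated list instead of crashing the whole scan.
            del matches[-1000:]
            logging.warning("Ran out of memory! Kept %d matches.", len(matches))
        return matches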

hscommon/jobprogress/job.py

@@ -80,21 +80,27 @@ class Job:
     def check_if_cancelled(self):
         self._do_update('')

-    def iter_with_progress(self, sequence, desc_format=None, every=1):
-        ''' Iterate through sequence while automatically adding progress.
-        '''
+    def iter_with_progress(self, iterable, desc_format=None, every=1, count=None):
+        """Iterate through ``iterable`` while automatically adding progress.
+
+        WARNING: We need our iterable's length. If ``iterable`` is not a sequence (that is,
+        something we can call ``len()`` on), you *have* to specify a count through the ``count``
+        argument. If ``count`` is ``None``, ``len(iterable)`` is used.
+        """
+        if count is None:
+            count = len(iterable)
         desc = ''
         if desc_format:
-            desc = desc_format % (0, len(sequence))
-        self.start_job(len(sequence), desc)
-        for i, element in enumerate(sequence, start=1):
+            desc = desc_format % (0, count)
+        self.start_job(count, desc)
+        for i, element in enumerate(iterable, start=1):
             yield element
             if i % every == 0:
                 if desc_format:
-                    desc = desc_format % (i, len(sequence))
+                    desc = desc_format % (i, count)
                 self.add_progress(progress=every, desc=desc)
         if desc_format:
-            desc = desc_format % (len(sequence), len(sequence))
+            desc = desc_format % (count, count)
         self.set_progress(100, desc)

     def start_job(self, max_progress=100, desc=''):
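
The point of the new ``count`` argument: a generator such as ``iterconsume(matches)`` has no ``len()``, so the caller must supply the length up front. A standalone sketch of the same pattern, with ``report`` as a hypothetical stand-in for the job's progress updates:

    def with_progress(iterable, count, report, every=1):
        # Yield elements while reporting progress every ``every`` items.
        for i, element in enumerate(iterable, start=1):
            yield element
            if i % every == 0:
                report(i, count)
        report(count, count)

    items = list(range(25))
    squares = (x * x for x in items)  # a generator: len(squares) would raise TypeError
    for _ in with_progress(squares, len(items), lambda i, n: print("%d/%d" % (i, n)), every=10):
        pass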

hscommon/util.py

@@ -117,6 +117,24 @@ def trailiter(iterable, skipfirst=False):
         yield prev, item
         prev = item

+def iterconsume(seq):
+    """Iterate over ``seq`` and discard yielded objects.
+
+    Right after the ``yield``, we replace the element we've just yielded by ``None`` in the
+    sequence.
+
+    This is useful in tight memory situations where you are looping over a sequence of objects
+    that are going to be discarded afterwards. If you're creating other objects during that
+    iteration, you might want to use this to avoid ``MemoryError``.
+
+    Note that this only works for sequences (index accessible), not all iterables.
+    """
+    # We don't use ``del``, because it would be disastrous performance-wise as the array would
+    # have to be constantly re-allocated.
+    for index, elem in enumerate(seq):
+        seq[index] = None
+        yield elem
+
 #--- String related
 def escape(s, to_escape, escape_with='\\'):
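
Why this helps in the first file: ``matches`` can be a very large list of tuples, and the verification loop builds new objects while walking it. Nulling each slot right after it is yielded lets Python reclaim the tuples during the loop instead of after it. A quick self-contained demonstration (sizes are illustrative):

    def iterconsume(seq):
        # Same technique as the diff above: null out each slot after yielding it.
        for index, elem in enumerate(seq):
            seq[index] = None
            yield elem

    data = [tuple(range(100)) for _ in range(5)]
    kept = [sum(item) for item in iterconsume(data)]
    print(data)  # [None, None, None, None, None]: originals freed as we went
    print(kept)  # [4950, 4950, 4950, 4950, 4950]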