Catch MemoryError better in PE's block matching algo

fixes #264 (for good this time, hopefully)
2025-07-15 18:53:19 +00:00 · 2014-10-05 22:22:59 -04:00 · 2014-10-05 22:22:59 -04:00 · 321f8ab406
commit 321f8ab406
parent 5b3d5f5d1c
3 changed files with 43 additions and 40 deletions
--- a/core_pe/matchblock.py
+++ b/core_pe/matchblock.py
@ -193,13 +193,13 @@ def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nul
        # some wiggle room, log about the incident, and stop matching right here. We then process
        # the matches we have. The rest of the process doesn't allocate much and we should be
        # alright.
-        del matches[-1000:] # some wiggle room to ensure we don't run out of memory again.
+        del comparisons_to_do, chunks, pictures # some wiggle room for the next statements
-        logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches) + 1000)
+        logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches))
        del matches[-len(matches)//3:] # some wiggle room to ensure we don't run out of memory again.
    pool.close()
    result = []
    myiter = j.iter_with_progress(
-        iterconsume(matches),
+        iterconsume(matches, reverse=False),
        tr("Verified %d/%d matches"),
        every=10,
        count=len(matches),
--- a/hscommon/tests/util_test.py
+++ b/hscommon/tests/util_test.py
@ -1,9 +1,9 @@
 # Created By: Virgil Dupras
 # Created On: 2011-01-11
 # Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
-# 
+#
-# This software is licensed under the "BSD" License as described in the "LICENSE" file, 
+# This software is licensed under the "BSD" License as described in the "LICENSE" file,
-# which should be included with this package. The terms are also available at 
+# which should be included with this package. The terms are also available at
 # http://www.hardcoded.net/licenses/bsd_license
 from io import StringIO
@ -65,6 +65,12 @@ def test_trailiter():
    eq_(list(trailiter(['foo', 'bar'], skipfirst=True)), [('foo', 'bar')])
    eq_(list(trailiter([], skipfirst=True)), []) # no crash
 def test_iterconsume():
    # We just want to make sure that we return *all* items and that we're not mistakenly skipping
    # one.
    eq_(list(range(2500)), list(iterconsume(list(range(2500)))))
    eq_(list(reversed(range(2500))), list(iterconsume(list(range(2500)), reverse=False)))
 #--- String
 def test_escape():
@ -188,63 +194,63 @@ class TestCase_modified_after:
        monkeyplus.patch_osstat('first', st_mtime=42)
        monkeyplus.patch_osstat('second', st_mtime=41)
        assert modified_after('first', 'second')
-    
+
    def test_second_is_modified_after(self, monkeyplus):
        monkeyplus.patch_osstat('first', st_mtime=42)
        monkeyplus.patch_osstat('second', st_mtime=43)
        assert not modified_after('first', 'second')
-    
+
    def test_same_mtime(self, monkeyplus):
        monkeyplus.patch_osstat('first', st_mtime=42)
        monkeyplus.patch_osstat('second', st_mtime=42)
        assert not modified_after('first', 'second')
-    
+
    def test_first_file_does_not_exist(self, monkeyplus):
        # when the first file doesn't exist, we return False
        monkeyplus.patch_osstat('second', st_mtime=42)
        assert not modified_after('does_not_exist', 'second') # no crash
-    
+
    def test_second_file_does_not_exist(self, monkeyplus):
        # when the second file doesn't exist, we return True
        monkeyplus.patch_osstat('first', st_mtime=42)
        assert modified_after('first', 'does_not_exist') # no crash
-    
+
    def test_first_file_is_none(self, monkeyplus):
        # when the first file is None, we return False
        monkeyplus.patch_osstat('second', st_mtime=42)
        assert not modified_after(None, 'second') # no crash
-    
+
    def test_second_file_is_none(self, monkeyplus):
        # when the second file is None, we return True
        monkeyplus.patch_osstat('first', st_mtime=42)
        assert modified_after('first', None) # no crash
-    
+
 class TestCase_delete_if_empty:
    def test_is_empty(self, tmpdir):
        testpath = Path(str(tmpdir))
        assert delete_if_empty(testpath)
        assert not testpath.exists()
-    
+
    def test_not_empty(self, tmpdir):
        testpath = Path(str(tmpdir))
        testpath['foo'].mkdir()
        assert not delete_if_empty(testpath)
        assert testpath.exists()
-    
+
    def test_with_files_to_delete(self, tmpdir):
        testpath = Path(str(tmpdir))
        testpath['foo'].open('w')
        testpath['bar'].open('w')
        assert delete_if_empty(testpath, ['foo', 'bar'])
        assert not testpath.exists()
-    
+
    def test_directory_in_files_to_delete(self, tmpdir):
        testpath = Path(str(tmpdir))
        testpath['foo'].mkdir()
        assert not delete_if_empty(testpath, ['foo'])
        assert testpath.exists()
-    
+
    def test_delete_files_to_delete_only_if_dir_is_empty(self, tmpdir):
        testpath = Path(str(tmpdir))
        testpath['foo'].open('w')
@ -252,25 +258,25 @@ class TestCase_delete_if_empty:
        assert not delete_if_empty(testpath, ['foo'])
        assert testpath.exists()
        assert testpath['foo'].exists()
-    
+
    def test_doesnt_exist(self):
        # When the 'path' doesn't exist, just do nothing.
        delete_if_empty(Path('does_not_exist')) # no crash
-    
+
    def test_is_file(self, tmpdir):
        # When 'path' is a file, do nothing.
        p = Path(str(tmpdir)) + 'filename'
        p.open('w').close()
        delete_if_empty(p) # no crash
-    
+
    def test_ioerror(self, tmpdir, monkeypatch):
        # if an IO error happens during the operation, ignore it.
        def do_raise(*args, **kw):
            raise OSError()
-        
+
        monkeypatch.setattr(Path, 'rmdir', do_raise)
        delete_if_empty(Path(str(tmpdir))) # no crash
-    
+
 class TestCase_open_if_filename:
    def test_file_name(self, tmpdir):
@ -280,7 +286,7 @@ class TestCase_open_if_filename:
        assert close
        eq_(b'test_data', file.read())
        file.close()
-    
+
    def test_opened_file(self):
        sio = StringIO()
        sio.write('test_data')
@ -288,14 +294,14 @@ class TestCase_open_if_filename:
        file, close = open_if_filename(sio)
        assert not close
        eq_('test_data', file.read())
-    
+
    def test_mode_is_passed_to_open(self, tmpdir):
        filepath = str(tmpdir.join('test.txt'))
        open(filepath, 'w').close()
        file, close = open_if_filename(filepath, 'a')
        eq_('a', file.mode)
        file.close()
-    
+
 class TestCase_FileOrPath:
    def test_path(self, tmpdir):
@ -303,17 +309,17 @@ class TestCase_FileOrPath:
        open(filepath, 'wb').write(b'test_data')
        with FileOrPath(filepath) as fp:
            eq_(b'test_data', fp.read())
-    
+
    def test_opened_file(self):
        sio = StringIO()
        sio.write('test_data')
        sio.seek(0)
        with FileOrPath(sio) as fp:
            eq_('test_data', fp.read())
-    
+
    def test_mode_is_passed_to_open(self, tmpdir):
        filepath = str(tmpdir.join('test.txt'))
        open(filepath, 'w').close()
        with FileOrPath(filepath, 'a') as fp:
            eq_('a', fp.mode)
-    
+
--- a/hscommon/util.py
+++ b/hscommon/util.py
@ -117,23 +117,20 @@ def trailiter(iterable, skipfirst=False):
        yield prev, item
        prev = item
-def iterconsume(seq):
+def iterconsume(seq, reverse=True):
-    """Iterate over ``seq`` and discard yielded objects.
+    """Iterate over ``seq`` and pop yielded objects.
-    Right after the ``yield``, we replace the element we've just yielded by ``None`` in the
+    Because we use the ``pop()`` method, we reverse ``seq`` before proceeding. If you don't need
-    sequence.
+    to do that, set ``reverse`` to ``False``.
    This is useful in tight memory situations where you are looping over a sequence of objects that
    are going to be discarded afterwards. If you're creating other objects during that iteration,
    you might want to use this to avoid ``MemoryError``.
    Note that this only works for sequences (index accessible), not all iterables.
    """
-    # We don't use ``del``, because it would be disastrous performance-wise as the array would have
+    if reverse:
-    # to be constantly re-allocated.
+        seq.reverse()
-    for index, elem in enumerate(seq):
+    while seq:
-        seq[index] = None
+        yield seq.pop()
        yield elem
 #--- String related