1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2025-05-08 09:49:51 +00:00

Catch MemoryError better in PE's block matching algo

fixes #264 (for good this time, hopefully)
This commit is contained in:
Virgil Dupras 2014-10-05 22:22:59 -04:00
parent 5b3d5f5d1c
commit 321f8ab406
3 changed files with 43 additions and 40 deletions

View File

@ -193,13 +193,13 @@ def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nul
# some wiggle room, log about the incident, and stop matching right here. We then process # some wiggle room, log about the incident, and stop matching right here. We then process
# the matches we have. The rest of the process doesn't allocate much and we should be # the matches we have. The rest of the process doesn't allocate much and we should be
# alright. # alright.
del matches[-1000:] # some wiggle room to ensure we don't run out of memory again. del comparisons_to_do, chunks, pictures # some wiggle room for the next statements
logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches) + 1000) logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches))
del matches[-len(matches)//3:] # some wiggle room to ensure we don't run out of memory again.
pool.close() pool.close()
result = [] result = []
myiter = j.iter_with_progress( myiter = j.iter_with_progress(
iterconsume(matches), iterconsume(matches, reverse=False),
tr("Verified %d/%d matches"), tr("Verified %d/%d matches"),
every=10, every=10,
count=len(matches), count=len(matches),

View File

@ -1,9 +1,9 @@
# Created By: Virgil Dupras # Created By: Virgil Dupras
# Created On: 2011-01-11 # Created On: 2011-01-11
# Copyright 2014 Hardcoded Software (http://www.hardcoded.net) # Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
# #
# This software is licensed under the "BSD" License as described in the "LICENSE" file, # This software is licensed under the "BSD" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at # which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/bsd_license # http://www.hardcoded.net/licenses/bsd_license
from io import StringIO from io import StringIO
@ -65,6 +65,12 @@ def test_trailiter():
eq_(list(trailiter(['foo', 'bar'], skipfirst=True)), [('foo', 'bar')]) eq_(list(trailiter(['foo', 'bar'], skipfirst=True)), [('foo', 'bar')])
eq_(list(trailiter([], skipfirst=True)), []) # no crash eq_(list(trailiter([], skipfirst=True)), []) # no crash
def test_iterconsume():
# We just want to make sure that we return *all* items and that we're not mistakenly skipping
# one.
eq_(list(range(2500)), list(iterconsume(list(range(2500)))))
eq_(list(reversed(range(2500))), list(iterconsume(list(range(2500)), reverse=False)))
#--- String #--- String
def test_escape(): def test_escape():
@ -188,63 +194,63 @@ class TestCase_modified_after:
monkeyplus.patch_osstat('first', st_mtime=42) monkeyplus.patch_osstat('first', st_mtime=42)
monkeyplus.patch_osstat('second', st_mtime=41) monkeyplus.patch_osstat('second', st_mtime=41)
assert modified_after('first', 'second') assert modified_after('first', 'second')
def test_second_is_modified_after(self, monkeyplus): def test_second_is_modified_after(self, monkeyplus):
monkeyplus.patch_osstat('first', st_mtime=42) monkeyplus.patch_osstat('first', st_mtime=42)
monkeyplus.patch_osstat('second', st_mtime=43) monkeyplus.patch_osstat('second', st_mtime=43)
assert not modified_after('first', 'second') assert not modified_after('first', 'second')
def test_same_mtime(self, monkeyplus): def test_same_mtime(self, monkeyplus):
monkeyplus.patch_osstat('first', st_mtime=42) monkeyplus.patch_osstat('first', st_mtime=42)
monkeyplus.patch_osstat('second', st_mtime=42) monkeyplus.patch_osstat('second', st_mtime=42)
assert not modified_after('first', 'second') assert not modified_after('first', 'second')
def test_first_file_does_not_exist(self, monkeyplus): def test_first_file_does_not_exist(self, monkeyplus):
# when the first file doesn't exist, we return False # when the first file doesn't exist, we return False
monkeyplus.patch_osstat('second', st_mtime=42) monkeyplus.patch_osstat('second', st_mtime=42)
assert not modified_after('does_not_exist', 'second') # no crash assert not modified_after('does_not_exist', 'second') # no crash
def test_second_file_does_not_exist(self, monkeyplus): def test_second_file_does_not_exist(self, monkeyplus):
# when the second file doesn't exist, we return True # when the second file doesn't exist, we return True
monkeyplus.patch_osstat('first', st_mtime=42) monkeyplus.patch_osstat('first', st_mtime=42)
assert modified_after('first', 'does_not_exist') # no crash assert modified_after('first', 'does_not_exist') # no crash
def test_first_file_is_none(self, monkeyplus): def test_first_file_is_none(self, monkeyplus):
# when the first file is None, we return False # when the first file is None, we return False
monkeyplus.patch_osstat('second', st_mtime=42) monkeyplus.patch_osstat('second', st_mtime=42)
assert not modified_after(None, 'second') # no crash assert not modified_after(None, 'second') # no crash
def test_second_file_is_none(self, monkeyplus): def test_second_file_is_none(self, monkeyplus):
# when the second file is None, we return True # when the second file is None, we return True
monkeyplus.patch_osstat('first', st_mtime=42) monkeyplus.patch_osstat('first', st_mtime=42)
assert modified_after('first', None) # no crash assert modified_after('first', None) # no crash
class TestCase_delete_if_empty: class TestCase_delete_if_empty:
def test_is_empty(self, tmpdir): def test_is_empty(self, tmpdir):
testpath = Path(str(tmpdir)) testpath = Path(str(tmpdir))
assert delete_if_empty(testpath) assert delete_if_empty(testpath)
assert not testpath.exists() assert not testpath.exists()
def test_not_empty(self, tmpdir): def test_not_empty(self, tmpdir):
testpath = Path(str(tmpdir)) testpath = Path(str(tmpdir))
testpath['foo'].mkdir() testpath['foo'].mkdir()
assert not delete_if_empty(testpath) assert not delete_if_empty(testpath)
assert testpath.exists() assert testpath.exists()
def test_with_files_to_delete(self, tmpdir): def test_with_files_to_delete(self, tmpdir):
testpath = Path(str(tmpdir)) testpath = Path(str(tmpdir))
testpath['foo'].open('w') testpath['foo'].open('w')
testpath['bar'].open('w') testpath['bar'].open('w')
assert delete_if_empty(testpath, ['foo', 'bar']) assert delete_if_empty(testpath, ['foo', 'bar'])
assert not testpath.exists() assert not testpath.exists()
def test_directory_in_files_to_delete(self, tmpdir): def test_directory_in_files_to_delete(self, tmpdir):
testpath = Path(str(tmpdir)) testpath = Path(str(tmpdir))
testpath['foo'].mkdir() testpath['foo'].mkdir()
assert not delete_if_empty(testpath, ['foo']) assert not delete_if_empty(testpath, ['foo'])
assert testpath.exists() assert testpath.exists()
def test_delete_files_to_delete_only_if_dir_is_empty(self, tmpdir): def test_delete_files_to_delete_only_if_dir_is_empty(self, tmpdir):
testpath = Path(str(tmpdir)) testpath = Path(str(tmpdir))
testpath['foo'].open('w') testpath['foo'].open('w')
@ -252,25 +258,25 @@ class TestCase_delete_if_empty:
assert not delete_if_empty(testpath, ['foo']) assert not delete_if_empty(testpath, ['foo'])
assert testpath.exists() assert testpath.exists()
assert testpath['foo'].exists() assert testpath['foo'].exists()
def test_doesnt_exist(self): def test_doesnt_exist(self):
# When the 'path' doesn't exist, just do nothing. # When the 'path' doesn't exist, just do nothing.
delete_if_empty(Path('does_not_exist')) # no crash delete_if_empty(Path('does_not_exist')) # no crash
def test_is_file(self, tmpdir): def test_is_file(self, tmpdir):
# When 'path' is a file, do nothing. # When 'path' is a file, do nothing.
p = Path(str(tmpdir)) + 'filename' p = Path(str(tmpdir)) + 'filename'
p.open('w').close() p.open('w').close()
delete_if_empty(p) # no crash delete_if_empty(p) # no crash
def test_ioerror(self, tmpdir, monkeypatch): def test_ioerror(self, tmpdir, monkeypatch):
# if an IO error happens during the operation, ignore it. # if an IO error happens during the operation, ignore it.
def do_raise(*args, **kw): def do_raise(*args, **kw):
raise OSError() raise OSError()
monkeypatch.setattr(Path, 'rmdir', do_raise) monkeypatch.setattr(Path, 'rmdir', do_raise)
delete_if_empty(Path(str(tmpdir))) # no crash delete_if_empty(Path(str(tmpdir))) # no crash
class TestCase_open_if_filename: class TestCase_open_if_filename:
def test_file_name(self, tmpdir): def test_file_name(self, tmpdir):
@ -280,7 +286,7 @@ class TestCase_open_if_filename:
assert close assert close
eq_(b'test_data', file.read()) eq_(b'test_data', file.read())
file.close() file.close()
def test_opened_file(self): def test_opened_file(self):
sio = StringIO() sio = StringIO()
sio.write('test_data') sio.write('test_data')
@ -288,14 +294,14 @@ class TestCase_open_if_filename:
file, close = open_if_filename(sio) file, close = open_if_filename(sio)
assert not close assert not close
eq_('test_data', file.read()) eq_('test_data', file.read())
def test_mode_is_passed_to_open(self, tmpdir): def test_mode_is_passed_to_open(self, tmpdir):
filepath = str(tmpdir.join('test.txt')) filepath = str(tmpdir.join('test.txt'))
open(filepath, 'w').close() open(filepath, 'w').close()
file, close = open_if_filename(filepath, 'a') file, close = open_if_filename(filepath, 'a')
eq_('a', file.mode) eq_('a', file.mode)
file.close() file.close()
class TestCase_FileOrPath: class TestCase_FileOrPath:
def test_path(self, tmpdir): def test_path(self, tmpdir):
@ -303,17 +309,17 @@ class TestCase_FileOrPath:
open(filepath, 'wb').write(b'test_data') open(filepath, 'wb').write(b'test_data')
with FileOrPath(filepath) as fp: with FileOrPath(filepath) as fp:
eq_(b'test_data', fp.read()) eq_(b'test_data', fp.read())
def test_opened_file(self): def test_opened_file(self):
sio = StringIO() sio = StringIO()
sio.write('test_data') sio.write('test_data')
sio.seek(0) sio.seek(0)
with FileOrPath(sio) as fp: with FileOrPath(sio) as fp:
eq_('test_data', fp.read()) eq_('test_data', fp.read())
def test_mode_is_passed_to_open(self, tmpdir): def test_mode_is_passed_to_open(self, tmpdir):
filepath = str(tmpdir.join('test.txt')) filepath = str(tmpdir.join('test.txt'))
open(filepath, 'w').close() open(filepath, 'w').close()
with FileOrPath(filepath, 'a') as fp: with FileOrPath(filepath, 'a') as fp:
eq_('a', fp.mode) eq_('a', fp.mode)

View File

@ -117,23 +117,20 @@ def trailiter(iterable, skipfirst=False):
yield prev, item yield prev, item
prev = item prev = item
def iterconsume(seq): def iterconsume(seq, reverse=True):
"""Iterate over ``seq`` and discard yielded objects. """Iterate over ``seq`` and pop yielded objects.
Right after the ``yield``, we replace the element we've just yielded by ``None`` in the Because we use the ``pop()`` method, we reverse ``seq`` before proceeding. If you don't need
sequence. to do that, set ``reverse`` to ``False``.
This is useful in tight memory situations where you are looping over a sequence of objects that This is useful in tight memory situations where you are looping over a sequence of objects that
are going to be discarded afterwards. If you're creating other objects during that iteration are going to be discarded afterwards. If you're creating other objects during that iteration
you might want to use this to avoid ``MemoryError``. you might want to use this to avoid ``MemoryError``.
Note that this only works for sequences (index accessible), not all iterables.
""" """
# We don't use ``del``, because it would be disastrous performance-wise as the array would have if reverse:
# to be constantly re-allocated. seq.reverse()
for index, elem in enumerate(seq): while seq:
seq[index] = None yield seq.pop()
yield elem
#--- String related #--- String related