Merge pull request #908 from glubsy/hash_sample_optimization

Hash sample optimization
Andrew Senetar 2021-08-13 23:41:17 -05:00 committed by GitHub
commit e11f996dfc
10 changed files with 242 additions and 48 deletions

View File

@ -283,9 +283,11 @@ def getmatches(
return result
def getmatches_by_contents(files, j=job.nulljob):
def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
"""Returns a list of :class:`Match` within ``files`` if their contents is the same.
:param bigsize: The size in bytes over which a file is considered big enough to
justify hashing only samples of it. If 0, the full md5 is computed as usual.
:param j: A :ref:`job progress instance <jobs>`.
"""
size2files = defaultdict(set)
@ -302,8 +304,12 @@ def getmatches_by_contents(files, j=job.nulljob):
if first.is_ref and second.is_ref:
continue # Don't spend time comparing two ref pics together.
if first.md5partial == second.md5partial:
if first.md5 == second.md5:
result.append(Match(first, second, 100))
if bigsize > 0 and first.size > bigsize:
if first.md5samples == second.md5samples:
result.append(Match(first, second, 100))
else:
if first.md5 == second.md5:
result.append(Match(first, second, 100))
j.add_progress(desc=tr("%d matches found") % len(result))
return result
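A minimal sketch of how a caller drives the new parameter — the my_files list and the 100 MiB value are illustrative, assuming dupeGuru's package layout rather than anything added by this diff:

from core import engine
from hscommon import job

BIG_FILE_THRESHOLD = 100 * 1024 * 1024  # 100 MiB

# Files larger than the threshold are matched on md5samples; everything else,
# or every file when bigsize is 0, is matched on the full md5 as before.
matches = engine.getmatches_by_contents(my_files, bigsize=BIG_FILE_THRESHOLD, j=job.nulljob)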

View File

@ -12,6 +12,7 @@
# and I'm doing it now.
import hashlib
from math import floor
import logging
from hscommon.util import nonone, get_file_ext
@ -30,6 +31,14 @@ __all__ = [
NOT_SET = object()
# The goal here is to not run out of memory on really big files. However, the chunk
# size has to be large enough so that the python loop isn't too costly in terms of
# CPU.
CHUNK_SIZE = 1024 * 1024 # 1 MiB
# Minimum size below which partial hashes don't need to be computed
MIN_FILE_SIZE = 3 * CHUNK_SIZE # 3 MiB, because we take 3 samples
class FSError(Exception):
cls_message = "An error has occurred on '{name}' in '{parent}'"
@ -76,8 +85,9 @@ class File:
INITIAL_INFO = {
"size": 0,
"mtime": 0,
"md5": "",
"md5partial": "",
"md5": b"",
"md5partial": b"",
"md5samples": b""
}
# Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
# files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
@ -112,37 +122,62 @@ class File:
return (0x4000, 0x4000) # 16 KiB
def _read_info(self, field):
# print(f"_read_info({field}) for {self}")
if field in ("size", "mtime"):
stats = self.path.stat()
self.size = nonone(stats.st_size, 0)
self.mtime = nonone(stats.st_mtime, 0)
elif field == "md5partial":
try:
fp = self.path.open("rb")
offset, size = self._get_md5partial_offset_and_size()
fp.seek(offset)
partialdata = fp.read(size)
md5 = hashlib.md5(partialdata)
self.md5partial = md5.digest()
fp.close()
with self.path.open("rb") as fp:
offset, size = self._get_md5partial_offset_and_size()
fp.seek(offset)
partialdata = fp.read(size)
md5 = hashlib.md5(partialdata)
self.md5partial = md5.digest()
except Exception:
pass
elif field == "md5":
try:
fp = self.path.open("rb")
md5 = hashlib.md5()
# The goal here is to not run out of memory on really big files. However, the chunk
# size has to be large enough so that the python loop isn't too costly in terms of
# CPU.
CHUNK_SIZE = 1024 * 1024 # 1 mb
filedata = fp.read(CHUNK_SIZE)
while filedata:
md5.update(filedata)
with self.path.open("rb") as fp:
md5 = hashlib.md5()
filedata = fp.read(CHUNK_SIZE)
self.md5 = md5.digest()
fp.close()
while filedata:
md5.update(filedata)
filedata = fp.read(CHUNK_SIZE)
# FIXME For python 3.8 and later
# while filedata := fp.read(CHUNK_SIZE):
# md5.update(filedata)
self.md5 = md5.digest()
except Exception:
pass
elif field == "md5samples":
try:
with self.path.open("rb") as fp:
size = self.size
# Might as well hash such small files entirely.
if size <= MIN_FILE_SIZE:
setattr(self, field, self.md5)
return
# Chunk at 25% of the file
fp.seek(floor(size * 25 / 100), 0)
filedata = fp.read(CHUNK_SIZE)
md5 = hashlib.md5(filedata)
# Chunk at 60% of the file
fp.seek(floor(size * 60 / 100), 0)
filedata = fp.read(CHUNK_SIZE)
md5.update(filedata)
# Last chunk of the file
fp.seek(-CHUNK_SIZE, 2)
filedata = fp.read(CHUNK_SIZE)
md5.update(filedata)
setattr(self, field, md5.digest())
except Exception as e:
logging.error(f"Error computing md5samples: {e}")
def _read_all_info(self, attrnames=None):
"""Cache all possible info.
@ -197,7 +232,7 @@ class File:
class Folder(File):
"""A wrapper around a folder path.
It has the size/md5 info of a File, but it's value are the sum of its subitems.
It has the size/md5 info of a File, but its value is the sum of its subitems.
"""
__slots__ = File.__slots__ + ("_subfolders",)
@ -212,15 +247,17 @@ class Folder(File):
return folders + files
def _read_info(self, field):
# print(f"_read_info({field}) for Folder {self}")
if field in {"size", "mtime"}:
size = sum((f.size for f in self._all_items()), 0)
self.size = size
stats = self.path.stat()
self.mtime = nonone(stats.st_mtime, 0)
elif field in {"md5", "md5partial"}:
elif field in {"md5", "md5partial", "md5samples"}:
# What's sensitive here is that we must make sure that subfiles'
# md5 are always added up in the same order, but we also want a
# different md5 if a file gets moved in a different subdirectory.
def get_dir_md5_concat():
items = self._all_items()
items.sort(key=lambda f: f.path)
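As a self-contained sketch of the sampling scheme above — the md5_samples helper and the bare open() call are ours for illustration; in the diff this logic lives in File._read_info, which reuses the already-cached full md5 for small files:

import hashlib
from math import floor

CHUNK_SIZE = 1024 * 1024  # 1 MiB, as in the diff
MIN_FILE_SIZE = 3 * CHUNK_SIZE  # sampling 3 chunks of a smaller file gains nothing

def md5_samples(path, size):
    if size <= MIN_FILE_SIZE:
        # Small file: might as well hash it entirely.
        with open(path, "rb") as fp:
            return hashlib.md5(fp.read()).digest()
    with open(path, "rb") as fp:
        md5 = hashlib.md5()
        for offset in (floor(size * 25 / 100), floor(size * 60 / 100)):
            fp.seek(offset, 0)  # chunks at 25% and 60% of the file
            md5.update(fp.read(CHUNK_SIZE))
        fp.seek(-CHUNK_SIZE, 2)  # last chunk of the file
        md5.update(fp.read(CHUNK_SIZE))
        return md5.digest()

Two big files can only collide on md5samples if all three sampled regions match, and the check reads at most 3 MiB per file regardless of file size.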

View File

@ -87,7 +87,11 @@ class Scanner:
if self.size_threshold:
files = [f for f in files if f.size >= self.size_threshold]
if self.scan_type in {ScanType.Contents, ScanType.Folders}:
return engine.getmatches_by_contents(files, j=j)
return engine.getmatches_by_contents(
files,
bigsize=self.big_file_size_threshold,
j=j
)
else:
j = j.start_subjob([2, 8])
kw = {}
@ -218,4 +222,5 @@ class Scanner:
scan_type = ScanType.Filename
scanned_tags = {"artist", "title"}
size_threshold = 0
big_file_size_threshold = 0
word_weighting = False
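End to end, the feature is driven from the scanner. A short usage sketch, with my_files standing in for the files collected from the selected directories:

from core.scanner import Scanner, ScanType

s = Scanner()
s.scan_type = ScanType.Contents
s.big_file_size_threshold = 100 * 1024 * 1024  # 0 disables sampling entirely
groups = s.get_dupe_groups(my_files)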

View File

@ -88,6 +88,7 @@ class NamedObject:
self.size = size
self.md5partial = name
self.md5 = name
self.md5samples = name
if with_words:
self.words = getwords(name)
self.is_ref = False

View File

@ -551,6 +551,28 @@ class TestCaseGetMatchesByContents:
o1, o2 = no(size=0), no(size=0)
assert not getmatches_by_contents([o1, o2])
def test_big_file_partial_hashes(self):
smallsize = 1
bigsize = 100 * 1024 * 1024 # 100 MiB
f = [no("bigfoo", size=bigsize), no("bigbar", size=bigsize),
no("smallfoo", size=smallsize), no("smallbar", size=smallsize)]
f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
f[2].md5 = f[2].md5partial = "bleh"
f[3].md5 = f[3].md5partial = "bleh"
r = getmatches_by_contents(f, bigsize=bigsize)
eq_(len(r), 2)
# User disabled optimization for big files, compute hashes as usual
r = getmatches_by_contents(f, bigsize=0)
eq_(len(r), 2)
# Other file is now slightly different, md5partial is still the same
f[1].md5 = f[1].md5samples = "foobardiff"
r = getmatches_by_contents(f, bigsize=bigsize)
# The md5samples comparison filters it out
eq_(len(r), 1)
r = getmatches_by_contents(f, bigsize=0)
eq_(len(r), 1)
class TestCaseGroup:
def test_empy(self):

View File

@ -7,6 +7,7 @@
# http://www.gnu.org/licenses/gpl-3.0.html
import hashlib
from os import urandom
from hscommon.path import Path
from hscommon.testutil import eq_
@ -15,6 +16,36 @@ from core.tests.directories_test import create_fake_fs
from .. import fs
def create_fake_fs_with_random_data(rootpath):
rootpath = rootpath["fs"]
rootpath.mkdir()
rootpath["dir1"].mkdir()
rootpath["dir2"].mkdir()
rootpath["dir3"].mkdir()
fp = rootpath["file1.test"].open("wb")
data1 = urandom(200 * 1024) # 200KiB
data2 = urandom(1024 * 1024) # 1MiB
data3 = urandom(10 * 1024 * 1024) # 10MiB
fp.write(data1)
fp.close()
fp = rootpath["file2.test"].open("wb")
fp.write(data2)
fp.close()
fp = rootpath["file3.test"].open("wb")
fp.write(data3)
fp.close()
fp = rootpath["dir1"]["file1.test"].open("wb")
fp.write(data1)
fp.close()
fp = rootpath["dir2"]["file2.test"].open("wb")
fp.write(data2)
fp.close()
fp = rootpath["dir3"]["file3.test"].open("wb")
fp.write(data3)
fp.close()
return rootpath
def test_size_aggregates_subfiles(tmpdir):
p = create_fake_fs(Path(str(tmpdir)))
b = fs.Folder(p)
@ -25,7 +56,7 @@ def test_md5_aggregate_subfiles_sorted(tmpdir):
# dir.allfiles can return children in any order. Thus, bundle.md5 must aggregate
# all files' md5 it contains, but it must make sure that it does so in the
# same order every time.
p = create_fake_fs(Path(str(tmpdir)))
p = create_fake_fs_with_random_data(Path(str(tmpdir)))
b = fs.Folder(p)
md51 = fs.File(p["dir1"]["file1.test"]).md5
md52 = fs.File(p["dir2"]["file2.test"]).md5
@ -41,6 +72,36 @@ def test_md5_aggregate_subfiles_sorted(tmpdir):
eq_(b.md5, md5.digest())
def test_partial_md5_aggregate_subfile_sorted(tmpdir):
p = create_fake_fs_with_random_data(Path(str(tmpdir)))
b = fs.Folder(p)
md51 = fs.File(p["dir1"]["file1.test"]).md5partial
md52 = fs.File(p["dir2"]["file2.test"]).md5partial
md53 = fs.File(p["dir3"]["file3.test"]).md5partial
md54 = fs.File(p["file1.test"]).md5partial
md55 = fs.File(p["file2.test"]).md5partial
md56 = fs.File(p["file3.test"]).md5partial
# The expected md5 is the md5 of md5s for folders and the direct md5 for files
folder_md51 = hashlib.md5(md51).digest()
folder_md52 = hashlib.md5(md52).digest()
folder_md53 = hashlib.md5(md53).digest()
md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
eq_(b.md5partial, md5.digest())
md51 = fs.File(p["dir1"]["file1.test"]).md5samples
md52 = fs.File(p["dir2"]["file2.test"]).md5samples
md53 = fs.File(p["dir3"]["file3.test"]).md5samples
md54 = fs.File(p["file1.test"]).md5samples
md55 = fs.File(p["file2.test"]).md5samples
md56 = fs.File(p["file3.test"]).md5samples
# The expected md5 is the md5 of md5s for folders and the direct md5 for files
folder_md51 = hashlib.md5(md51).digest()
folder_md52 = hashlib.md5(md52).digest()
folder_md53 = hashlib.md5(md53).digest()
md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
eq_(b.md5samples, md5.digest())
def test_has_file_attrs(tmpdir):
# a Folder must behave like a file, so it must have mtime attributes
b = fs.Folder(Path(str(tmpdir)))

View File

@ -56,6 +56,7 @@ def test_default_settings(fake_fileexists):
eq_(s.mix_file_kind, True)
eq_(s.word_weighting, False)
eq_(s.match_similar_words, False)
eq_(s.big_file_size_threshold, 0)
def test_simple_with_default_settings(fake_fileexists):
@ -120,9 +121,9 @@ def test_content_scan(fake_fileexists):
s = Scanner()
s.scan_type = ScanType.Contents
f = [no("foo"), no("bar"), no("bleh")]
f[0].md5 = f[0].md5partial = "foobar"
f[1].md5 = f[1].md5partial = "foobar"
f[2].md5 = f[2].md5partial = "bleh"
f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
f[2].md5 = f[2].md5partial = f[2].md5samples = "bleh"
r = s.get_dupe_groups(f)
eq_(len(r), 1)
eq_(len(r[0]), 2)
@ -141,13 +142,43 @@ def test_content_scan_compare_sizes_first(fake_fileexists):
eq_(len(s.get_dupe_groups(f)), 0)
def test_big_file_partial_hashes(fake_fileexists):
s = Scanner()
s.scan_type = ScanType.Contents
smallsize = 1
bigsize = 100 * 1024 * 1024 # 100 MiB
s.big_file_size_threshold = bigsize
f = [no("bigfoo", bigsize), no("bigbar", bigsize),
no("smallfoo", smallsize), no("smallbar", smallsize)]
f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
f[2].md5 = f[2].md5partial = "bleh"
f[3].md5 = f[3].md5partial = "bleh"
r = s.get_dupe_groups(f)
eq_(len(r), 2)
# md5partial is still the same, but the file is actually different
f[1].md5 = f[1].md5samples = "difffoobar"
# here we compare the full md5s, as the user disabled the optimization
s.big_file_size_threshold = 0
r = s.get_dupe_groups(f)
eq_(len(r), 1)
# here we should compare the md5samples, and see they are different
s.big_file_size_threshold = bigsize
r = s.get_dupe_groups(f)
eq_(len(r), 1)
def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
s = Scanner()
s.scan_type = ScanType.Contents
f = [no("foo"), no("bar"), no("bleh")]
f[0].md5 = f[0].md5partial = "foobar"
f[1].md5 = f[1].md5partial = "foobar"
f[2].md5 = f[2].md5partial = "bleh"
f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
f[2].md5 = f[2].md5partial = f[2].md5samples = "bleh"
s.min_match_percentage = 101
r = s.get_dupe_groups(f)
eq_(len(r), 1)
@ -162,13 +193,12 @@ def test_content_scan_doesnt_put_md5_in_words_at_the_end(fake_fileexists):
s = Scanner()
s.scan_type = ScanType.Contents
f = [no("foo"), no("bar")]
f[0].md5 = f[
0
].md5partial = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
f[1].md5 = f[
1
].md5partial = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
f[0].md5 = f[0].md5partial = f[0].md5samples =\
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
f[1].md5 = f[1].md5partial = f[1].md5samples =\
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
r = s.get_dupe_groups(f)
# FIXME looks like we are missing something here?
r[0]
@ -514,21 +544,21 @@ def test_folder_scan_exclude_subfolder_matches(fake_fileexists):
s = Scanner()
s.scan_type = ScanType.Folders
topf1 = no("top folder 1", size=42)
topf1.md5 = topf1.md5partial = b"some_md5_1"
topf1.md5 = topf1.md5partial = topf1.md5samples = b"some_md5_1"
topf1.path = Path("/topf1")
topf2 = no("top folder 2", size=42)
topf2.md5 = topf2.md5partial = b"some_md5_1"
topf2.md5 = topf2.md5partial = topf2.md5samples = b"some_md5_1"
topf2.path = Path("/topf2")
subf1 = no("sub folder 1", size=41)
subf1.md5 = subf1.md5partial = b"some_md5_2"
subf1.md5 = subf1.md5partial = subf1.md5samples = b"some_md5_2"
subf1.path = Path("/topf1/sub")
subf2 = no("sub folder 2", size=41)
subf2.md5 = subf2.md5partial = b"some_md5_2"
subf2.md5 = subf2.md5partial = subf2.md5samples = b"some_md5_2"
subf2.path = Path("/topf2/sub")
eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2])), 1) # only top folders
# however, if another folder matches a subfolder, keep it in the matches
otherf = no("other folder", size=41)
otherf.md5 = otherf.md5partial = b"some_md5_2"
otherf.md5 = otherf.md5partial = otherf.md5samples = b"some_md5_2"
otherf.path = Path("/otherfolder")
eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2, otherf])), 2)
@ -551,9 +581,9 @@ def test_dont_count_ref_files_as_discarded(fake_fileexists):
o1 = no("foo", path="p1")
o2 = no("foo", path="p2")
o3 = no("foo", path="p3")
o1.md5 = o1.md5partial = "foobar"
o2.md5 = o2.md5partial = "foobar"
o3.md5 = o3.md5partial = "foobar"
o1.md5 = o1.md5partial = o1.md5samples = "foobar"
o2.md5 = o2.md5partial = o2.md5samples = "foobar"
o3.md5 = o3.md5partial = o3.md5samples = "foobar"
o1.is_ref = True
o2.is_ref = True
eq_(len(s.get_dupe_groups([o1, o2, o3])), 1)

View File

@ -187,7 +187,14 @@ class DupeGuru(QObject):
)
self.model.options["size_threshold"] = (
threshold * 1024
) # threshold is in KB. the scanner wants bytes
) # threshold is in KB. The scanner wants bytes
big_file_size_threshold = (
self.prefs.big_file_size_threshold if self.prefs.big_file_partial_hashes else 0
)
self.model.options["big_file_size_threshold"] = (
big_file_size_threshold * 1024 * 1024
# threshold is in MiB. The scanner wants bytes
)
scanned_tags = set()
if self.prefs.scan_tag_track:
scanned_tags.add("track")

View File

@ -73,6 +73,8 @@ class Preferences(PreferencesBase):
self.match_similar = get("MatchSimilar", self.match_similar)
self.ignore_small_files = get("IgnoreSmallFiles", self.ignore_small_files)
self.small_file_threshold = get("SmallFileThreshold", self.small_file_threshold)
self.big_file_partial_hashes = get("BigFilePartialHashes", self.big_file_partial_hashes)
self.big_file_size_threshold = get("BigFileSizeThreshold", self.big_file_size_threshold)
self.scan_tag_track = get("ScanTagTrack", self.scan_tag_track)
self.scan_tag_artist = get("ScanTagArtist", self.scan_tag_artist)
self.scan_tag_album = get("ScanTagAlbum", self.scan_tag_album)
@ -117,6 +119,8 @@ class Preferences(PreferencesBase):
self.match_similar = False
self.ignore_small_files = True
self.small_file_threshold = 10 # KB
self.big_file_partial_hashes = False
self.big_file_size_threshold = 100 # MiB
self.scan_tag_track = False
self.scan_tag_artist = True
self.scan_tag_album = True
@ -161,6 +165,8 @@ class Preferences(PreferencesBase):
set_("MatchSimilar", self.match_similar)
set_("IgnoreSmallFiles", self.ignore_small_files)
set_("SmallFileThreshold", self.small_file_threshold)
set_("BigFilePartialHashes", self.big_file_partial_hashes)
set_("BigFileSizeThreshold", self.big_file_size_threshold)
set_("ScanTagTrack", self.scan_tag_track)
set_("ScanTagArtist", self.scan_tag_artist)
set_("ScanTagAlbum", self.scan_tag_album)

View File

@ -72,6 +72,21 @@ class PreferencesDialog(PreferencesDialogBase):
spacerItem1 = QSpacerItem(40, 20, QSizePolicy.Expanding, QSizePolicy.Minimum)
self.horizontalLayout_2.addItem(spacerItem1)
self.verticalLayout_4.addLayout(self.horizontalLayout_2)
self.horizontalLayout_2b = QHBoxLayout()
self._setupAddCheckbox(
"bigFilePartialHashesBox", tr("Partially hash files bigger than"), self.widget
)
self.horizontalLayout_2b.addWidget(self.bigFilePartialHashesBox)
self.bigSizeThresholdEdit = QLineEdit(self.widget)
self.bigSizeThresholdEdit.setSizePolicy(sizePolicy)
self.bigSizeThresholdEdit.setMaximumSize(QSize(75, 16777215))
self.horizontalLayout_2b.addWidget(self.bigSizeThresholdEdit)
self.label_6b = QLabel(self.widget)
self.label_6b.setText(tr("MB"))
self.horizontalLayout_2b.addWidget(self.label_6b)
spacerItem2 = QSpacerItem(40, 20, QSizePolicy.Expanding, QSizePolicy.Minimum)
self.horizontalLayout_2b.addItem(spacerItem2)
self.verticalLayout_4.addLayout(self.horizontalLayout_2b)
self._setupAddCheckbox(
"ignoreHardlinkMatches",
tr("Ignore duplicates hardlinking to the same file"),
@ -90,6 +105,8 @@ class PreferencesDialog(PreferencesDialogBase):
setchecked(self.wordWeightingBox, prefs.word_weighting)
setchecked(self.ignoreSmallFilesBox, prefs.ignore_small_files)
self.sizeThresholdEdit.setText(str(prefs.small_file_threshold))
setchecked(self.bigFilePartialHashesBox, prefs.big_file_partial_hashes)
self.bigSizeThresholdEdit.setText(str(prefs.big_file_size_threshold))
# Update UI state based on selected scan type
scan_type = prefs.get_scan_type(AppMode.Standard)
@ -103,3 +120,5 @@ class PreferencesDialog(PreferencesDialogBase):
prefs.word_weighting = ischecked(self.wordWeightingBox)
prefs.ignore_small_files = ischecked(self.ignoreSmallFilesBox)
prefs.small_file_threshold = tryint(self.sizeThresholdEdit.text())
prefs.big_file_partial_hashes = ischecked(self.bigFilePartialHashesBox)
prefs.big_file_size_threshold = tryint(self.bigSizeThresholdEdit.text())