
Add partial hashes optimization for big files

* Big files above a user-selected size threshold can be partially hashed in three places instead of fully hashed.
* If the user is willing to take that risk, files with identical md5samples are considered identical.
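
For illustration, a minimal sketch of the sampled-hash idea, assuming the three places are the start, middle, and end of the file and a fixed sample size (both assumptions; only the md5samples name comes from the commit message):

import hashlib

CHUNK_SIZE = 1024 * 1024  # assumed sample size, for illustration only

def md5samples(path, size):
    # Hash three fixed-size samples of a big file instead of its whole
    # contents: one at the start, one in the middle, one at the end.
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for offset in (0, size // 2, max(size - CHUNK_SIZE, 0)):
            f.seek(offset)
            digest.update(f.read(CHUNK_SIZE))
    return digest.digest()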
Author: glubsy
Date: 2021-06-21 19:03:21 +02:00
Parent: 4641bd6ec9
Commit: e07dfd5955
6 changed files with 97 additions and 24 deletions


@@ -87,7 +87,11 @@ class Scanner:
         if self.size_threshold:
             files = [f for f in files if f.size >= self.size_threshold]
         if self.scan_type in {ScanType.Contents, ScanType.Folders}:
-            return engine.getmatches_by_contents(files, j=j)
+            return engine.getmatches_by_contents(
+                files,
+                bigsize=self.big_file_size_threshold if self.big_file_partial_hashes else 0,
+                j=j
+            )
         else:
             j = j.start_subjob([2, 8])
             kw = {}
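
Passing bigsize=0 when partial hashing is disabled keeps the engine on the existing full-hash path. A rough sketch of the per-file branch the engine might take (the md5 and md5samples attribute names here are assumptions beyond what the commit message confirms):

def content_key(f, bigsize):
    # Files larger than the threshold compare by their sampled hash;
    # everything else still compares by the full-content hash.
    if bigsize and f.size > bigsize:
        return f.md5samples
    return f.md5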
@@ -218,4 +222,6 @@ class Scanner:
     scan_type = ScanType.Filename
     scanned_tags = {"artist", "title"}
     size_threshold = 0
+    big_file_partial_hashes = True
+    big_file_size_threshold = 100 * 1024 * 1024
     word_weighting = False
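
The new class attributes default partial hashing to enabled with a threshold of 100 * 1024 * 1024 bytes (100 MiB), which the scanner passes to the engine as bigsize in the hunk above.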