
Add partial hashes optimization for big files

* Big files above the user-selected threshold can be partially hashed in 3 places.
* If the user is willing to take the risk, we consider files with identical md5samples to be identical.
Author: glubsy
Date:   2021-06-21 19:03:21 +02:00
Parent: 4641bd6ec9
Commit: e07dfd5955

6 changed files with 97 additions and 24 deletions
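
For context, a minimal sketch of what "partially hashed in 3 places" could mean in practice: hash a fixed-size chunk at the start, middle, and end of the file and compare those digests instead of a full-file md5. The helper name md5samples, the chunk size, and the start/middle/end placement are illustrative assumptions here, not dupeguru's actual implementation:

import hashlib
import os

CHUNK_SIZE = 1024 * 1024  # assumed sample size; the real value is an assumption

def md5samples(path):
    # Hypothetical helper: hash one chunk at the start, middle, and end
    # of the file, so two large files can be compared without reading
    # them fully. Not dupeguru's actual implementation.
    size = os.path.getsize(path)
    md5 = hashlib.md5()
    with open(path, "rb") as fp:
        for offset in (0, max(0, size // 2 - CHUNK_SIZE // 2), max(0, size - CHUNK_SIZE)):
            fp.seek(offset)
            md5.update(fp.read(CHUNK_SIZE))
    return md5.digest()

For files smaller than three chunks the sampled regions simply overlap, so the comparison degrades gracefully toward a full-content hash.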


@@ -283,9 +283,10 @@ def getmatches(
     return result


-def getmatches_by_contents(files, j=job.nulljob):
+def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
     """Returns a list of :class:`Match` within ``files`` if their contents is the same.

+    :param bigsize: The size in bytes over which we consider files too big for a full md5.
     :param j: A :ref:`job progress instance <jobs>`.
     """
     size2files = defaultdict(set)
@@ -302,8 +303,13 @@ def getmatches_by_contents(files, j=job.nulljob):
             if first.is_ref and second.is_ref:
                 continue  # Don't spend time comparing two ref pics together.
             if first.md5partial == second.md5partial:
-                if first.md5 == second.md5:
-                    result.append(Match(first, second, 100))
+                if bigsize > 0 and first.size > bigsize:
+                    print(f"first md5chunks {first} {first.md5samples}, second {second} {second.md5samples}")
+                    if first.md5samples == second.md5samples:
+                        result.append(Match(first, second, 100))
+                else:
+                    if first.md5 == second.md5:
+                        result.append(Match(first, second, 100))
             j.add_progress(desc=tr("%d matches found") % len(result))
     return result
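
To make the new matching rule easier to read outside the diff context, here is a self-contained sketch of the decision it implements: files whose cheap partial hash agrees are confirmed either by md5samples (when bigsize is enabled and the file exceeds it) or by a full md5. The contents_matches function and its tuple return type are hypothetical; dupeguru's file objects supply the size/md5 attributes:

from itertools import combinations

def contents_matches(files, bigsize=0):
    # Standalone restatement of the diff's rule. `files` are assumed to
    # expose size, md5partial, md5, and md5samples attributes, as
    # dupeguru's file objects do; the function itself is illustrative.
    result = []
    for first, second in combinations(files, 2):
        if first.size != second.size:
            continue  # the engine pre-groups by size; made explicit here
        if first.md5partial != second.md5partial:
            continue  # cheap partial hash disagrees: not duplicates
        if bigsize > 0 and first.size > bigsize:
            # Big file: trust the sampled hashes instead of a full md5.
            if first.md5samples == second.md5samples:
                result.append((first, second))
        elif first.md5 == second.md5:
            result.append((first, second))
    return result

Note the trade-off the commit message states: for files above the threshold, only the sampled regions are compared, so a collision in the unsampled regions would go undetected; that is the risk the user opts into.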