Add unit tests for hash sample optimization

* Instead of keeping the md5 samples separate, merge them into a single hash computed from the selected chunks.
* We don't need a boolean to track whether the user chose to optimize; we can simply check the threshold value, since 0 means no optimization is currently active.
2026-04-08 22:51:39 +00:00 · 2021-06-21 22:44:05 +02:00
parent e07dfd5955
commit 277bc3fbb8
7 changed files with 152 additions and 43 deletions
--- a/core/tests/scanner_test.py
+++ b/core/tests/scanner_test.py
@@ -56,6 +56,7 @@ def test_default_settings(fake_fileexists):
    eq_(s.mix_file_kind, True)
    eq_(s.word_weighting, False)
    eq_(s.match_similar_words, False)
+    eq_(s.big_file_size_threshold, 0)


 def test_simple_with_default_settings(fake_fileexists):
@@ -120,9 +121,9 @@ def test_content_scan(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.Contents
    f = [no("foo"), no("bar"), no("bleh")]
-    f[0].md5 = f[0].md5partial = "foobar"
-    f[1].md5 = f[1].md5partial = "foobar"
-    f[2].md5 = f[2].md5partial = "bleh"
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+    f[2].md5 = f[2].md5partial = f[2].md5samples = "bleh"  # f[2] is the odd one out
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    eq_(len(r[0]), 2)
@@ -141,13 +142,43 @@ def test_content_scan_compare_sizes_first(fake_fileexists):
    eq_(len(s.get_dupe_groups(f)), 0)


+def test_big_file_partial_hashes(fake_fileexists):
+    s = Scanner()
+    s.scan_type = ScanType.Contents
+
+    smallsize = 1
+    bigsize = 100 * 1024 * 1024  # 100 MiB; also used as the threshold value
+    s.big_file_size_threshold = bigsize
+
+    f = [no("bigfoo", bigsize), no("bigbar", bigsize),
+         no("smallfoo", smallsize), no("smallbar", smallsize)]
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+    f[2].md5 = f[2].md5partial = "bleh"  # no md5samples set on the small files
+    f[3].md5 = f[3].md5partial = "bleh"
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 2)  # expected: the big pair and the small pair
+
+    # f[1] keeps its md5partial, but its md5 and md5samples now differ from f[0]
+    f[1].md5 = f[1].md5samples = "difffoobar"
+    # threshold 0 disables the optimization, so the full md5s get compared
+    s.big_file_size_threshold = 0
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 1)  # expected: only the small pair still matches
+
+    # optimization re-enabled: md5samples should be compared and found different
+    s.big_file_size_threshold = bigsize
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 1)  # expected: still only the small pair
+
+
 def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.Contents
    f = [no("foo"), no("bar"), no("bleh")]
-    f[0].md5 = f[0].md5partial = "foobar"
-    f[1].md5 = f[1].md5partial = "foobar"
-    f[2].md5 = f[2].md5partial = "bleh"
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+    f[2].md5 = f[2].md5partial = f[2].md5samples = "bleh"
    s.min_match_percentage = 101
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
@@ -162,13 +193,12 @@ def test_content_scan_doesnt_put_md5_in_words_at_the_end(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.Contents
    f = [no("foo"), no("bar")]
-    f[0].md5 = f[
-        0
-    ].md5partial = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
-    f[1].md5 = f[
-        1
-    ].md5partial = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    f[0].md5 = f[0].md5partial = f[0].md5samples =\
+        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    f[1].md5 = f[1].md5partial = f[1].md5samples =\
+        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
    r = s.get_dupe_groups(f)
+    # FIXME looks like we are missing something here?
    r[0]


@@ -514,21 +544,21 @@ def test_folder_scan_exclude_subfolder_matches(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.Folders
    topf1 = no("top folder 1", size=42)
-    topf1.md5 = topf1.md5partial = b"some_md5_1"
+    topf1.md5 = topf1.md5partial = topf1.md5samples = b"some_md5_1"
    topf1.path = Path("/topf1")
    topf2 = no("top folder 2", size=42)
-    topf2.md5 = topf2.md5partial = b"some_md5_1"
+    topf2.md5 = topf2.md5partial = topf2.md5samples = b"some_md5_1"
    topf2.path = Path("/topf2")
    subf1 = no("sub folder 1", size=41)
-    subf1.md5 = subf1.md5partial = b"some_md5_2"
+    subf1.md5 = subf1.md5partial = subf1.md5samples = b"some_md5_2"
    subf1.path = Path("/topf1/sub")
    subf2 = no("sub folder 2", size=41)
-    subf2.md5 = subf2.md5partial = b"some_md5_2"
+    subf2.md5 = subf2.md5partial = subf2.md5samples = b"some_md5_2"
    subf2.path = Path("/topf2/sub")
    eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2])), 1)  # only top folders
    # however, if another folder matches a subfolder, keep in in the matches
    otherf = no("other folder", size=41)
-    otherf.md5 = otherf.md5partial = b"some_md5_2"
+    otherf.md5 = otherf.md5partial = otherf.md5samples = b"some_md5_2"
    otherf.path = Path("/otherfolder")
    eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2, otherf])), 2)

@@ -551,9 +581,9 @@ def test_dont_count_ref_files_as_discarded(fake_fileexists):
    o1 = no("foo", path="p1")
    o2 = no("foo", path="p2")
    o3 = no("foo", path="p3")
-    o1.md5 = o1.md5partial = "foobar"
-    o2.md5 = o2.md5partial = "foobar"
-    o3.md5 = o3.md5partial = "foobar"
+    o1.md5 = o1.md5partial = o1.md5samples = "foobar"
+    o2.md5 = o2.md5partial = o2.md5samples = "foobar"
+    o3.md5 = o3.md5partial = o3.md5samples = "foobar"
    o1.is_ref = True
    o2.is_ref = True
    eq_(len(s.get_dupe_groups([o1, o2, o3])), 1)