
Add unit tests for hash sample optimization

* Instead of keeping md5 samples separate, merge them into one hash computed from the selected chunks (sketched below).
* We don't need a separate boolean to track whether the user enabled the optimization; comparing against the threshold value is enough, since a threshold of 0 means the optimization is disabled.
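
A minimal sketch of the idea (not dupeguru's actual implementation): hash a few fixed-size chunks spread across a big file into a single md5 digest, and treat a threshold of 0 as "optimization disabled" so no extra flag is needed. The names CHUNK_SIZE, SAMPLE_COUNT, and md5_samples are illustrative, not from the codebase.

import hashlib

CHUNK_SIZE = 1024 * 1024  # size of each sampled chunk (illustrative value)
SAMPLE_COUNT = 3          # number of chunks to sample (illustrative value)

def md5_samples(path, size, threshold):
    # A threshold of 0 stands in for "optimization disabled", so no
    # separate boolean is needed; files at or below the threshold are
    # also hashed in full. Assumes threshold >= CHUNK_SIZE when sampling.
    if threshold == 0 or size <= threshold:
        with open(path, "rb") as fp:
            return hashlib.md5(fp.read()).digest()
    # Merge several chunks, spread evenly across the file, into one
    # digest instead of keeping a separate hash per sample.
    digest = hashlib.md5()
    with open(path, "rb") as fp:
        for i in range(SAMPLE_COUNT):
            fp.seek(i * (size - CHUNK_SIZE) // (SAMPLE_COUNT - 1))
            digest.update(fp.read(CHUNK_SIZE))
    return digest.digest()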
glubsy
2021-06-21 22:44:05 +02:00
parent e07dfd5955
commit 277bc3fbb8
7 changed files with 152 additions and 43 deletions


@@ -551,6 +551,28 @@ class TestCaseGetMatchesByContents:
         o1, o2 = no(size=0), no(size=0)
         assert not getmatches_by_contents([o1, o2])
 
+    def test_big_file_partial_hashes(self):
+        smallsize = 1
+        bigsize = 100 * 1024 * 1024  # 100MB
+        f = [no("bigfoo", size=bigsize), no("bigbar", size=bigsize),
+             no("smallfoo", size=smallsize), no("smallbar", size=smallsize)]
+        f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+        f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+        f[2].md5 = f[2].md5partial = "bleh"
+        f[3].md5 = f[3].md5partial = "bleh"
+
+        r = getmatches_by_contents(f, bigsize=bigsize)
+        eq_(len(r), 2)
+        # User disabled optimization for big files: compute hashes as usual
+        r = getmatches_by_contents(f, bigsize=0)
+        eq_(len(r), 2)
+        # The other big file is now slightly different; md5partial is still the same
+        f[1].md5 = f[1].md5samples = "foobardiff"
+        r = getmatches_by_contents(f, bigsize=bigsize)
+        # The sample hashes successfully filter it out
+        eq_(len(r), 1)
+        r = getmatches_by_contents(f, bigsize=0)
+        eq_(len(r), 1)
 
 class TestCaseGroup:
     def test_empy(self):
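
For reference, the no(...) helper above is a lightweight stub standing in for a real file object. Something along these lines would satisfy the test, assuming getmatches_by_contents only reads the name, size, and the three hash attributes; this stub is illustrative, not the actual fixture from the test suite.

class NamedStub:
    # Bare object exposing just the attributes the content matcher reads;
    # the hash fields start empty and are overwritten by the test.
    def __init__(self, name="foobar", size=1):
        self.name = name
        self.size = size
        self.md5 = self.md5partial = self.md5samples = ""

With bigsize > 0, files larger than bigsize are compared through md5samples; with bigsize=0 the full md5 is used instead. That is why changing both md5 and md5samples on one big file drops the match count to 1 in both modes.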