Add preference to ignore large files, close #430

2026-03-08 10:11:38 +00:00 · 2021-08-27 05:35:54 -05:00
parent 809116c764
commit 3045361243
5 changed files with 87 additions and 4 deletions
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -77,15 +77,22 @@ class Scanner:
        self.discarded_file_count = 0

    def _getmatches(self, files, j):
-        if self.size_threshold or self.scan_type in {
-            ScanType.CONTENTS,
-            ScanType.FOLDERS,
-        }:
+        if (
+            self.size_threshold
+            or self.large_size_threshold
+            or self.scan_type
+            in {
+                ScanType.CONTENTS,
+                ScanType.FOLDERS,
+            }
+        ):
            j = j.start_subjob([2, 8])
            for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
                f.size  # pre-read, makes a smoother progress if read here (especially for bundles)
            if self.size_threshold:
                files = [f for f in files if f.size >= self.size_threshold]
+            if self.large_size_threshold:
+                files = [f for f in files if f.size <= self.large_size_threshold]
        if self.scan_type in {ScanType.CONTENTS, ScanType.FOLDERS}:
            return engine.getmatches_by_contents(files, bigsize=self.big_file_size_threshold, j=j)
        else:
@@ -202,5 +209,6 @@ class Scanner:
    scan_type = ScanType.FILENAME
    scanned_tags = {"artist", "title"}
    size_threshold = 0
+    large_size_threshold = 0
    big_file_size_threshold = 0
    word_weighting = False
--- a/core/tests/scanner_test.py
+++ b/core/tests/scanner_test.py
@@ -56,6 +56,8 @@ def test_default_settings(fake_fileexists):
    eq_(s.mix_file_kind, True)
    eq_(s.word_weighting, False)
    eq_(s.match_similar_words, False)
+    eq_(s.size_threshold, 0)
+    eq_(s.large_size_threshold, 0)
    eq_(s.big_file_size_threshold, 0)


@@ -142,6 +144,50 @@ def test_content_scan_compare_sizes_first(fake_fileexists):
    eq_(len(s.get_dupe_groups(f)), 0)


+def test_ignore_file_size(fake_fileexists):
+    s = Scanner()
+    s.scan_type = ScanType.CONTENTS
+    small_size = 10  # 10 bytes
+    s.size_threshold = 0
+    large_size = 100 * 1024 * 1024  # 100MB
+    s.large_size_threshold = 0
+    f = [
+        no("smallignore1", small_size - 1),
+        no("smallignore2", small_size - 1),
+        no("small1", small_size),
+        no("small2", small_size),
+        no("large1", large_size),
+        no("large2", large_size),
+        no("largeignore1", large_size + 1),
+        no("largeignore2", large_size + 1),
+    ]
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "smallignore"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "smallignore"
+    f[2].md5 = f[2].md5partial = f[2].md5samples = "small"
+    f[3].md5 = f[3].md5partial = f[3].md5samples = "small"
+    f[4].md5 = f[4].md5partial = f[4].md5samples = "large"
+    f[5].md5 = f[5].md5partial = f[5].md5samples = "large"
+    f[6].md5 = f[6].md5partial = f[6].md5samples = "largeignore"
+    f[7].md5 = f[7].md5partial = f[7].md5samples = "largeignore"
+
+    r = s.get_dupe_groups(f)
+    # No ignores
+    eq_(len(r), 4)
+    # Ignore smaller
+    s.size_threshold = small_size
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 3)
+    # Ignore larger
+    s.size_threshold = 0
+    s.large_size_threshold = large_size
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 3)
+    # Ignore both
+    s.size_threshold = small_size
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 2)
+
+
 def test_big_file_partial_hashes(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.CONTENTS