# Copyright 2016 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

import pytest

from hscommon.jobprogress import job
from pathlib import Path
from hscommon.testutil import eq_

from core import fs
from core.engine import getwords, Match
from core.ignore import IgnoreList
from core.scanner import Scanner, ScanType
from core.me.scanner import ScannerME


# TODO update this to be able to inherit from fs.File
class NamedObject:
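    """Minimal stand-in for fs.File used throughout these scanner tests (see TODO above)."""
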
    def __init__(self, name="foobar", size=1, path=None):
        if path is None:
            path = Path(name)
        else:
            path = Path(path, name)
        self.name = name
        self.size = size
        self.path = path
        self.words = getwords(name)

    def __repr__(self):
        return "<NamedObject {!r} {!r}>".format(self.name, self.path)

    def exists(self):
        return self.path.exists()


no = NamedObject


@pytest.fixture
def fake_fileexists(request):
    # This is a hack to avoid invalidating all previous tests since the scanner started to test
    # for file existence before doing the match grouping.
    monkeypatch = request.getfixturevalue("monkeypatch")
    monkeypatch.setattr(Path, "exists", lambda _: True)


def test_empty(fake_fileexists):
    s = Scanner()
    r = s.get_dupe_groups([])
    eq_(r, [])


def test_default_settings(fake_fileexists):
    s = Scanner()
    eq_(s.min_match_percentage, 80)
    eq_(s.scan_type, ScanType.FILENAME)
    eq_(s.mix_file_kind, True)
    eq_(s.word_weighting, False)
    eq_(s.match_similar_words, False)
    eq_(s.size_threshold, 0)
    eq_(s.large_size_threshold, 0)
    eq_(s.big_file_size_threshold, 0)


def test_simple_with_default_settings(fake_fileexists):
    s = Scanner()
    f = [no("foo bar", path="p1"), no("foo bar", path="p2"), no("foo bleh")]
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    g = r[0]
    # 'foo bleh' cannot be in the group because the default min match % is 80
    eq_(len(g), 2)
    assert g.ref in f[:2]
    assert g.dupes[0] in f[:2]


def test_simple_with_lower_min_match(fake_fileexists):
    s = Scanner()
    s.min_match_percentage = 50
    f = [no("foo bar", path="p1"), no("foo bar", path="p2"), no("foo bleh")]
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    g = r[0]
    eq_(len(g), 3)


def test_trim_all_ref_groups(fake_fileexists):
    # When all files of a group are ref, don't include that group in the results, but also don't
    # count the files from that group as discarded.
    s = Scanner()
    f = [
        no("foo", path="p1"),
        no("foo", path="p2"),
        no("bar", path="p1"),
        no("bar", path="p2"),
    ]
    f[2].is_ref = True
    f[3].is_ref = True
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    eq_(s.discarded_file_count, 0)


def test_prioritize(fake_fileexists):
    s = Scanner()
    f = [
        no("foo", path="p1"),
        no("foo", path="p2"),
        no("bar", path="p1"),
        no("bar", path="p2"),
    ]
    f[1].size = 2
    f[2].size = 3
    f[3].is_ref = True
    r = s.get_dupe_groups(f)
    g1, g2 = r
    assert f[1] in (g1.ref, g2.ref)
    assert f[0] in (g1.dupes[0], g2.dupes[0])
    assert f[3] in (g1.ref, g2.ref)
    assert f[2] in (g1.dupes[0], g2.dupes[0])


def test_content_scan(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.CONTENTS
    f = [no("foo"), no("bar"), no("bleh")]
    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
    f[2].digest = f[2].digest_partial = f[2].digest_samples = "bleh"
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    eq_(len(r[0]), 2)
    eq_(s.discarded_file_count, 0)  # don't count the different digest as discarded!


def test_content_scan_compare_sizes_first(fake_fileexists):
    class MyFile(no):
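        # digest must never be requested: files with different sizes are ruled out before hashing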
        @property
        def digest(self):
            raise AssertionError()

    s = Scanner()
    s.scan_type = ScanType.CONTENTS
    f = [MyFile("foo", 1), MyFile("bar", 2)]
    eq_(len(s.get_dupe_groups(f)), 0)


def test_ignore_file_size(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.CONTENTS
    small_size = 10  # 10KB
    s.size_threshold = 0
    large_size = 100 * 1024 * 1024  # 100MB
    s.large_size_threshold = 0
    f = [
        no("smallignore1", small_size - 1),
        no("smallignore2", small_size - 1),
        no("small1", small_size),
        no("small2", small_size),
        no("large1", large_size),
        no("large2", large_size),
        no("largeignore1", large_size + 1),
        no("largeignore2", large_size + 1),
    ]
    f[0].digest = f[0].digest_partial = f[0].digest_samples = "smallignore"
    f[1].digest = f[1].digest_partial = f[1].digest_samples = "smallignore"
    f[2].digest = f[2].digest_partial = f[2].digest_samples = "small"
    f[3].digest = f[3].digest_partial = f[3].digest_samples = "small"
    f[4].digest = f[4].digest_partial = f[4].digest_samples = "large"
    f[5].digest = f[5].digest_partial = f[5].digest_samples = "large"
    f[6].digest = f[6].digest_partial = f[6].digest_samples = "largeignore"
    f[7].digest = f[7].digest_partial = f[7].digest_samples = "largeignore"

    r = s.get_dupe_groups(f)
    # No ignores
    eq_(len(r), 4)
    # Ignore smaller
    s.size_threshold = small_size
    r = s.get_dupe_groups(f)
    eq_(len(r), 3)
    # Ignore larger
    s.size_threshold = 0
    s.large_size_threshold = large_size
    r = s.get_dupe_groups(f)
    eq_(len(r), 3)
    # Ignore both
    s.size_threshold = small_size
    r = s.get_dupe_groups(f)
    eq_(len(r), 2)


def test_big_file_partial_hashes(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.CONTENTS

    smallsize = 1
    bigsize = 100 * 1024 * 1024  # 100MB
    s.big_file_size_threshold = bigsize

    f = [no("bigfoo", bigsize), no("bigbar", bigsize), no("smallfoo", smallsize), no("smallbar", smallsize)]
    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
    f[2].digest = f[2].digest_partial = "bleh"
    f[3].digest = f[3].digest_partial = "bleh"
    r = s.get_dupe_groups(f)
    eq_(len(r), 2)

    # digest_partial is still the same, but the file is actually different
    f[1].digest = f[1].digest_samples = "difffoobar"
    # here we compare the full digests, as the user disabled the optimization
    s.big_file_size_threshold = 0
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)

    # here we should compare the digest_samples, and see they are different
    s.big_file_size_threshold = bigsize
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)


def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.CONTENTS
    f = [no("foo"), no("bar"), no("bleh")]
    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
    f[2].digest = f[2].digest_partial = f[2].digest_samples = "bleh"
    s.min_match_percentage = 101
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    eq_(len(r[0]), 2)
    s.min_match_percentage = 0
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    eq_(len(r[0]), 2)


def test_content_scan_doesnt_put_digest_in_words_at_the_end(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.CONTENTS
    f = [no("foo"), no("bar")]
    f[0].digest = f[0].digest_partial = f[0].digest_samples = (
        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
    )
    f[1].digest = f[1].digest_partial = f[1].digest_samples = (
        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
    )
    r = s.get_dupe_groups(f)
    # FIXME looks like we are missing something here?
    r[0]


def test_extension_is_not_counted_in_filename_scan(fake_fileexists):
    s = Scanner()
    s.min_match_percentage = 100
    f = [no("foo.bar"), no("foo.bleh")]
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    eq_(len(r[0]), 2)


def test_job(fake_fileexists):
    def do_progress(progress, desc=""):
        log.append(progress)
        return True

    s = Scanner()
    log = []
    f = [no("foo bar"), no("foo bar"), no("foo bleh")]
    s.get_dupe_groups(f, j=job.Job(1, do_progress))
    eq_(log[0], 0)
    eq_(log[-1], 100)


def test_mix_file_kind(fake_fileexists):
    s = Scanner()
    s.mix_file_kind = False
    f = [no("foo.1"), no("foo.2")]
    r = s.get_dupe_groups(f)
    eq_(len(r), 0)


def test_word_weighting(fake_fileexists):
    s = Scanner()
    s.min_match_percentage = 75
    s.word_weighting = True
    f = [no("foo bar"), no("foo bar bleh")]
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    g = r[0]
    m = g.get_match_of(g.dupes[0])
    eq_(m.percentage, 75)  # 16 letters, 12 matching


def test_similar_words(fake_fileexists):
    s = Scanner()
    s.match_similar_words = True
    f = [
        no("The White Stripes"),
        no("The Whites Stripe"),
        no("Limp Bizkit"),
        no("Limp Bizkitt"),
    ]
    r = s.get_dupe_groups(f)
    eq_(len(r), 2)


def test_fields(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.FIELDS
    f = [no("The White Stripes - Little Ghost"), no("The White Stripes - Little Acorn")]
    r = s.get_dupe_groups(f)
    eq_(len(r), 0)


def test_fields_no_order(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.FIELDSNOORDER
    f = [no("The White Stripes - Little Ghost"), no("Little Ghost - The White Stripes")]
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)


def test_tag_scan(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.TAG
    o1 = no("foo")
    o2 = no("bar")
    o1.artist = "The White Stripes"
    o1.title = "The Air Near My Fingers"
    o2.artist = "The White Stripes"
    o2.title = "The Air Near My Fingers"
    r = s.get_dupe_groups([o1, o2])
    eq_(len(r), 1)


def test_tag_with_album_scan(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.TAG
    s.scanned_tags = {"artist", "album", "title"}
    o1 = no("foo")
    o2 = no("bar")
    o3 = no("bleh")
    o1.artist = "The White Stripes"
    o1.title = "The Air Near My Fingers"
    o1.album = "Elephant"
    o2.artist = "The White Stripes"
    o2.title = "The Air Near My Fingers"
    o2.album = "Elephant"
    o3.artist = "The White Stripes"
    o3.title = "The Air Near My Fingers"
    o3.album = "foobar"
    r = s.get_dupe_groups([o1, o2, o3])
    eq_(len(r), 1)


def test_that_dash_in_tags_dont_create_new_fields(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.TAG
    s.scanned_tags = {"artist", "album", "title"}
    s.min_match_percentage = 50
    o1 = no("foo")
    o2 = no("bar")
    o1.artist = "The White Stripes - a"
    o1.title = "The Air Near My Fingers - a"
    o1.album = "Elephant - a"
    o2.artist = "The White Stripes - b"
    o2.title = "The Air Near My Fingers - b"
    o2.album = "Elephant - b"
    r = s.get_dupe_groups([o1, o2])
    eq_(len(r), 1)


def test_tag_scan_with_different_scanned(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.TAG
    s.scanned_tags = {"track", "year"}
    o1 = no("foo")
    o2 = no("bar")
    o1.artist = "The White Stripes"
    o1.title = "some title"
    o1.track = "foo"
    o1.year = "bar"
    o2.artist = "The White Stripes"
    o2.title = "another title"
    o2.track = "foo"
    o2.year = "bar"
    r = s.get_dupe_groups([o1, o2])
    eq_(len(r), 1)


def test_tag_scan_only_scans_existing_tags(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.TAG
    s.scanned_tags = {"artist", "foo"}
    o1 = no("foo")
    o2 = no("bar")
    o1.artist = "The White Stripes"
    o1.foo = "foo"
    o2.artist = "The White Stripes"
    o2.foo = "bar"
    r = s.get_dupe_groups([o1, o2])
    eq_(len(r), 1)  # Because 'foo' is not scanned, they match


def test_tag_scan_converts_to_str(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.TAG
    s.scanned_tags = {"track"}
    o1 = no("foo")
    o2 = no("bar")
    o1.track = 42
    o2.track = 42
    try:
        r = s.get_dupe_groups([o1, o2])
    except TypeError:
        raise AssertionError()
    eq_(len(r), 1)


def test_tag_scan_non_ascii(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.TAG
    s.scanned_tags = {"title"}
    o1 = no("foo")
    o2 = no("bar")
    o1.title = "foobar\u00e9"
    o2.title = "foobar\u00e9"
    try:
        r = s.get_dupe_groups([o1, o2])
    except UnicodeEncodeError:
        raise AssertionError()
    eq_(len(r), 1)


def test_ignore_list(fake_fileexists):
    s = Scanner()
    f1 = no("foobar")
    f2 = no("foobar")
    f3 = no("foobar")
    f1.path = Path("dir1/foobar")
    f2.path = Path("dir2/foobar")
    f3.path = Path("dir3/foobar")
    ignore_list = IgnoreList()
    ignore_list.ignore(str(f1.path), str(f2.path))
    ignore_list.ignore(str(f1.path), str(f3.path))
    r = s.get_dupe_groups([f1, f2, f3], ignore_list=ignore_list)
    eq_(len(r), 1)
    g = r[0]
    eq_(len(g.dupes), 1)
    assert f1 not in g
    assert f2 in g
    assert f3 in g
    # Ignored matches are not counted as discarded
    eq_(s.discarded_file_count, 0)


def test_ignore_list_checks_for_unicode(fake_fileexists):
    # scanner was calling path_str for ignore list checks. Since the Path changes, it must
    # be unicode(path)
    s = Scanner()
    f1 = no("foobar")
    f2 = no("foobar")
    f3 = no("foobar")
    f1.path = Path("foo1\u00e9")
    f2.path = Path("foo2\u00e9")
    f3.path = Path("foo3\u00e9")
    ignore_list = IgnoreList()
    ignore_list.ignore(str(f1.path), str(f2.path))
    ignore_list.ignore(str(f1.path), str(f3.path))
    r = s.get_dupe_groups([f1, f2, f3], ignore_list=ignore_list)
    eq_(len(r), 1)
    g = r[0]
    eq_(len(g.dupes), 1)
    assert f1 not in g
    assert f2 in g
    assert f3 in g


def test_file_evaluates_to_false(fake_fileexists):
    # A very wrong way to use any() was added at some point, causing resulting group list
    # to be empty.
    class FalseNamedObject(NamedObject):
        def __bool__(self):
            return False

    s = Scanner()
    f1 = FalseNamedObject("foobar", path="p1")
    f2 = FalseNamedObject("foobar", path="p2")
    r = s.get_dupe_groups([f1, f2])
    eq_(len(r), 1)


def test_size_threshold(fake_fileexists):
    # Only files whose size is equal to or higher than size_threshold are scanned
    s = Scanner()
    f1 = no("foo", 1, path="p1")
    f2 = no("foo", 2, path="p2")
    f3 = no("foo", 3, path="p3")
    s.size_threshold = 2
    groups = s.get_dupe_groups([f1, f2, f3])
    eq_(len(groups), 1)
    [group] = groups
    eq_(len(group), 2)
    assert f1 not in group
    assert f2 in group
    assert f3 in group


def test_tie_breaker_path_deepness(fake_fileexists):
    # If there is a tie in prioritization, path deepness is used as a tie breaker
    s = Scanner()
    o1, o2 = no("foo"), no("foo")
    o1.path = Path("foo")
    o2.path = Path("foo/bar")
    [group] = s.get_dupe_groups([o1, o2])
    assert group.ref is o2


def test_tie_breaker_copy(fake_fileexists):
    # if copy is in the words used (even if it has a deeper path), it becomes a dupe
    s = Scanner()
    o1, o2 = no("foo bar Copy"), no("foo bar")
    o1.path = Path("deeper/path")
    o2.path = Path("foo")
    [group] = s.get_dupe_groups([o1, o2])
    assert group.ref is o2


def test_tie_breaker_same_name_plus_digit(fake_fileexists):
    # if ref has the same words as dupe, but has just one extra word which is a digit, it
    # becomes a dupe
    s = Scanner()
    o1 = no("foo bar 42")
    o2 = no("foo bar [42]")
    o3 = no("foo bar (42)")
    o4 = no("foo bar {42}")
    o5 = no("foo bar")
    # all numbered names have deeper paths, so they'll end up ref if the digits aren't correctly
    # used as tie breakers
    o1.path = Path("deeper/path")
    o2.path = Path("deeper/path")
    o3.path = Path("deeper/path")
    o4.path = Path("deeper/path")
    o5.path = Path("foo")
    [group] = s.get_dupe_groups([o1, o2, o3, o4, o5])
    assert group.ref is o5


def test_partial_group_match(fake_fileexists):
    # Count the number of discarded matches (when a file doesn't match all other dupes of the
    # group) in Scanner.discarded_file_count
    s = Scanner()
    o1, o2, o3 = no("a b"), no("a"), no("b")
    s.min_match_percentage = 50
    [group] = s.get_dupe_groups([o1, o2, o3])
    eq_(len(group), 2)
    assert o1 in group
    # The file that will actually be counted as a dupe is undefined. The only thing we want to test
    # is that we don't have both
    if o2 in group:
        assert o3 not in group
    else:
        assert o3 in group
    eq_(s.discarded_file_count, 1)


def test_dont_group_files_that_dont_exist(tmpdir):
    # when creating groups, check that files exist first. It's possible that these files have
    # been moved during the scan by the user.
    # In this test, we have to delete one of the files between the get_matches() part and the
    # get_groups() part.
    s = Scanner()
    s.scan_type = ScanType.CONTENTS
    p = Path(str(tmpdir))
    with p.joinpath("file1").open("w") as fp:
        fp.write("foo")
    with p.joinpath("file2").open("w") as fp:
        fp.write("foo")
    file1, file2 = fs.get_files(p)

    def getmatches(*args, **kw):
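        # simulate the user deleting file2 while the scan is still in progress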
        file2.path.unlink()
        return [Match(file1, file2, 100)]

    s._getmatches = getmatches

    assert not s.get_dupe_groups([file1, file2])


def test_folder_scan_exclude_subfolder_matches(fake_fileexists):
    # when doing a Folders scan type, don't include matches for folders whose parent folder already
    # matches.
    s = Scanner()
    s.scan_type = ScanType.FOLDERS
    topf1 = no("top folder 1", size=42)
    topf1.digest = topf1.digest_partial = topf1.digest_samples = b"some_digest__1"
    topf1.path = Path("/topf1")
    topf2 = no("top folder 2", size=42)
    topf2.digest = topf2.digest_partial = topf2.digest_samples = b"some_digest__1"
    topf2.path = Path("/topf2")
    subf1 = no("sub folder 1", size=41)
    subf1.digest = subf1.digest_partial = subf1.digest_samples = b"some_digest__2"
    subf1.path = Path("/topf1/sub")
    subf2 = no("sub folder 2", size=41)
    subf2.digest = subf2.digest_partial = subf2.digest_samples = b"some_digest__2"
    subf2.path = Path("/topf2/sub")
    eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2])), 1)  # only top folders
    # however, if another folder matches a subfolder, keep it in the matches
    otherf = no("other folder", size=41)
    otherf.digest = otherf.digest_partial = otherf.digest_samples = b"some_digest__2"
    otherf.path = Path("/otherfolder")
    eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2, otherf])), 2)


def test_ignore_files_with_same_path(fake_fileexists):
    # It's possible that the scanner is fed with two file instances pointing to the same path. One
    # of these files has to be ignored
    s = Scanner()
    f1 = no("foobar", path="path1/foobar")
    f2 = no("foobar", path="path1/foobar")
    eq_(s.get_dupe_groups([f1, f2]), [])


def test_dont_count_ref_files_as_discarded(fake_fileexists):
    # To speed up the scan, we don't bother comparing contents of files that are both ref files.
    # However, this causes problems in "discarded" counting and we make sure here that we don't
    # report discarded matches in exact duplicate scans.
    s = Scanner()
    s.scan_type = ScanType.CONTENTS
    o1 = no("foo", path="p1")
    o2 = no("foo", path="p2")
    o3 = no("foo", path="p3")
    o1.digest = o1.digest_partial = o1.digest_samples = "foobar"
    o2.digest = o2.digest_partial = o2.digest_samples = "foobar"
    o3.digest = o3.digest_partial = o3.digest_samples = "foobar"
    o1.is_ref = True
    o2.is_ref = True
    eq_(len(s.get_dupe_groups([o1, o2, o3])), 1)
    eq_(s.discarded_file_count, 0)


def test_prioritize_me(fake_fileexists):
    # in ScannerME, bitrate goes first (right after is_ref) in prioritization
    s = ScannerME()
    o1, o2 = no("foo", path="p1"), no("foo", path="p2")
    o1.bitrate = 1
    o2.bitrate = 2
    [group] = s.get_dupe_groups([o1, o2])
    assert group.ref is o2