Mirror of https://github.com/arsenetar/dupeguru.git
Add unit tests for hash sample optimization
* Instead of keeping md5 samples separate, merge them as one hash computed from the various selected chunks we picked.
* We don't need to keep a boolean to see whether or not the user chose to optimize; we can simply compare the value of the threshold, since 0 means no optimization currently active.
Parent: e07dfd5955
Commit: 277bc3fbb8
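The sketch below is not part of the commit; the function names, the offsets argument and CHUNK_SIZE are only illustrative. It contrasts the two approaches described in the commit message: keeping one md5 hexdigest per sampled chunk versus folding every sampled chunk into a single digest, which is what md5samples now stores.

import hashlib

CHUNK_SIZE = 1024 * 1024  # illustrative sample size

def sample_hashes_separate(fp, offsets):
    # Old idea: one digest per sampled chunk, kept together as a tuple.
    chunks = []
    for offset in offsets:
        fp.seek(offset)
        chunks.append(hashlib.md5(fp.read(CHUNK_SIZE)).hexdigest())
    return tuple(chunks)

def sample_hash_merged(fp, offsets):
    # New idea: feed every sampled chunk into one md5 object and keep
    # only the final digest (bytes), just like md5 and md5partial.
    md5 = hashlib.md5()
    for offset in offsets:
        fp.seek(offset)
        md5.update(fp.read(CHUNK_SIZE))
    return md5.digest()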
@@ -286,7 +286,8 @@ def getmatches(

 def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
     """Returns a list of :class:`Match` within ``files`` if their contents is the same.

-    :param bigsize: The size in bytes over which we consider files too big for a full md5.
+    :param bigsize: The size in bytes over which we consider files big enough to
+        justify taking samples of md5. If 0, compute md5 as usual.
     :param j: A :ref:`job progress instance <jobs>`.
     """
     size2files = defaultdict(set)
@@ -304,7 +305,6 @@ def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
                 continue  # Don't spend time comparing two ref pics together.
             if first.md5partial == second.md5partial:
                 if bigsize > 0 and first.size > bigsize:
-                    print(f"first md5chunks {first} {first.md5samples}, second {second} {second.md5samples}")
                     if first.md5samples == second.md5samples:
                         result.append(Match(first, second, 100))
                 else:
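For orientation, a minimal sketch of the comparison this hunk implements. The else branch is cut off above, so the fall-through to a full md5 comparison is an assumption, and contents_match is not a real dupeGuru function.

def contents_match(first, second, bigsize):
    # Files are only compared further when their first-chunks hash already matches.
    if first.md5partial != second.md5partial:
        return False
    if bigsize > 0 and first.size > bigsize:
        # Optimization active and the file is big: compare sampled hashes only.
        return first.md5samples == second.md5samples
    # bigsize == 0 (optimization off) or a small file: assumed full-md5 comparison.
    return first.md5 == second.md5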
core/fs.py (30 lines changed)
@@ -83,9 +83,9 @@ class File:
     INITIAL_INFO = {
         "size": 0,
         "mtime": 0,
-        "md5": "",
-        "md5partial": "",
-        "md5samples": []
+        "md5": b"",
+        "md5partial": b"",
+        "md5samples": b""
     }
     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
     # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
@@ -104,7 +104,6 @@ class File:
         result = object.__getattribute__(self, attrname)
         if result is NOT_SET:
             try:
-                print(f"Try get attr for {self} {attrname}")
                 self._read_info(attrname)
             except Exception as e:
                 logging.warning(
@@ -121,12 +120,12 @@ class File:
         return (0x4000, 0x4000)  # 16Kb

     def _read_info(self, field):
+        # print(f"_read_info({field}) for {self}")
         if field in ("size", "mtime"):
             stats = self.path.stat()
             self.size = nonone(stats.st_size, 0)
             self.mtime = nonone(stats.st_mtime, 0)
         elif field == "md5partial":
-            print(f"_read_info md5partial {self}")
             try:
                 with self.path.open("rb") as fp:
                     offset, size = self._get_md5partial_offset_and_size()
@@ -146,27 +145,24 @@ class File:
            except Exception:
                pass
        elif field == "md5samples":
-            print(f"computing md5chunks for {self}, caller: {inspect.stack()[1][3]}")
            try:
                with self.path.open("rb") as fp:
-                    md5chunks = []
+                    size = self.size
                    # Chunk at 25% of the file
-                    fp.seek(floor(self.size * 25 / 100), 0)
+                    fp.seek(floor(size * 25 / 100), 0)
                    filedata = fp.read(CHUNK_SIZE)
-                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+                    md5 = hashlib.md5(filedata)

                    # Chunk at 60% of the file
-                    fp.seek(floor(self.size * 60 / 100), 0)
+                    fp.seek(floor(size * 60 / 100), 0)
                    filedata = fp.read(CHUNK_SIZE)
-                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+                    md5.update(filedata)

                    # Last chunk of the file
                    fp.seek(-CHUNK_SIZE, 2)
                    filedata = fp.read(CHUNK_SIZE)
-                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+                    md5.update(filedata)
+                    setattr(self, field, md5.digest())

-                # Use setattr to avoid circular (de)reference
-                setattr(self, field, tuple(md5chunks))
            except Exception as e:
                logging.error(f"Error computing md5samples: {e}")
                pass
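Read in isolation, the new md5samples branch boils down to the standalone sketch below; sampled_md5 and the 1 MiB chunk size are illustrative, not dupeGuru API. One CHUNK_SIZE read at 25% of the file, one at 60%, and the last chunk are all folded into a single digest.

import hashlib
from math import floor

CHUNK_SIZE = 1024 * 1024  # illustrative; dupeGuru defines its own constant

def sampled_md5(path, chunk_size=CHUNK_SIZE):
    # Hash three samples of a file instead of its whole contents.
    md5 = hashlib.md5()
    with open(path, "rb") as fp:
        fp.seek(0, 2)
        size = fp.tell()
        for offset in (floor(size * 25 / 100), floor(size * 60 / 100)):
            fp.seek(offset, 0)
            md5.update(fp.read(chunk_size))
        fp.seek(-min(chunk_size, size), 2)  # last chunk, clamped for small files
        md5.update(fp.read(chunk_size))
    return md5.digest()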
@@ -239,16 +235,16 @@ class Folder(File):
        return folders + files

    def _read_info(self, field):
+        # print(f"_read_info({field}) for Folder {self}")
        if field in {"size", "mtime"}:
            size = sum((f.size for f in self._all_items()), 0)
            self.size = size
            stats = self.path.stat()
            self.mtime = nonone(stats.st_mtime, 0)
-        elif field in {"md5", "md5partial"}:
+        elif field in {"md5", "md5partial", "md5samples"}:
            # What's sensitive here is that we must make sure that subfiles'
            # md5 are always added up in the same order, but we also want a
            # different md5 if a file gets moved in a different subdirectory.
-            print(f"Getting {field} of folder {self}...")

            def get_dir_md5_concat():
                items = self._all_items()
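The hunk above is cut off inside get_dir_md5_concat, so the following is only a hypothetical sketch of the ordered aggregation the comment describes, shaped after what the new fs tests further down expect; is_folder and children are assumed attribute names, not necessarily dupeGuru's.

import hashlib

def folder_field_hash(items, field="md5samples"):
    # Children are assumed to arrive in a stable (sorted) order so the folder
    # hash is reproducible across scans, yet changes if a file moves to another
    # subdirectory. Subfolders contribute the digest of their own aggregate;
    # plain files contribute their hash value directly.
    md5 = hashlib.md5()
    for item in items:
        if item.is_folder:
            md5.update(folder_field_hash(item.children, field))
        else:
            md5.update(getattr(item, field))
    return md5.digest()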
@@ -89,7 +89,7 @@ class Scanner:
        if self.scan_type in {ScanType.Contents, ScanType.Folders}:
            return engine.getmatches_by_contents(
                files,
-                bigsize=self.big_file_size_threshold if self.big_file_partial_hashes else 0,
+                bigsize=self.big_file_size_threshold,
                j=j
            )
        else:
@@ -222,6 +222,5 @@ class Scanner:
    scan_type = ScanType.Filename
    scanned_tags = {"artist", "title"}
    size_threshold = 0
-    big_file_partial_hashes = True
-    big_file_size_threshold = 100 * 1024 * 1024
+    big_file_size_threshold = 0
    word_weighting = False
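A short usage sketch of what the Scanner change means in practice; the import path is assumed, while the attribute names come from the diff itself. The threshold now doubles as the enable flag: 0 keeps full hashing, any positive value turns sampling on for files above that size.

from core.scanner import Scanner, ScanType  # assumed import path

s = Scanner()
s.scan_type = ScanType.Contents
s.big_file_size_threshold = 0                  # new default: no sampling, full md5 everywhere
s.big_file_size_threshold = 100 * 1024 * 1024  # sample files bigger than 100 MiB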
@@ -88,6 +88,7 @@ class NamedObject:
        self.size = size
        self.md5partial = name
        self.md5 = name
+        self.md5samples = name
        if with_words:
            self.words = getwords(name)
        self.is_ref = False
@@ -551,6 +551,28 @@ class TestCaseGetMatchesByContents:
        o1, o2 = no(size=0), no(size=0)
        assert not getmatches_by_contents([o1, o2])

+    def test_big_file_partial_hashes(self):
+        smallsize = 1
+        bigsize = 100 * 1024 * 1024  # 100MB
+        f = [no("bigfoo", size=bigsize), no("bigbar", size=bigsize),
+             no("smallfoo", size=smallsize), no("smallbar", size=smallsize)]
+        f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+        f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+        f[2].md5 = f[2].md5partial = "bleh"
+        f[3].md5 = f[3].md5partial = "bleh"
+        r = getmatches_by_contents(f, bigsize=bigsize)
+        eq_(len(r), 2)
+        # User disabled optimization for big files, compute hashes as usual
+        r = getmatches_by_contents(f, bigsize=0)
+        eq_(len(r), 2)
+        # Other file is now slightly different, md5partial is still the same
+        f[1].md5 = f[1].md5samples = "foobardiff"
+        r = getmatches_by_contents(f, bigsize=bigsize)
+        # Successfully filter it out
+        eq_(len(r), 1)
+        r = getmatches_by_contents(f, bigsize=0)
+        eq_(len(r), 1)
+

 class TestCaseGroup:
    def test_empy(self):
@@ -7,6 +7,7 @@
 # http://www.gnu.org/licenses/gpl-3.0.html

 import hashlib
+from os import urandom

 from hscommon.path import Path
 from hscommon.testutil import eq_
@@ -15,6 +16,36 @@ from core.tests.directories_test import create_fake_fs
 from .. import fs


+def create_fake_fs_with_random_data(rootpath):
+    rootpath = rootpath["fs"]
+    rootpath.mkdir()
+    rootpath["dir1"].mkdir()
+    rootpath["dir2"].mkdir()
+    rootpath["dir3"].mkdir()
+    fp = rootpath["file1.test"].open("wb")
+    data1 = urandom(200 * 1024)  # 200KiB
+    data2 = urandom(1024 * 1024)  # 1MiB
+    data3 = urandom(10 * 1024 * 1024)  # 10MiB
+    fp.write(data1)
+    fp.close()
+    fp = rootpath["file2.test"].open("wb")
+    fp.write(data2)
+    fp.close()
+    fp = rootpath["file3.test"].open("wb")
+    fp.write(data3)
+    fp.close()
+    fp = rootpath["dir1"]["file1.test"].open("wb")
+    fp.write(data1)
+    fp.close()
+    fp = rootpath["dir2"]["file2.test"].open("wb")
+    fp.write(data2)
+    fp.close()
+    fp = rootpath["dir3"]["file3.test"].open("wb")
+    fp.write(data3)
+    fp.close()
+    return rootpath
+
+
 def test_size_aggregates_subfiles(tmpdir):
     p = create_fake_fs(Path(str(tmpdir)))
     b = fs.Folder(p)
@@ -25,7 +56,7 @@ def test_md5_aggregate_subfiles_sorted(tmpdir):
     # dir.allfiles can return child in any order. Thus, bundle.md5 must aggregate
     # all files' md5 it contains, but it must make sure that it does so in the
     # same order everytime.
-    p = create_fake_fs(Path(str(tmpdir)))
+    p = create_fake_fs_with_random_data(Path(str(tmpdir)))
     b = fs.Folder(p)
     md51 = fs.File(p["dir1"]["file1.test"]).md5
     md52 = fs.File(p["dir2"]["file2.test"]).md5
@@ -41,6 +72,36 @@ def test_md5_aggregate_subfiles_sorted(tmpdir):
     eq_(b.md5, md5.digest())


+def test_partial_md5_aggregate_subfile_sorted(tmpdir):
+    p = create_fake_fs_with_random_data(Path(str(tmpdir)))
+    b = fs.Folder(p)
+    md51 = fs.File(p["dir1"]["file1.test"]).md5partial
+    md52 = fs.File(p["dir2"]["file2.test"]).md5partial
+    md53 = fs.File(p["dir3"]["file3.test"]).md5partial
+    md54 = fs.File(p["file1.test"]).md5partial
+    md55 = fs.File(p["file2.test"]).md5partial
+    md56 = fs.File(p["file3.test"]).md5partial
+    # The expected md5 is the md5 of md5s for folders and the direct md5 for files
+    folder_md51 = hashlib.md5(md51).digest()
+    folder_md52 = hashlib.md5(md52).digest()
+    folder_md53 = hashlib.md5(md53).digest()
+    md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
+    eq_(b.md5partial, md5.digest())
+
+    md51 = fs.File(p["dir1"]["file1.test"]).md5samples
+    md52 = fs.File(p["dir2"]["file2.test"]).md5samples
+    md53 = fs.File(p["dir3"]["file3.test"]).md5samples
+    md54 = fs.File(p["file1.test"]).md5samples
+    md55 = fs.File(p["file2.test"]).md5samples
+    md56 = fs.File(p["file3.test"]).md5samples
+    # The expected md5 is the md5 of md5s for folders and the direct md5 for files
+    folder_md51 = hashlib.md5(md51).digest()
+    folder_md52 = hashlib.md5(md52).digest()
+    folder_md53 = hashlib.md5(md53).digest()
+    md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
+    eq_(b.md5samples, md5.digest())
+
+
 def test_has_file_attrs(tmpdir):
     # a Folder must behave like a file, so it must have mtime attributes
     b = fs.Folder(Path(str(tmpdir)))
@@ -56,6 +56,7 @@ def test_default_settings(fake_fileexists):
    eq_(s.mix_file_kind, True)
    eq_(s.word_weighting, False)
    eq_(s.match_similar_words, False)
+    eq_(s.big_file_size_threshold, 0)


 def test_simple_with_default_settings(fake_fileexists):
@@ -120,9 +121,9 @@ def test_content_scan(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.Contents
    f = [no("foo"), no("bar"), no("bleh")]
-    f[0].md5 = f[0].md5partial = "foobar"
-    f[1].md5 = f[1].md5partial = "foobar"
-    f[2].md5 = f[2].md5partial = "bleh"
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+    f[2].md5 = f[2].md5partial = f[2].md5samples = "bleh"
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    eq_(len(r[0]), 2)
@@ -141,13 +142,43 @@ def test_content_scan_compare_sizes_first(fake_fileexists):
    eq_(len(s.get_dupe_groups(f)), 0)


+def test_big_file_partial_hashes(fake_fileexists):
+    s = Scanner()
+    s.scan_type = ScanType.Contents
+
+    smallsize = 1
+    bigsize = 100 * 1024 * 1024  # 100MB
+    s.big_file_size_threshold = bigsize
+
+    f = [no("bigfoo", bigsize), no("bigbar", bigsize),
+         no("smallfoo", smallsize), no("smallbar", smallsize)]
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+    f[2].md5 = f[2].md5partial = "bleh"
+    f[3].md5 = f[3].md5partial = "bleh"
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 2)
+
+    # md5partial is still the same, but the file is actually different
+    f[1].md5 = f[1].md5samples = "difffoobar"
+    # here we compare the full md5s, as the user disabled the optimization
+    s.big_file_size_threshold = 0
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 1)
+
+    # here we should compare the md5samples, and see they are different
+    s.big_file_size_threshold = bigsize
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 1)
+
+
 def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.Contents
    f = [no("foo"), no("bar"), no("bleh")]
-    f[0].md5 = f[0].md5partial = "foobar"
-    f[1].md5 = f[1].md5partial = "foobar"
-    f[2].md5 = f[2].md5partial = "bleh"
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+    f[2].md5 = f[2].md5partial = f[2].md5samples = "bleh"
    s.min_match_percentage = 101
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
@@ -162,13 +193,12 @@ def test_content_scan_doesnt_put_md5_in_words_at_the_end(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.Contents
    f = [no("foo"), no("bar")]
-    f[0].md5 = f[
-        0
-    ].md5partial = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
-    f[1].md5 = f[
-        1
-    ].md5partial = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    f[0].md5 = f[0].md5partial = f[0].md5samples =\
+        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    f[1].md5 = f[1].md5partial = f[1].md5samples =\
+        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
    r = s.get_dupe_groups(f)
+    # FIXME looks like we are missing something here?
    r[0]

@@ -514,21 +544,21 @@ def test_folder_scan_exclude_subfolder_matches(fake_fileexists):
    s = Scanner()
    s.scan_type = ScanType.Folders
    topf1 = no("top folder 1", size=42)
-    topf1.md5 = topf1.md5partial = b"some_md5_1"
+    topf1.md5 = topf1.md5partial = topf1.md5samples = b"some_md5_1"
    topf1.path = Path("/topf1")
    topf2 = no("top folder 2", size=42)
-    topf2.md5 = topf2.md5partial = b"some_md5_1"
+    topf2.md5 = topf2.md5partial = topf2.md5samples = b"some_md5_1"
    topf2.path = Path("/topf2")
    subf1 = no("sub folder 1", size=41)
-    subf1.md5 = subf1.md5partial = b"some_md5_2"
+    subf1.md5 = subf1.md5partial = subf1.md5samples = b"some_md5_2"
    subf1.path = Path("/topf1/sub")
    subf2 = no("sub folder 2", size=41)
-    subf2.md5 = subf2.md5partial = b"some_md5_2"
+    subf2.md5 = subf2.md5partial = subf2.md5samples = b"some_md5_2"
    subf2.path = Path("/topf2/sub")
    eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2])), 1)  # only top folders
    # however, if another folder matches a subfolder, keep in in the matches
    otherf = no("other folder", size=41)
-    otherf.md5 = otherf.md5partial = b"some_md5_2"
+    otherf.md5 = otherf.md5partial = otherf.md5samples = b"some_md5_2"
    otherf.path = Path("/otherfolder")
    eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2, otherf])), 2)
@@ -551,9 +581,9 @@ def test_dont_count_ref_files_as_discarded(fake_fileexists):
    o1 = no("foo", path="p1")
    o2 = no("foo", path="p2")
    o3 = no("foo", path="p3")
-    o1.md5 = o1.md5partial = "foobar"
-    o2.md5 = o2.md5partial = "foobar"
-    o3.md5 = o3.md5partial = "foobar"
+    o1.md5 = o1.md5partial = o1.md5samples = "foobar"
+    o2.md5 = o2.md5partial = o2.md5samples = "foobar"
+    o3.md5 = o3.md5partial = o3.md5samples = "foobar"
    o1.is_ref = True
    o2.is_ref = True
    eq_(len(s.get_dupe_groups([o1, o2, o3])), 1)