diff --git a/core/engine.py b/core/engine.py
index 4c2eadf1..958c1f7b 100644
--- a/core/engine.py
+++ b/core/engine.py
@@ -283,7 +283,7 @@ def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
     """Returns a list of :class:`Match` within ``files`` if their contents is the same.
 
     :param bigsize: The size in bytes over which we consider files big enough to
-                    justify taking samples of md5. If 0, compute md5 as usual.
+                    justify taking samples of the file for hashing. If 0, compute digest as usual.
     :param j: A :ref:`job progress instance <jobs>`.
     """
     size2files = defaultdict(set)
@@ -300,15 +300,15 @@ def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
             if first.is_ref and second.is_ref:
                 continue  # Don't spend time comparing two ref pics together.
             if first.size == 0 and second.size == 0:
-                # skip md5 for zero length files
+                # skip hashing for zero length files
                 result.append(Match(first, second, 100))
                 continue
-            if first.md5partial == second.md5partial:
+            if first.digest_partial == second.digest_partial:
                 if bigsize > 0 and first.size > bigsize:
-                    if first.md5samples == second.md5samples:
+                    if first.digest_samples == second.digest_samples:
                         result.append(Match(first, second, 100))
                 else:
-                    if first.md5 == second.md5:
+                    if first.digest == second.digest:
                         result.append(Match(first, second, 100))
         group_count += 1
         j.add_progress(desc=PROGRESS_MESSAGE % (len(result), group_count))
diff --git a/core/fs.py b/core/fs.py
index 9a078818..fc235a4a 100644
--- a/core/fs.py
+++ b/core/fs.py
@@ -11,7 +11,7 @@
 # resulting needless complexity and memory usage. It's been a while since I wanted to do that fork,
 # and I'm doing it now.
 
-import hashlib
+import xxhash
 from math import floor
 import logging
 import sqlite3
@@ -40,7 +40,7 @@ NOT_SET = object()
 # CPU.
 CHUNK_SIZE = 1024 * 1024  # 1 MiB
 
-# Minimum size below which partial hashes don't need to be computed
+# Minimum size below which partial hashing is not used
 MIN_FILE_SIZE = 3 * CHUNK_SIZE  # 3MiB, because we take 3 samples
 
 
@@ -161,7 +161,7 @@ filesdb = FilesDB()  # Singleton
 class File:
     """Represents a file and holds metadata to be used for scanning."""
 
-    INITIAL_INFO = {"size": 0, "mtime": 0, "md5": b"", "md5partial": b"", "md5samples": b""}
+    INITIAL_INFO = {"size": 0, "mtime": 0, "digest": b"", "digest_partial": b"", "digest_samples": b""}
     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
    # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
     # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
@@ -187,32 +187,32 @@ class File:
             result = self.INITIAL_INFO[attrname]
         return result
 
-    def _calc_md5(self):
+    def _calc_digest(self):
         # type: () -> bytes
 
         with self.path.open("rb") as fp:
-            md5 = hashlib.md5()
+            file_hash = xxhash.xxh128()
             # The goal here is to not run out of memory on really big files. However, the chunk
             # size has to be large enough so that the python loop isn't too costly in terms of
             # CPU.
             CHUNK_SIZE = 1024 * 1024  # 1 mb
             filedata = fp.read(CHUNK_SIZE)
             while filedata:
-                md5.update(filedata)
+                file_hash.update(filedata)
                 filedata = fp.read(CHUNK_SIZE)
-            return md5.digest()
+            return file_hash.digest()
 
-    def _calc_md5partial(self):
+    def _calc_digest_partial(self):
         # type: () -> bytes
 
-        # This offset is where we should start reading the file to get a partial md5
+        # This offset is where we should start reading the file to get a partial hash
         # For audio file, it should be where audio data starts
         offset, size = (0x4000, 0x4000)
         with self.path.open("rb") as fp:
             fp.seek(offset)
-            partialdata = fp.read(size)
-            return hashlib.md5(partialdata).digest()
+            partial_data = fp.read(size)
+            return xxhash.xxh128_digest(partial_data)
 
     def _read_info(self, field):
         # print(f"_read_info({field}) for {self}")
@@ -220,48 +220,47 @@ class File:
             stats = self.path.stat()
             self.size = nonone(stats.st_size, 0)
             self.mtime = nonone(stats.st_mtime, 0)
-        elif field == "md5partial":
+        elif field == "digest_partial":
             try:
-                self.md5partial = filesdb.get(self.path, "md5partial")
-                if self.md5partial is None:
-                    self.md5partial = self._calc_md5partial()
-                    filesdb.put(self.path, "md5partial", self.md5partial)
+                self.digest_partial = filesdb.get(self.path, "md5partial")
+                if self.digest_partial is None:
+                    self.digest_partial = self._calc_digest_partial()
+                    filesdb.put(self.path, "md5partial", self.digest_partial)
             except Exception as e:
-                logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
-        elif field == "md5":
+                logging.warning("Couldn't get digest_partial for %s: %s", self.path, e)
+        elif field == "digest":
             try:
-                self.md5 = filesdb.get(self.path, "md5")
-                if self.md5 is None:
-                    self.md5 = self._calc_md5()
-                    filesdb.put(self.path, "md5", self.md5)
+                self.digest = filesdb.get(self.path, "md5")
+                if self.digest is None:
+                    self.digest = self._calc_digest()
+                    filesdb.put(self.path, "md5", self.digest)
             except Exception as e:
-                logging.warning("Couldn't get md5 for %s: %s", self.path, e)
-        elif field == "md5samples":
+                logging.warning("Couldn't get digest for %s: %s", self.path, e)
+        elif field == "digest_samples":
+            size = self.size
+            # Might as well hash such small files entirely.
+            if size <= MIN_FILE_SIZE:
+                setattr(self, field, self.digest)
+                return
             try:
                 with self.path.open("rb") as fp:
-                    size = self.size
-                    # Might as well hash such small files entirely.
-                    if size <= MIN_FILE_SIZE:
-                        setattr(self, field, self.md5)
-                        return
-
                     # Chunk at 25% of the file
                     fp.seek(floor(size * 25 / 100), 0)
-                    filedata = fp.read(CHUNK_SIZE)
-                    md5 = hashlib.md5(filedata)
+                    file_data = fp.read(CHUNK_SIZE)
+                    file_hash = xxhash.xxh128(file_data)
 
                     # Chunk at 60% of the file
                     fp.seek(floor(size * 60 / 100), 0)
-                    filedata = fp.read(CHUNK_SIZE)
-                    md5.update(filedata)
+                    file_data = fp.read(CHUNK_SIZE)
+                    file_hash.update(file_data)
 
                     # Last chunk of the file
                     fp.seek(-CHUNK_SIZE, 2)
-                    filedata = fp.read(CHUNK_SIZE)
-                    md5.update(filedata)
-                    setattr(self, field, md5.digest())
+                    file_data = fp.read(CHUNK_SIZE)
+                    file_hash.update(file_data)
+                    setattr(self, field, file_hash.digest())
             except Exception as e:
-                logging.error(f"Error computing md5samples: {e}")
+                logging.error(f"Error computing digest_samples: {e}")
 
     def _read_all_info(self, attrnames=None):
         """Cache all possible info.
@@ -314,7 +313,7 @@ class File:
 class Folder(File):
     """A wrapper around a folder path.
 
-    It has the size/md5 info of a File, but its value is the sum of its subitems.
+    It has the size/digest info of a File, but its value is the sum of its subitems.
     """
 
     __slots__ = File.__slots__ + ("_subfolders",)
@@ -335,19 +334,18 @@ class Folder(File):
             self.size = size
             stats = self.path.stat()
             self.mtime = nonone(stats.st_mtime, 0)
-        elif field in {"md5", "md5partial", "md5samples"}:
+        elif field in {"digest", "digest_partial", "digest_samples"}:
             # What's sensitive here is that we must make sure that subfiles'
-            # md5 are always added up in the same order, but we also want a
-            # different md5 if a file gets moved in a different subdirectory.
+            # digest are always added up in the same order, but we also want a
+            # different digest if a file gets moved in a different subdirectory.
 
-            def get_dir_md5_concat():
+            def get_dir_digest_concat():
                 items = self._all_items()
                 items.sort(key=lambda f: f.path)
-                md5s = [getattr(f, field) for f in items]
-                return b"".join(md5s)
+                digests = [getattr(f, field) for f in items]
+                return b"".join(digests)
 
-            md5 = hashlib.md5(get_dir_md5_concat())
-            digest = md5.digest()
+            digest = xxhash.xxh128_digest(get_dir_digest_concat())
             setattr(self, field, digest)
 
     @property
diff --git a/core/me/fs.py b/core/me/fs.py
index 6bf7401b..fbfdfa09 100644
--- a/core/me/fs.py
+++ b/core/me/fs.py
@@ -97,11 +97,6 @@ class MusicFile(fs.File):
             "dupe_count": format_dupe_count(dupe_count),
         }
 
-    def _get_md5partial_offset_and_size(self):
-        # No longer calculating the offset and audio size, just whole file
-        size = self.path.stat().st_size
-        return (0, size)
-
     def _read_info(self, field):
         fs.File._read_info(self, field)
         if field in TAG_FIELDS:
diff --git a/core/pe/matchblock.py b/core/pe/matchblock.py
index 0b26a233..063ec725 100644
--- a/core/pe/matchblock.py
+++ b/core/pe/matchblock.py
@@ -238,7 +238,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
     for ref_id, other_id, percentage in myiter:
         ref = id2picture[ref_id]
         other = id2picture[other_id]
-        if percentage == 100 and ref.md5 != other.md5:
+        if percentage == 100 and ref.digest != other.digest:
             percentage = 99
         if percentage >= threshold:
             ref.dimensions  # pre-read dimensions for display in results
diff --git a/core/tests/base.py b/core/tests/base.py
index 36aa3cd0..9f6e68d4 100644
--- a/core/tests/base.py
+++ b/core/tests/base.py
@@ -86,9 +86,9 @@ class NamedObject:
             folder = "basepath"
         self._folder = Path(folder)
         self.size = size
-        self.md5partial = name
-        self.md5 = name
-        self.md5samples = name
+        self.digest_partial = name
+        self.digest = name
+        self.digest_samples = name
         if with_words:
             self.words = getwords(name)
         self.is_ref = False
diff --git a/core/tests/engine_test.py b/core/tests/engine_test.py
index cc1ea532..b35076f3 100644
--- a/core/tests/engine_test.py
+++ b/core/tests/engine_test.py
@@ -530,7 +530,7 @@ class TestCaseGetMatches:
 
 
 class TestCaseGetMatchesByContents:
-    def test_big_file_partial_hashes(self):
+    def test_big_file_partial_hashing(self):
         smallsize = 1
         bigsize = 100 * 1024 * 1024  # 100MB
         f = [
@@ -539,17 +539,17 @@ class TestCaseGetMatchesByContents:
             no("smallfoo", size=smallsize),
             no("smallbar", size=smallsize),
         ]
-        f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
-        f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
-        f[2].md5 = f[2].md5partial = "bleh"
-        f[3].md5 = f[3].md5partial = "bleh"
+        f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
+        f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
+        f[2].digest = f[2].digest_partial = "bleh"
+        f[3].digest = f[3].digest_partial = "bleh"
         r = getmatches_by_contents(f, bigsize=bigsize)
         eq_(len(r), 2)
-        # User disabled optimization for big files, compute hashes as usual
+        # User disabled optimization for big files, compute digests as usual
         r = getmatches_by_contents(f, bigsize=0)
         eq_(len(r), 2)
-        # Other file is now slightly different, md5partial is still the same
-        f[1].md5 = f[1].md5samples = "foobardiff"
+        # Other file is now slightly different, digest_partial is still the same
+        f[1].digest = f[1].digest_samples = "foobardiff"
         r = getmatches_by_contents(f, bigsize=bigsize)
         # Successfully filter it out
         eq_(len(r), 1)
diff --git a/core/tests/fs_test.py b/core/tests/fs_test.py
index 97793ee7..d5b7610b 100644
--- a/core/tests/fs_test.py
+++ b/core/tests/fs_test.py
@@ -6,7 +6,7 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html
 
-import hashlib
+import xxhash
 from os import urandom
 
 from hscommon.path import Path
@@ -52,54 +52,54 @@ def test_size_aggregates_subfiles(tmpdir):
     eq_(b.size, 12)
 
 
-def test_md5_aggregate_subfiles_sorted(tmpdir):
-    # dir.allfiles can return child in any order. Thus, bundle.md5 must aggregate
-    # all files' md5 it contains, but it must make sure that it does so in the
+def test_digest_aggregate_subfiles_sorted(tmpdir):
+    # dir.allfiles can return child in any order. Thus, bundle.digest must aggregate
+    # all files' digests it contains, but it must make sure that it does so in the
     # same order everytime.
     p = create_fake_fs_with_random_data(Path(str(tmpdir)))
     b = fs.Folder(p)
-    md51 = fs.File(p["dir1"]["file1.test"]).md5
-    md52 = fs.File(p["dir2"]["file2.test"]).md5
-    md53 = fs.File(p["dir3"]["file3.test"]).md5
-    md54 = fs.File(p["file1.test"]).md5
-    md55 = fs.File(p["file2.test"]).md5
-    md56 = fs.File(p["file3.test"]).md5
-    # The expected md5 is the md5 of md5s for folders and the direct md5 for files
-    folder_md51 = hashlib.md5(md51).digest()
-    folder_md52 = hashlib.md5(md52).digest()
-    folder_md53 = hashlib.md5(md53).digest()
-    md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
-    eq_(b.md5, md5.digest())
+    digest1 = fs.File(p["dir1"]["file1.test"]).digest
+    digest2 = fs.File(p["dir2"]["file2.test"]).digest
+    digest3 = fs.File(p["dir3"]["file3.test"]).digest
+    digest4 = fs.File(p["file1.test"]).digest
+    digest5 = fs.File(p["file2.test"]).digest
+    digest6 = fs.File(p["file3.test"]).digest
+    # The expected digest is the hash of digests for folders and the direct digest for files
+    folder_digest1 = xxhash.xxh128_digest(digest1)
+    folder_digest2 = xxhash.xxh128_digest(digest2)
+    folder_digest3 = xxhash.xxh128_digest(digest3)
+    digest = xxhash.xxh128_digest(folder_digest1 + folder_digest2 + folder_digest3 + digest4 + digest5 + digest6)
+    eq_(b.digest, digest)
 
 
-def test_partial_md5_aggregate_subfile_sorted(tmpdir):
+def test_partial_digest_aggregate_subfile_sorted(tmpdir):
     p = create_fake_fs_with_random_data(Path(str(tmpdir)))
     b = fs.Folder(p)
-    md51 = fs.File(p["dir1"]["file1.test"]).md5partial
-    md52 = fs.File(p["dir2"]["file2.test"]).md5partial
-    md53 = fs.File(p["dir3"]["file3.test"]).md5partial
-    md54 = fs.File(p["file1.test"]).md5partial
-    md55 = fs.File(p["file2.test"]).md5partial
-    md56 = fs.File(p["file3.test"]).md5partial
-    # The expected md5 is the md5 of md5s for folders and the direct md5 for files
-    folder_md51 = hashlib.md5(md51).digest()
-    folder_md52 = hashlib.md5(md52).digest()
-    folder_md53 = hashlib.md5(md53).digest()
-    md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
-    eq_(b.md5partial, md5.digest())
+    digest1 = fs.File(p["dir1"]["file1.test"]).digest_partial
+    digest2 = fs.File(p["dir2"]["file2.test"]).digest_partial
+    digest3 = fs.File(p["dir3"]["file3.test"]).digest_partial
+    digest4 = fs.File(p["file1.test"]).digest_partial
+    digest5 = fs.File(p["file2.test"]).digest_partial
+    digest6 = fs.File(p["file3.test"]).digest_partial
+    # The expected digest is the hash of digests for folders and the direct digest for files
+    folder_digest1 = xxhash.xxh128_digest(digest1)
+    folder_digest2 = xxhash.xxh128_digest(digest2)
+    folder_digest3 = xxhash.xxh128_digest(digest3)
+    digest = xxhash.xxh128_digest(folder_digest1 + folder_digest2 + folder_digest3 + digest4 + digest5 + digest6)
+    eq_(b.digest_partial, digest)
 
-    md51 = fs.File(p["dir1"]["file1.test"]).md5samples
-    md52 = fs.File(p["dir2"]["file2.test"]).md5samples
-    md53 = fs.File(p["dir3"]["file3.test"]).md5samples
-    md54 = fs.File(p["file1.test"]).md5samples
-    md55 = fs.File(p["file2.test"]).md5samples
-    md56 = fs.File(p["file3.test"]).md5samples
-    # The expected md5 is the md5 of md5s for folders and the direct md5 for files
-    folder_md51 = hashlib.md5(md51).digest()
-    folder_md52 = hashlib.md5(md52).digest()
-    folder_md53 = hashlib.md5(md53).digest()
-    md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
-    eq_(b.md5samples, md5.digest())
+    digest1 = fs.File(p["dir1"]["file1.test"]).digest_samples
+    digest2 = fs.File(p["dir2"]["file2.test"]).digest_samples
+    digest3 = fs.File(p["dir3"]["file3.test"]).digest_samples
+    digest4 = fs.File(p["file1.test"]).digest_samples
+    digest5 = fs.File(p["file2.test"]).digest_samples
+    digest6 = fs.File(p["file3.test"]).digest_samples
+    # The expected digest is the digest of digests for folders and the direct digest for files
+    folder_digest1 = xxhash.xxh128_digest(digest1)
+    folder_digest2 = xxhash.xxh128_digest(digest2)
+    folder_digest3 = xxhash.xxh128_digest(digest3)
+    digest = xxhash.xxh128_digest(folder_digest1 + folder_digest2 + folder_digest3 + digest4 + digest5 + digest6)
+    eq_(b.digest_samples, digest)
 
 
 def test_has_file_attrs(tmpdir):
diff --git a/core/tests/scanner_test.py b/core/tests/scanner_test.py
index c0aae5b4..909ee3d4 100644
--- a/core/tests/scanner_test.py
+++ b/core/tests/scanner_test.py
@@ -123,19 +123,19 @@ def test_content_scan(fake_fileexists):
     s = Scanner()
     s.scan_type = ScanType.CONTENTS
     f = [no("foo"), no("bar"), no("bleh")]
-    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
-    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
-    f[2].md5 = f[2].md5partial = f[1].md5samples = "bleh"
+    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
+    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
+    f[2].digest = f[2].digest_partial = f[1].digest_samples = "bleh"
     r = s.get_dupe_groups(f)
     eq_(len(r), 1)
     eq_(len(r[0]), 2)
-    eq_(s.discarded_file_count, 0)  # don't count the different md5 as discarded!
+    eq_(s.discarded_file_count, 0)  # don't count the different digest as discarded!
 
 
 def test_content_scan_compare_sizes_first(fake_fileexists):
     class MyFile(no):
         @property
-        def md5(self):
+        def digest(self):
             raise AssertionError()
 
     s = Scanner()
@@ -161,14 +161,14 @@ def test_ignore_file_size(fake_fileexists):
         no("largeignore1", large_size + 1),
         no("largeignore2", large_size + 1),
     ]
-    f[0].md5 = f[0].md5partial = f[0].md5samples = "smallignore"
-    f[1].md5 = f[1].md5partial = f[1].md5samples = "smallignore"
-    f[2].md5 = f[2].md5partial = f[2].md5samples = "small"
-    f[3].md5 = f[3].md5partial = f[3].md5samples = "small"
-    f[4].md5 = f[4].md5partial = f[4].md5samples = "large"
-    f[5].md5 = f[5].md5partial = f[5].md5samples = "large"
-    f[6].md5 = f[6].md5partial = f[6].md5samples = "largeignore"
-    f[7].md5 = f[7].md5partial = f[7].md5samples = "largeignore"
+    f[0].digest = f[0].digest_partial = f[0].digest_samples = "smallignore"
+    f[1].digest = f[1].digest_partial = f[1].digest_samples = "smallignore"
+    f[2].digest = f[2].digest_partial = f[2].digest_samples = "small"
+    f[3].digest = f[3].digest_partial = f[3].digest_samples = "small"
+    f[4].digest = f[4].digest_partial = f[4].digest_samples = "large"
+    f[5].digest = f[5].digest_partial = f[5].digest_samples = "large"
+    f[6].digest = f[6].digest_partial = f[6].digest_samples = "largeignore"
+    f[7].digest = f[7].digest_partial = f[7].digest_samples = "largeignore"
     r = s.get_dupe_groups(f)
 
     # No ignores
@@ -197,21 +197,21 @@ def test_big_file_partial_hashes(fake_fileexists):
 
     s.big_file_size_threshold = bigsize
     f = [no("bigfoo", bigsize), no("bigbar", bigsize), no("smallfoo", smallsize), no("smallbar", smallsize)]
-    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
-    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
-    f[2].md5 = f[2].md5partial = "bleh"
-    f[3].md5 = f[3].md5partial = "bleh"
+    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
+    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
+    f[2].digest = f[2].digest_partial = "bleh"
+    f[3].digest = f[3].digest_partial = "bleh"
     r = s.get_dupe_groups(f)
     eq_(len(r), 2)
 
-    # md5partial is still the same, but the file is actually different
-    f[1].md5 = f[1].md5samples = "difffoobar"
-    # here we compare the full md5s, as the user disabled the optimization
+    # digest_partial is still the same, but the file is actually different
+    f[1].digest = f[1].digest_samples = "difffoobar"
+    # here we compare the full digests, as the user disabled the optimization
     s.big_file_size_threshold = 0
     r = s.get_dupe_groups(f)
     eq_(len(r), 1)
 
-    # here we should compare the md5samples, and see they are different
+    # here we should compare the digest_samples, and see they are different
     s.big_file_size_threshold = bigsize
     r = s.get_dupe_groups(f)
     eq_(len(r), 1)
@@ -221,9 +221,9 @@ def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
     s = Scanner()
     s.scan_type = ScanType.CONTENTS
     f = [no("foo"), no("bar"), no("bleh")]
-    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
-    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
-    f[2].md5 = f[2].md5partial = f[2].md5samples = "bleh"
+    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
+    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
+    f[2].digest = f[2].digest_partial = f[2].digest_samples = "bleh"
     s.min_match_percentage = 101
     r = s.get_dupe_groups(f)
     eq_(len(r), 1)
@@ -234,12 +234,16 @@ def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
     eq_(len(r[0]), 2)
 
 
-def test_content_scan_doesnt_put_md5_in_words_at_the_end(fake_fileexists):
+def test_content_scan_doesnt_put_digest_in_words_at_the_end(fake_fileexists):
     s = Scanner()
     s.scan_type = ScanType.CONTENTS
     f = [no("foo"), no("bar")]
-    f[0].md5 = f[0].md5partial = f[0].md5samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
-    f[1].md5 = f[1].md5partial = f[1].md5samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    f[0].digest = f[0].digest_partial = f[
+        0
+    ].digest_samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    f[1].digest = f[1].digest_partial = f[
+        1
+    ].digest_samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
     r = s.get_dupe_groups(f)
     # FIXME looks like we are missing something here?
     r[0]
@@ -587,21 +591,21 @@ def test_folder_scan_exclude_subfolder_matches(fake_fileexists):
     s = Scanner()
     s.scan_type = ScanType.FOLDERS
     topf1 = no("top folder 1", size=42)
-    topf1.md5 = topf1.md5partial = topf1.md5samples = b"some_md5_1"
+    topf1.digest = topf1.digest_partial = topf1.digest_samples = b"some_digest__1"
     topf1.path = Path("/topf1")
     topf2 = no("top folder 2", size=42)
-    topf2.md5 = topf2.md5partial = topf2.md5samples = b"some_md5_1"
+    topf2.digest = topf2.digest_partial = topf2.digest_samples = b"some_digest__1"
     topf2.path = Path("/topf2")
     subf1 = no("sub folder 1", size=41)
-    subf1.md5 = subf1.md5partial = subf1.md5samples = b"some_md5_2"
+    subf1.digest = subf1.digest_partial = subf1.digest_samples = b"some_digest__2"
     subf1.path = Path("/topf1/sub")
     subf2 = no("sub folder 2", size=41)
-    subf2.md5 = subf2.md5partial = subf2.md5samples = b"some_md5_2"
+    subf2.digest = subf2.digest_partial = subf2.digest_samples = b"some_digest__2"
     subf2.path = Path("/topf2/sub")
     eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2])), 1)  # only top folders
     # however, if another folder matches a subfolder, keep in in the matches
     otherf = no("other folder", size=41)
-    otherf.md5 = otherf.md5partial = otherf.md5samples = b"some_md5_2"
+    otherf.digest = otherf.digest_partial = otherf.digest_samples = b"some_digest__2"
     otherf.path = Path("/otherfolder")
     eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2, otherf])), 2)
 
@@ -624,9 +628,9 @@ def test_dont_count_ref_files_as_discarded(fake_fileexists):
     o1 = no("foo", path="p1")
     o2 = no("foo", path="p2")
     o3 = no("foo", path="p3")
-    o1.md5 = o1.md5partial = o1.md5samples = "foobar"
-    o2.md5 = o2.md5partial = o2.md5samples = "foobar"
-    o3.md5 = o3.md5partial = o3.md5samples = "foobar"
+    o1.digest = o1.digest_partial = o1.digest_samples = "foobar"
+    o2.digest = o2.digest_partial = o2.digest_samples = "foobar"
+    o3.digest = o3.digest_partial = o3.digest_samples = "foobar"
     o1.is_ref = True
     o2.is_ref = True
     eq_(len(s.get_dupe_groups([o1, o2, o3])), 1)
diff --git a/requirements.txt b/requirements.txt
index cbaedf95..cf2faf97 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
+distro>=1.5.0
+mutagen>=1.44.0
+PyQt5 >=5.14.1,<6.0; sys_platform != 'linux'
+pywin32>=228; sys_platform == 'win32'
 Send2Trash>=1.3.0
 sphinx>=3.0.0
-polib>=1.1.0
-mutagen>=1.44.0
-distro>=1.5.0
-PyQt5 >=5.14.1,<6.0; sys_platform != 'linux'
-pywin32>=228; sys_platform == 'win32'
\ No newline at end of file
+xxhash>=3.0.0,<4.0.0
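Note on the hashing scheme: below is a minimal, standalone sketch of the big-file sampling strategy this patch keeps in core/fs.py (three 1 MiB chunks at 25%, 60%, and the tail of the file, fed to xxHash's 128-bit digest). The helper name sample_digest and the standalone structure are illustrative only, not part of the patch; it assumes the xxhash package pinned in requirements.txt is installed.

    import os
    import xxhash

    CHUNK_SIZE = 1024 * 1024  # 1 MiB, same chunk size as core/fs.py
    MIN_FILE_SIZE = 3 * CHUNK_SIZE  # below this, sampling cannot help (we take 3 samples)


    def sample_digest(path):
        """Digest three spread-out chunks of a file instead of its full contents."""
        size = os.path.getsize(path)
        file_hash = xxhash.xxh128()
        with open(path, "rb") as fp:
            if size <= MIN_FILE_SIZE:
                # Small file: just hash it entirely, like the MIN_FILE_SIZE branch in the patch.
                data = fp.read(CHUNK_SIZE)
                while data:
                    file_hash.update(data)
                    data = fp.read(CHUNK_SIZE)
            else:
                # Chunk at 25% of the file
                fp.seek(size * 25 // 100)
                file_hash.update(fp.read(CHUNK_SIZE))
                # Chunk at 60% of the file
                fp.seek(size * 60 // 100)
                file_hash.update(fp.read(CHUNK_SIZE))
                # Last chunk of the file
                fp.seek(-CHUNK_SIZE, os.SEEK_END)
                file_hash.update(fp.read(CHUNK_SIZE))
        return file_hash.digest()  # 16 bytes, same length as an MD5 digest, but cheaper to compute

As in getmatches_by_contents, a sampled digest is only consulted after the partial digests already match and the file exceeds the user's big-file threshold; xxHash is not a cryptographic hash, so this is a speed optimization for duplicate detection, not an integrity check.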