From 51b18d4c84d70d256ec535e71853ff9180904f53 Mon Sep 17 00:00:00 2001 From: Andrew Senetar Date: Sat, 19 Mar 2022 15:25:46 -0500 Subject: [PATCH] Switch file hashing to xxhash instead of md5 - Improves performance significantly in some cases - Add xxhash to requirements.txt and sort requirements - Rename md5 based members to digest - Update all tests to use new member names and hashing methods - Update hash db code to upgrade schema NOTE: May consider supporting multiple hashing algorithms in the future. --- core/engine.py | 10 +-- core/fs.py | 169 ++++++++++++++++++++----------------- core/me/fs.py | 5 -- core/pe/matchblock.py | 2 +- core/tests/base.py | 6 +- core/tests/engine_test.py | 16 ++-- core/tests/fs_test.py | 82 +++++++++--------- core/tests/scanner_test.py | 74 ++++++++-------- requirements.txt | 10 +-- 9 files changed, 194 insertions(+), 180 deletions(-) diff --git a/core/engine.py b/core/engine.py index 4c2eadf1..958c1f7b 100644 --- a/core/engine.py +++ b/core/engine.py @@ -283,7 +283,7 @@ def getmatches_by_contents(files, bigsize=0, j=job.nulljob): """Returns a list of :class:`Match` within ``files`` if their contents is the same. :param bigsize: The size in bytes over which we consider files big enough to - justify taking samples of md5. If 0, compute md5 as usual. + justify taking samples of the file for hashing. If 0, compute digest as usual. :param j: A :ref:`job progress instance `. """ size2files = defaultdict(set) @@ -300,15 +300,15 @@ def getmatches_by_contents(files, bigsize=0, j=job.nulljob): if first.is_ref and second.is_ref: continue # Don't spend time comparing two ref pics together. if first.size == 0 and second.size == 0: - # skip md5 for zero length files + # skip hashing for zero length files result.append(Match(first, second, 100)) continue - if first.md5partial == second.md5partial: + if first.digest_partial == second.digest_partial: if bigsize > 0 and first.size > bigsize: - if first.md5samples == second.md5samples: + if first.digest_samples == second.digest_samples: result.append(Match(first, second, 100)) else: - if first.md5 == second.md5: + if first.digest == second.digest: result.append(Match(first, second, 100)) group_count += 1 j.add_progress(desc=PROGRESS_MESSAGE % (len(result), group_count)) diff --git a/core/fs.py b/core/fs.py index 9a078818..a4cee502 100644 --- a/core/fs.py +++ b/core/fs.py @@ -11,12 +11,13 @@ # resulting needless complexity and memory usage. It's been a while since I wanted to do that fork, # and I'm doing it now. -import hashlib +import os +import xxhash from math import floor import logging import sqlite3 from threading import Lock -from typing import Any +from typing import Any, AnyStr, Union from hscommon.path import Path from hscommon.util import nonone, get_file_ext @@ -40,7 +41,7 @@ NOT_SET = object() # CPU. 
CHUNK_SIZE = 1024 * 1024 # 1 MiB -# Minimum size below which partial hashes don't need to be computed +# Minimum size below which partial hashing is not used MIN_FILE_SIZE = 3 * CHUNK_SIZE # 3MiB, because we take 3 samples @@ -83,9 +84,11 @@ class OperationError(FSError): class FilesDB: + schema_version = 1 + schema_version_description = "Changed from md5 to xxhash" - create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)" - drop_table_query = "DROP TABLE files;" + create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, digest BLOB, digest_partial BLOB, digest_samples BLOB)" + drop_table_query = "DROP TABLE IF EXISTS files;" select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns" insert_query = """ INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value) @@ -97,24 +100,37 @@ class FilesDB: self.cur = None self.lock = None - def connect(self, path): - # type: (str, ) -> None - + def connect(self, path: Union[AnyStr, os.PathLike]) -> None: self.conn = sqlite3.connect(path, check_same_thread=False) self.cur = self.conn.cursor() - self.cur.execute(self.create_table_query) self.lock = Lock() + self._check_upgrade() - def clear(self): - # type: () -> None + def _check_upgrade(self) -> None: + with self.lock: + has_schema = self.cur.execute( + "SELECT NAME FROM sqlite_master WHERE type='table' AND name='schema_version'" + ).fetchall() + version = None + if has_schema: + version = self.cur.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()[0] + else: + self.cur.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)") + if version != self.schema_version: + self.cur.execute(self.drop_table_query) + self.cur.execute( + "INSERT OR REPLACE INTO schema_version VALUES (:version, :description)", + {"version": self.schema_version, "description": self.schema_version_description}, + ) + self.cur.execute(self.create_table_query) + self.conn.commit() + def clear(self) -> None: with self.lock: self.cur.execute(self.drop_table_query) self.cur.execute(self.create_table_query) - def get(self, path, key): - # type: (Path, str) -> bytes - + def get(self, path: Path, key: str) -> Union[bytes, None]: stat = path.stat() size = stat.st_size mtime_ns = stat.st_mtime_ns @@ -128,9 +144,7 @@ class FilesDB: return None - def put(self, path, key, value): - # type: (Path, str, Any) -> None - + def put(self, path: Path, key: str, value: Any) -> None: stat = path.stat() size = stat.st_size mtime_ns = stat.st_mtime_ns @@ -141,15 +155,11 @@ class FilesDB: {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value}, ) - def commit(self): - # type: () -> None - + def commit(self) -> None: with self.lock: self.conn.commit() - def close(self): - # type: () -> None - + def close(self) -> None: with self.lock: self.cur.close() self.conn.close() @@ -161,7 +171,7 @@ filesdb = FilesDB() # Singleton class File: """Represents a file and holds metadata to be used for scanning.""" - INITIAL_INFO = {"size": 0, "mtime": 0, "md5": b"", "md5partial": b"", "md5samples": b""} + INITIAL_INFO = {"size": 0, "mtime": 0, "digest": b"", "digest_partial": b"", "digest_samples": b""} # Slots for File make us save quite a bit of memory. 
In a memory test I've made with a lot of # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become # even greater when we take into account read attributes (70%!). Yeah, it's worth it. @@ -187,32 +197,51 @@ class File: result = self.INITIAL_INFO[attrname] return result - def _calc_md5(self): + def _calc_digest(self): # type: () -> bytes with self.path.open("rb") as fp: - md5 = hashlib.md5() + file_hash = xxhash.xxh128() # The goal here is to not run out of memory on really big files. However, the chunk # size has to be large enough so that the python loop isn't too costly in terms of # CPU. CHUNK_SIZE = 1024 * 1024 # 1 mb filedata = fp.read(CHUNK_SIZE) while filedata: - md5.update(filedata) + file_hash.update(filedata) filedata = fp.read(CHUNK_SIZE) - return md5.digest() + return file_hash.digest() - def _calc_md5partial(self): + def _calc_digest_partial(self): # type: () -> bytes - # This offset is where we should start reading the file to get a partial md5 + # This offset is where we should start reading the file to get a partial hash # For audio file, it should be where audio data starts offset, size = (0x4000, 0x4000) with self.path.open("rb") as fp: fp.seek(offset) - partialdata = fp.read(size) - return hashlib.md5(partialdata).digest() + partial_data = fp.read(size) + return xxhash.xxh128_digest(partial_data) + + def _calc_digest_samples(self) -> bytes: + size = self.size + with self.path.open("rb") as fp: + # Chunk at 25% of the file + fp.seek(floor(size * 25 / 100), 0) + file_data = fp.read(CHUNK_SIZE) + file_hash = xxhash.xxh128(file_data) + + # Chunk at 60% of the file + fp.seek(floor(size * 60 / 100), 0) + file_data = fp.read(CHUNK_SIZE) + file_hash.update(file_data) + + # Last chunk of the file + fp.seek(-CHUNK_SIZE, 2) + file_data = fp.read(CHUNK_SIZE) + file_hash.update(file_data) + return file_hash.digest() def _read_info(self, field): # print(f"_read_info({field}) for {self}") @@ -220,48 +249,35 @@ class File: stats = self.path.stat() self.size = nonone(stats.st_size, 0) self.mtime = nonone(stats.st_mtime, 0) - elif field == "md5partial": + elif field == "digest_partial": try: - self.md5partial = filesdb.get(self.path, "md5partial") - if self.md5partial is None: - self.md5partial = self._calc_md5partial() - filesdb.put(self.path, "md5partial", self.md5partial) + self.digest_partial = filesdb.get(self.path, "digest_partial") + if self.digest_partial is None: + self.digest_partial = self._calc_digest_partial() + filesdb.put(self.path, "digest_partial", self.digest_partial) except Exception as e: - logging.warning("Couldn't get md5partial for %s: %s", self.path, e) - elif field == "md5": + logging.warning("Couldn't get digest_partial for %s: %s", self.path, e) + elif field == "digest": try: - self.md5 = filesdb.get(self.path, "md5") - if self.md5 is None: - self.md5 = self._calc_md5() - filesdb.put(self.path, "md5", self.md5) + self.digest = filesdb.get(self.path, "digest") + if self.digest is None: + self.digest = self._calc_digest() + filesdb.put(self.path, "digest", self.digest) except Exception as e: - logging.warning("Couldn't get md5 for %s: %s", self.path, e) - elif field == "md5samples": + logging.warning("Couldn't get digest for %s: %s", self.path, e) + elif field == "digest_samples": + size = self.size + # Might as well hash such small files entirely. + if size <= MIN_FILE_SIZE: + setattr(self, field, self.digest) + return try: - with self.path.open("rb") as fp: - size = self.size - # Might as well hash such small files entirely. 
- if size <= MIN_FILE_SIZE: - setattr(self, field, self.md5) - return - - # Chunk at 25% of the file - fp.seek(floor(size * 25 / 100), 0) - filedata = fp.read(CHUNK_SIZE) - md5 = hashlib.md5(filedata) - - # Chunk at 60% of the file - fp.seek(floor(size * 60 / 100), 0) - filedata = fp.read(CHUNK_SIZE) - md5.update(filedata) - - # Last chunk of the file - fp.seek(-CHUNK_SIZE, 2) - filedata = fp.read(CHUNK_SIZE) - md5.update(filedata) - setattr(self, field, md5.digest()) + self.digest_samples = filesdb.get(self.path, "digest_samples") + if self.digest_samples is None: + self.digest_samples = self._calc_digest_samples() + filesdb.put(self.path, "digest_samples", self.digest_samples) except Exception as e: - logging.error(f"Error computing md5samples: {e}") + logging.warning(f"Couldn't get digest_samples for {self.path}: {e}") def _read_all_info(self, attrnames=None): """Cache all possible info. @@ -314,7 +330,7 @@ class File: class Folder(File): """A wrapper around a folder path. - It has the size/md5 info of a File, but its value is the sum of its subitems. + It has the size/digest info of a File, but its value is the sum of its subitems. """ __slots__ = File.__slots__ + ("_subfolders",) @@ -335,19 +351,18 @@ class Folder(File): self.size = size stats = self.path.stat() self.mtime = nonone(stats.st_mtime, 0) - elif field in {"md5", "md5partial", "md5samples"}: + elif field in {"digest", "digest_partial", "digest_samples"}: # What's sensitive here is that we must make sure that subfiles' - # md5 are always added up in the same order, but we also want a - # different md5 if a file gets moved in a different subdirectory. + # digest are always added up in the same order, but we also want a + # different digest if a file gets moved in a different subdirectory. 
- def get_dir_md5_concat(): + def get_dir_digest_concat(): items = self._all_items() items.sort(key=lambda f: f.path) - md5s = [getattr(f, field) for f in items] - return b"".join(md5s) + digests = [getattr(f, field) for f in items] + return b"".join(digests) - md5 = hashlib.md5(get_dir_md5_concat()) - digest = md5.digest() + digest = xxhash.xxh128_digest(get_dir_digest_concat()) setattr(self, field, digest) @property diff --git a/core/me/fs.py b/core/me/fs.py index 6bf7401b..fbfdfa09 100644 --- a/core/me/fs.py +++ b/core/me/fs.py @@ -97,11 +97,6 @@ class MusicFile(fs.File): "dupe_count": format_dupe_count(dupe_count), } - def _get_md5partial_offset_and_size(self): - # No longer calculating the offset and audio size, just whole file - size = self.path.stat().st_size - return (0, size) - def _read_info(self, field): fs.File._read_info(self, field) if field in TAG_FIELDS: diff --git a/core/pe/matchblock.py b/core/pe/matchblock.py index 0b26a233..063ec725 100644 --- a/core/pe/matchblock.py +++ b/core/pe/matchblock.py @@ -238,7 +238,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo for ref_id, other_id, percentage in myiter: ref = id2picture[ref_id] other = id2picture[other_id] - if percentage == 100 and ref.md5 != other.md5: + if percentage == 100 and ref.digest != other.digest: percentage = 99 if percentage >= threshold: ref.dimensions # pre-read dimensions for display in results diff --git a/core/tests/base.py b/core/tests/base.py index 36aa3cd0..9f6e68d4 100644 --- a/core/tests/base.py +++ b/core/tests/base.py @@ -86,9 +86,9 @@ class NamedObject: folder = "basepath" self._folder = Path(folder) self.size = size - self.md5partial = name - self.md5 = name - self.md5samples = name + self.digest_partial = name + self.digest = name + self.digest_samples = name if with_words: self.words = getwords(name) self.is_ref = False diff --git a/core/tests/engine_test.py b/core/tests/engine_test.py index cc1ea532..b35076f3 100644 --- a/core/tests/engine_test.py +++ b/core/tests/engine_test.py @@ -530,7 +530,7 @@ class TestCaseGetMatches: class TestCaseGetMatchesByContents: - def test_big_file_partial_hashes(self): + def test_big_file_partial_hashing(self): smallsize = 1 bigsize = 100 * 1024 * 1024 # 100MB f = [ @@ -539,17 +539,17 @@ class TestCaseGetMatchesByContents: no("smallfoo", size=smallsize), no("smallbar", size=smallsize), ] - f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar" - f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar" - f[2].md5 = f[2].md5partial = "bleh" - f[3].md5 = f[3].md5partial = "bleh" + f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar" + f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar" + f[2].digest = f[2].digest_partial = "bleh" + f[3].digest = f[3].digest_partial = "bleh" r = getmatches_by_contents(f, bigsize=bigsize) eq_(len(r), 2) - # User disabled optimization for big files, compute hashes as usual + # User disabled optimization for big files, compute digests as usual r = getmatches_by_contents(f, bigsize=0) eq_(len(r), 2) - # Other file is now slightly different, md5partial is still the same - f[1].md5 = f[1].md5samples = "foobardiff" + # Other file is now slightly different, digest_partial is still the same + f[1].digest = f[1].digest_samples = "foobardiff" r = getmatches_by_contents(f, bigsize=bigsize) # Successfully filter it out eq_(len(r), 1) diff --git a/core/tests/fs_test.py b/core/tests/fs_test.py index 97793ee7..d5b7610b 100644 --- a/core/tests/fs_test.py +++ b/core/tests/fs_test.py @@ 
-6,7 +6,7 @@ # which should be included with this package. The terms are also available at # http://www.gnu.org/licenses/gpl-3.0.html -import hashlib +import xxhash from os import urandom from hscommon.path import Path @@ -52,54 +52,54 @@ def test_size_aggregates_subfiles(tmpdir): eq_(b.size, 12) -def test_md5_aggregate_subfiles_sorted(tmpdir): - # dir.allfiles can return child in any order. Thus, bundle.md5 must aggregate - # all files' md5 it contains, but it must make sure that it does so in the +def test_digest_aggregate_subfiles_sorted(tmpdir): + # dir.allfiles can return child in any order. Thus, bundle.digest must aggregate + # all files' digests it contains, but it must make sure that it does so in the # same order everytime. p = create_fake_fs_with_random_data(Path(str(tmpdir))) b = fs.Folder(p) - md51 = fs.File(p["dir1"]["file1.test"]).md5 - md52 = fs.File(p["dir2"]["file2.test"]).md5 - md53 = fs.File(p["dir3"]["file3.test"]).md5 - md54 = fs.File(p["file1.test"]).md5 - md55 = fs.File(p["file2.test"]).md5 - md56 = fs.File(p["file3.test"]).md5 - # The expected md5 is the md5 of md5s for folders and the direct md5 for files - folder_md51 = hashlib.md5(md51).digest() - folder_md52 = hashlib.md5(md52).digest() - folder_md53 = hashlib.md5(md53).digest() - md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56) - eq_(b.md5, md5.digest()) + digest1 = fs.File(p["dir1"]["file1.test"]).digest + digest2 = fs.File(p["dir2"]["file2.test"]).digest + digest3 = fs.File(p["dir3"]["file3.test"]).digest + digest4 = fs.File(p["file1.test"]).digest + digest5 = fs.File(p["file2.test"]).digest + digest6 = fs.File(p["file3.test"]).digest + # The expected digest is the hash of digests for folders and the direct digest for files + folder_digest1 = xxhash.xxh128_digest(digest1) + folder_digest2 = xxhash.xxh128_digest(digest2) + folder_digest3 = xxhash.xxh128_digest(digest3) + digest = xxhash.xxh128_digest(folder_digest1 + folder_digest2 + folder_digest3 + digest4 + digest5 + digest6) + eq_(b.digest, digest) -def test_partial_md5_aggregate_subfile_sorted(tmpdir): +def test_partial_digest_aggregate_subfile_sorted(tmpdir): p = create_fake_fs_with_random_data(Path(str(tmpdir))) b = fs.Folder(p) - md51 = fs.File(p["dir1"]["file1.test"]).md5partial - md52 = fs.File(p["dir2"]["file2.test"]).md5partial - md53 = fs.File(p["dir3"]["file3.test"]).md5partial - md54 = fs.File(p["file1.test"]).md5partial - md55 = fs.File(p["file2.test"]).md5partial - md56 = fs.File(p["file3.test"]).md5partial - # The expected md5 is the md5 of md5s for folders and the direct md5 for files - folder_md51 = hashlib.md5(md51).digest() - folder_md52 = hashlib.md5(md52).digest() - folder_md53 = hashlib.md5(md53).digest() - md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56) - eq_(b.md5partial, md5.digest()) + digest1 = fs.File(p["dir1"]["file1.test"]).digest_partial + digest2 = fs.File(p["dir2"]["file2.test"]).digest_partial + digest3 = fs.File(p["dir3"]["file3.test"]).digest_partial + digest4 = fs.File(p["file1.test"]).digest_partial + digest5 = fs.File(p["file2.test"]).digest_partial + digest6 = fs.File(p["file3.test"]).digest_partial + # The expected digest is the hash of digests for folders and the direct digest for files + folder_digest1 = xxhash.xxh128_digest(digest1) + folder_digest2 = xxhash.xxh128_digest(digest2) + folder_digest3 = xxhash.xxh128_digest(digest3) + digest = xxhash.xxh128_digest(folder_digest1 + folder_digest2 + folder_digest3 + digest4 + digest5 + digest6) + 
eq_(b.digest_partial, digest) - md51 = fs.File(p["dir1"]["file1.test"]).md5samples - md52 = fs.File(p["dir2"]["file2.test"]).md5samples - md53 = fs.File(p["dir3"]["file3.test"]).md5samples - md54 = fs.File(p["file1.test"]).md5samples - md55 = fs.File(p["file2.test"]).md5samples - md56 = fs.File(p["file3.test"]).md5samples - # The expected md5 is the md5 of md5s for folders and the direct md5 for files - folder_md51 = hashlib.md5(md51).digest() - folder_md52 = hashlib.md5(md52).digest() - folder_md53 = hashlib.md5(md53).digest() - md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56) - eq_(b.md5samples, md5.digest()) + digest1 = fs.File(p["dir1"]["file1.test"]).digest_samples + digest2 = fs.File(p["dir2"]["file2.test"]).digest_samples + digest3 = fs.File(p["dir3"]["file3.test"]).digest_samples + digest4 = fs.File(p["file1.test"]).digest_samples + digest5 = fs.File(p["file2.test"]).digest_samples + digest6 = fs.File(p["file3.test"]).digest_samples + # The expected digest is the digest of digests for folders and the direct digest for files + folder_digest1 = xxhash.xxh128_digest(digest1) + folder_digest2 = xxhash.xxh128_digest(digest2) + folder_digest3 = xxhash.xxh128_digest(digest3) + digest = xxhash.xxh128_digest(folder_digest1 + folder_digest2 + folder_digest3 + digest4 + digest5 + digest6) + eq_(b.digest_samples, digest) def test_has_file_attrs(tmpdir): diff --git a/core/tests/scanner_test.py b/core/tests/scanner_test.py index c0aae5b4..909ee3d4 100644 --- a/core/tests/scanner_test.py +++ b/core/tests/scanner_test.py @@ -123,19 +123,19 @@ def test_content_scan(fake_fileexists): s = Scanner() s.scan_type = ScanType.CONTENTS f = [no("foo"), no("bar"), no("bleh")] - f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar" - f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar" - f[2].md5 = f[2].md5partial = f[1].md5samples = "bleh" + f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar" + f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar" + f[2].digest = f[2].digest_partial = f[1].digest_samples = "bleh" r = s.get_dupe_groups(f) eq_(len(r), 1) eq_(len(r[0]), 2) - eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded! + eq_(s.discarded_file_count, 0) # don't count the different digest as discarded! 
def test_content_scan_compare_sizes_first(fake_fileexists): class MyFile(no): @property - def md5(self): + def digest(self): raise AssertionError() s = Scanner() @@ -161,14 +161,14 @@ def test_ignore_file_size(fake_fileexists): no("largeignore1", large_size + 1), no("largeignore2", large_size + 1), ] - f[0].md5 = f[0].md5partial = f[0].md5samples = "smallignore" - f[1].md5 = f[1].md5partial = f[1].md5samples = "smallignore" - f[2].md5 = f[2].md5partial = f[2].md5samples = "small" - f[3].md5 = f[3].md5partial = f[3].md5samples = "small" - f[4].md5 = f[4].md5partial = f[4].md5samples = "large" - f[5].md5 = f[5].md5partial = f[5].md5samples = "large" - f[6].md5 = f[6].md5partial = f[6].md5samples = "largeignore" - f[7].md5 = f[7].md5partial = f[7].md5samples = "largeignore" + f[0].digest = f[0].digest_partial = f[0].digest_samples = "smallignore" + f[1].digest = f[1].digest_partial = f[1].digest_samples = "smallignore" + f[2].digest = f[2].digest_partial = f[2].digest_samples = "small" + f[3].digest = f[3].digest_partial = f[3].digest_samples = "small" + f[4].digest = f[4].digest_partial = f[4].digest_samples = "large" + f[5].digest = f[5].digest_partial = f[5].digest_samples = "large" + f[6].digest = f[6].digest_partial = f[6].digest_samples = "largeignore" + f[7].digest = f[7].digest_partial = f[7].digest_samples = "largeignore" r = s.get_dupe_groups(f) # No ignores @@ -197,21 +197,21 @@ def test_big_file_partial_hashes(fake_fileexists): s.big_file_size_threshold = bigsize f = [no("bigfoo", bigsize), no("bigbar", bigsize), no("smallfoo", smallsize), no("smallbar", smallsize)] - f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar" - f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar" - f[2].md5 = f[2].md5partial = "bleh" - f[3].md5 = f[3].md5partial = "bleh" + f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar" + f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar" + f[2].digest = f[2].digest_partial = "bleh" + f[3].digest = f[3].digest_partial = "bleh" r = s.get_dupe_groups(f) eq_(len(r), 2) - # md5partial is still the same, but the file is actually different - f[1].md5 = f[1].md5samples = "difffoobar" - # here we compare the full md5s, as the user disabled the optimization + # digest_partial is still the same, but the file is actually different + f[1].digest = f[1].digest_samples = "difffoobar" + # here we compare the full digests, as the user disabled the optimization s.big_file_size_threshold = 0 r = s.get_dupe_groups(f) eq_(len(r), 1) - # here we should compare the md5samples, and see they are different + # here we should compare the digest_samples, and see they are different s.big_file_size_threshold = bigsize r = s.get_dupe_groups(f) eq_(len(r), 1) @@ -221,9 +221,9 @@ def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists): s = Scanner() s.scan_type = ScanType.CONTENTS f = [no("foo"), no("bar"), no("bleh")] - f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar" - f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar" - f[2].md5 = f[2].md5partial = f[2].md5samples = "bleh" + f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar" + f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar" + f[2].digest = f[2].digest_partial = f[2].digest_samples = "bleh" s.min_match_percentage = 101 r = s.get_dupe_groups(f) eq_(len(r), 1) @@ -234,12 +234,16 @@ def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists): eq_(len(r[0]), 2) -def test_content_scan_doesnt_put_md5_in_words_at_the_end(fake_fileexists): 
+def test_content_scan_doesnt_put_digest_in_words_at_the_end(fake_fileexists): s = Scanner() s.scan_type = ScanType.CONTENTS f = [no("foo"), no("bar")] - f[0].md5 = f[0].md5partial = f[0].md5samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" - f[1].md5 = f[1].md5partial = f[1].md5samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + f[0].digest = f[0].digest_partial = f[ + 0 + ].digest_samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + f[1].digest = f[1].digest_partial = f[ + 1 + ].digest_samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" r = s.get_dupe_groups(f) # FIXME looks like we are missing something here? r[0] @@ -587,21 +591,21 @@ def test_folder_scan_exclude_subfolder_matches(fake_fileexists): s = Scanner() s.scan_type = ScanType.FOLDERS topf1 = no("top folder 1", size=42) - topf1.md5 = topf1.md5partial = topf1.md5samples = b"some_md5_1" + topf1.digest = topf1.digest_partial = topf1.digest_samples = b"some_digest__1" topf1.path = Path("/topf1") topf2 = no("top folder 2", size=42) - topf2.md5 = topf2.md5partial = topf2.md5samples = b"some_md5_1" + topf2.digest = topf2.digest_partial = topf2.digest_samples = b"some_digest__1" topf2.path = Path("/topf2") subf1 = no("sub folder 1", size=41) - subf1.md5 = subf1.md5partial = subf1.md5samples = b"some_md5_2" + subf1.digest = subf1.digest_partial = subf1.digest_samples = b"some_digest__2" subf1.path = Path("/topf1/sub") subf2 = no("sub folder 2", size=41) - subf2.md5 = subf2.md5partial = subf2.md5samples = b"some_md5_2" + subf2.digest = subf2.digest_partial = subf2.digest_samples = b"some_digest__2" subf2.path = Path("/topf2/sub") eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2])), 1) # only top folders # however, if another folder matches a subfolder, keep in in the matches otherf = no("other folder", size=41) - otherf.md5 = otherf.md5partial = otherf.md5samples = b"some_md5_2" + otherf.digest = otherf.digest_partial = otherf.digest_samples = b"some_digest__2" otherf.path = Path("/otherfolder") eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2, otherf])), 2) @@ -624,9 +628,9 @@ def test_dont_count_ref_files_as_discarded(fake_fileexists): o1 = no("foo", path="p1") o2 = no("foo", path="p2") o3 = no("foo", path="p3") - o1.md5 = o1.md5partial = o1.md5samples = "foobar" - o2.md5 = o2.md5partial = o2.md5samples = "foobar" - o3.md5 = o3.md5partial = o3.md5samples = "foobar" + o1.digest = o1.digest_partial = o1.digest_samples = "foobar" + o2.digest = o2.digest_partial = o2.digest_samples = "foobar" + o3.digest = o3.digest_partial = o3.digest_samples = "foobar" o1.is_ref = True o2.is_ref = True eq_(len(s.get_dupe_groups([o1, o2, o3])), 1) diff --git a/requirements.txt b/requirements.txt index cbaedf95..cf2faf97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ +distro>=1.5.0 +mutagen>=1.44.0 +PyQt5 >=5.14.1,<6.0; sys_platform != 'linux' +pywin32>=228; sys_platform == 'win32' Send2Trash>=1.3.0 sphinx>=3.0.0 -polib>=1.1.0 -mutagen>=1.44.0 -distro>=1.5.0 -PyQt5 >=5.14.1,<6.0; sys_platform != 'linux' -pywin32>=228; sys_platform == 'win32' \ No newline at end of file +xxhash>=3.0.0,<4.0.0
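
Note (not part of the patch): the chunked-digest scheme core/fs.py now uses can be summarized outside the diff as follows. This is a minimal standalone sketch assuming only the xxhash package added to requirements.txt; the helper names (digest, digest_partial, digest_samples) mirror the new File members but are otherwise illustrative.

import os
from math import floor

import xxhash

CHUNK_SIZE = 1024 * 1024  # 1 MiB, same chunk size trade-off as in core/fs.py
MIN_FILE_SIZE = 3 * CHUNK_SIZE  # below this, sampling adds nothing


def digest(path):
    """Full-file xxh128 digest, read in chunks to bound memory usage."""
    file_hash = xxhash.xxh128()
    with open(path, "rb") as fp:
        while chunk := fp.read(CHUNK_SIZE):
            file_hash.update(chunk)
    return file_hash.digest()


def digest_partial(path, offset=0x4000, size=0x4000):
    """Digest of a small window near the start of the file."""
    with open(path, "rb") as fp:
        fp.seek(offset)
        return xxhash.xxh128_digest(fp.read(size))


def digest_samples(path):
    """Digest of three 1 MiB samples (25%, 60%, last chunk) of a big file."""
    size = os.path.getsize(path)
    if size <= MIN_FILE_SIZE:
        return digest(path)  # small files are hashed entirely
    file_hash = xxhash.xxh128()
    with open(path, "rb") as fp:
        for pos in (floor(size * 25 / 100), floor(size * 60 / 100)):
            fp.seek(pos)
            file_hash.update(fp.read(CHUNK_SIZE))
        fp.seek(-CHUNK_SIZE, os.SEEK_END)
        file_hash.update(fp.read(CHUNK_SIZE))
    return file_hash.digest()

As in the engine.py hunk above, getmatches_by_contents() only falls back to digest_samples when bigsize > 0 and the file is larger than bigsize; otherwise a matching digest_partial leads straight to a full digest comparison, as before.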
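
Similarly, the cache-invalidation step added by FilesDB._check_upgrade can be exercised on its own. The sketch below follows the same table layout and version bookkeeping as the patch (a schema_version table, drop-and-recreate on mismatch); the free function and connection handling are illustrative, not the code under review.

import sqlite3

SCHEMA_VERSION = 1
SCHEMA_DESCRIPTION = "Changed from md5 to xxhash"

CREATE_FILES_TABLE = (
    "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, "
    "mtime_ns INTEGER, entry_dt DATETIME, digest BLOB, digest_partial BLOB, "
    "digest_samples BLOB)"
)


def check_upgrade(conn):
    """Drop cached digests whenever the stored schema version is stale or missing."""
    cur = conn.cursor()
    has_schema = cur.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='schema_version'"
    ).fetchall()
    version = None
    if has_schema:
        row = cur.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()
        version = row[0] if row else None
    else:
        cur.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)")
    if version != SCHEMA_VERSION:
        cur.execute("DROP TABLE IF EXISTS files")
        cur.execute(
            "INSERT OR REPLACE INTO schema_version VALUES (:version, :description)",
            {"version": SCHEMA_VERSION, "description": SCHEMA_DESCRIPTION},
        )
    cur.execute(CREATE_FILES_TABLE)
    conn.commit()

An in-memory run (sqlite3.connect(":memory:")) is enough to see the effect: an md5-era cache has no schema_version table, so the files table is dropped and recreated with the digest columns.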