mirror of https://github.com/arsenetar/dupeguru.git synced 2026-03-13 12:01:38 +00:00

Compare commits

1 commit

c408873d20   Update changelog   (2022-03-25 23:37:46 -05:00)

10 changed files with 190 additions and 194 deletions

========================================================================

@@ -283,7 +283,7 @@ def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
     """Returns a list of :class:`Match` within ``files`` if their contents is the same.

     :param bigsize: The size in bytes over which we consider files big enough to
-        justify taking samples of the file for hashing. If 0, compute digest as usual.
+        justify taking samples of md5. If 0, compute md5 as usual.
     :param j: A :ref:`job progress instance <jobs>`.
     """
     size2files = defaultdict(set)
@@ -300,15 +300,15 @@ def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
             if first.is_ref and second.is_ref:
                 continue  # Don't spend time comparing two ref pics together.
             if first.size == 0 and second.size == 0:
-                # skip hashing for zero length files
+                # skip md5 for zero length files
                 result.append(Match(first, second, 100))
                 continue
-            if first.digest_partial == second.digest_partial:
+            if first.md5partial == second.md5partial:
                 if bigsize > 0 and first.size > bigsize:
-                    if first.digest_samples == second.digest_samples:
+                    if first.md5samples == second.md5samples:
                         result.append(Match(first, second, 100))
                 else:
-                    if first.digest == second.digest:
+                    if first.md5 == second.md5:
                         result.append(Match(first, second, 100))
         group_count += 1
         j.add_progress(desc=PROGRESS_MESSAGE % (len(result), group_count))
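Both sides of this hunk implement the same three-tier cascade; only the attribute names differ. For orientation, here is a minimal standalone sketch of that cascade over plain file paths, assuming the xxhash-era hashing and hypothetical helper names (hash_partial, hash_samples, hash_full) rather than dupeGuru's cached attributes:

import os
import xxhash

CHUNK_SIZE = 1024 * 1024  # 1 MiB, mirroring the module constant in this diff

def hash_partial(path):
    # Tier 1: hash 16 KiB starting at offset 16 KiB (cf. _calc_digest_partial).
    with open(path, "rb") as fp:
        fp.seek(0x4000)
        return xxhash.xxh128_digest(fp.read(0x4000))

def hash_full(path):
    # Tier 3: stream the whole file in chunks (cf. _calc_digest).
    file_hash = xxhash.xxh128()
    with open(path, "rb") as fp:
        while chunk := fp.read(CHUNK_SIZE):
            file_hash.update(chunk)
    return file_hash.digest()

hash_samples = hash_full  # placeholder; the sampled variant is sketched further down

def same_contents(a, b, bigsize=0):
    # Partial hashes are cheap, so a mismatch rules a pair out early.
    if hash_partial(a) != hash_partial(b):
        return False
    # Files above bigsize are compared by sampled hashes; bigsize=0 disables that.
    if bigsize > 0 and os.path.getsize(a) > bigsize:
        return hash_samples(a) == hash_samples(b)
    return hash_full(a) == hash_full(b)

The zero-length shortcut and the ref-vs-ref skip in the hunk above are scanner policy and are omitted from this sketch.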

========================================================================

@@ -11,13 +11,12 @@
 # resulting needless complexity and memory usage. It's been a while since I wanted to do that fork,
 # and I'm doing it now.

-import os
-import xxhash
+import hashlib
 from math import floor
 import logging
 import sqlite3
 from threading import Lock
-from typing import Any, AnyStr, Union
+from typing import Any

 from hscommon.path import Path
 from hscommon.util import nonone, get_file_ext
@@ -41,7 +40,7 @@ NOT_SET = object()
 # CPU.
 CHUNK_SIZE = 1024 * 1024  # 1 MiB

-# Minimum size below which partial hashing is not used
+# Minimum size below which partial hashes don't need to be computed
 MIN_FILE_SIZE = 3 * CHUNK_SIZE  # 3MiB, because we take 3 samples
@@ -84,11 +83,9 @@ class OperationError(FSError):
 class FilesDB:
-    schema_version = 1
-    schema_version_description = "Changed from md5 to xxhash"
-
-    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, digest BLOB, digest_partial BLOB, digest_samples BLOB)"
-    drop_table_query = "DROP TABLE IF EXISTS files;"
+    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
+    drop_table_query = "DROP TABLE files;"
     select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
     insert_query = """
         INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
@@ -100,37 +97,24 @@ class FilesDB:
         self.cur = None
         self.lock = None

-    def connect(self, path: Union[AnyStr, os.PathLike]) -> None:
+    def connect(self, path):
+        # type: (str, ) -> None
         self.conn = sqlite3.connect(path, check_same_thread=False)
         self.cur = self.conn.cursor()
+        self.cur.execute(self.create_table_query)
         self.lock = Lock()
-        self._check_upgrade()
-
-    def _check_upgrade(self) -> None:
-        with self.lock:
-            has_schema = self.cur.execute(
-                "SELECT NAME FROM sqlite_master WHERE type='table' AND name='schema_version'"
-            ).fetchall()
-            version = None
-            if has_schema:
-                version = self.cur.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()[0]
-            else:
-                self.cur.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)")
-            if version != self.schema_version:
-                self.cur.execute(self.drop_table_query)
-                self.cur.execute(
-                    "INSERT OR REPLACE INTO schema_version VALUES (:version, :description)",
-                    {"version": self.schema_version, "description": self.schema_version_description},
-                )
-                self.cur.execute(self.create_table_query)
-                self.conn.commit()

-    def clear(self) -> None:
+    def clear(self):
+        # type: () -> None
         with self.lock:
             self.cur.execute(self.drop_table_query)
             self.cur.execute(self.create_table_query)

-    def get(self, path: Path, key: str) -> Union[bytes, None]:
+    def get(self, path, key):
+        # type: (Path, str) -> bytes
         stat = path.stat()
         size = stat.st_size
         mtime_ns = stat.st_mtime_ns
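The removed _check_upgrade amounts to a drop-and-rebuild migration: the files table is only a hash cache, so a schema-version mismatch simply invalidates it rather than migrating rows. The same pattern as a standalone sketch over a bare sqlite3 connection (a sketch, not dupeGuru's API; the lock is omitted):

import sqlite3

SCHEMA_VERSION = 1
SCHEMA_DESCRIPTION = "Changed from md5 to xxhash"

def check_upgrade(conn):
    cur = conn.cursor()
    # A dedicated table records which schema version produced the cache.
    has_schema = cur.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='schema_version'"
    ).fetchall()
    version = None
    if has_schema:
        version = cur.execute(
            "SELECT version FROM schema_version ORDER BY version DESC"
        ).fetchone()[0]
    else:
        cur.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)")
    if version != SCHEMA_VERSION:
        # Version mismatch (or fresh database): throw the cache away and rebuild.
        cur.execute("DROP TABLE IF EXISTS files;")
        cur.execute(
            "INSERT OR REPLACE INTO schema_version VALUES (:version, :description)",
            {"version": SCHEMA_VERSION, "description": SCHEMA_DESCRIPTION},
        )
        cur.execute(
            "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, "
            "mtime_ns INTEGER, entry_dt DATETIME, digest BLOB, digest_partial BLOB, "
            "digest_samples BLOB)"
        )
        conn.commit()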
@@ -144,7 +128,9 @@ class FilesDB:
             return None

-    def put(self, path: Path, key: str, value: Any) -> None:
+    def put(self, path, key, value):
+        # type: (Path, str, Any) -> None
         stat = path.stat()
         size = stat.st_size
         mtime_ns = stat.st_mtime_ns
@@ -155,11 +141,15 @@ class FilesDB:
             {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value},
         )

-    def commit(self) -> None:
+    def commit(self):
+        # type: () -> None
         with self.lock:
             self.conn.commit()

-    def close(self) -> None:
+    def close(self):
+        # type: () -> None
         with self.lock:
             self.cur.close()
             self.conn.close()
@@ -171,7 +161,7 @@ filesdb = FilesDB()  # Singleton

 class File:
     """Represents a file and holds metadata to be used for scanning."""

-    INITIAL_INFO = {"size": 0, "mtime": 0, "digest": b"", "digest_partial": b"", "digest_samples": b""}
+    INITIAL_INFO = {"size": 0, "mtime": 0, "md5": b"", "md5partial": b"", "md5samples": b""}
     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
     # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
     # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
@@ -197,51 +187,32 @@ class File:
             result = self.INITIAL_INFO[attrname]
         return result

-    def _calc_digest(self):
+    def _calc_md5(self):
         # type: () -> bytes
         with self.path.open("rb") as fp:
-            file_hash = xxhash.xxh128()
+            md5 = hashlib.md5()
             # The goal here is to not run out of memory on really big files. However, the chunk
             # size has to be large enough so that the python loop isn't too costly in terms of
             # CPU.
             CHUNK_SIZE = 1024 * 1024  # 1 mb
             filedata = fp.read(CHUNK_SIZE)
             while filedata:
-                file_hash.update(filedata)
+                md5.update(filedata)
                 filedata = fp.read(CHUNK_SIZE)
-            return file_hash.digest()
+            return md5.digest()

-    def _calc_digest_partial(self):
+    def _calc_md5partial(self):
         # type: () -> bytes
-        # This offset is where we should start reading the file to get a partial hash
+        # This offset is where we should start reading the file to get a partial md5
         # For audio file, it should be where audio data starts
         offset, size = (0x4000, 0x4000)
         with self.path.open("rb") as fp:
             fp.seek(offset)
-            partial_data = fp.read(size)
-            return xxhash.xxh128_digest(partial_data)
-
-    def _calc_digest_samples(self) -> bytes:
-        size = self.size
-        with self.path.open("rb") as fp:
-            # Chunk at 25% of the file
-            fp.seek(floor(size * 25 / 100), 0)
-            file_data = fp.read(CHUNK_SIZE)
-            file_hash = xxhash.xxh128(file_data)
-            # Chunk at 60% of the file
-            fp.seek(floor(size * 60 / 100), 0)
-            file_data = fp.read(CHUNK_SIZE)
-            file_hash.update(file_data)
-            # Last chunk of the file
-            fp.seek(-CHUNK_SIZE, 2)
-            file_data = fp.read(CHUNK_SIZE)
-            file_hash.update(file_data)
-            return file_hash.digest()
+            partialdata = fp.read(size)
+            return hashlib.md5(partialdata).digest()

     def _read_info(self, field):
         # print(f"_read_info({field}) for {self}")
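The deleted _calc_digest_samples is the heart of the big-file path: three 1 MiB probes instead of a full read. Extracted as a standalone sketch (a plain path string instead of self, same probe positions):

import os
from math import floor
import xxhash

CHUNK_SIZE = 1024 * 1024        # 1 MiB, as in the module constant
MIN_FILE_SIZE = 3 * CHUNK_SIZE  # below this, callers hash the file fully instead

def hash_samples(path):
    # xxh128 over three samples: at 25%, at 60%, and the last CHUNK_SIZE bytes.
    # Assumes size > MIN_FILE_SIZE, matching the guard in _read_info below.
    size = os.path.getsize(path)
    with open(path, "rb") as fp:
        fp.seek(floor(size * 25 / 100))
        file_hash = xxhash.xxh128(fp.read(CHUNK_SIZE))  # chunk at 25%
        fp.seek(floor(size * 60 / 100))
        file_hash.update(fp.read(CHUNK_SIZE))           # chunk at 60%
        fp.seek(-CHUNK_SIZE, os.SEEK_END)               # last chunk
        file_hash.update(fp.read(CHUNK_SIZE))
    return file_hash.digest()

Below MIN_FILE_SIZE the three probes would overlap and a full hash is no more expensive, which is why small files fall back to the full digest.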
@@ -249,35 +220,48 @@ class File:
             stats = self.path.stat()
             self.size = nonone(stats.st_size, 0)
             self.mtime = nonone(stats.st_mtime, 0)
-        elif field == "digest_partial":
+        elif field == "md5partial":
             try:
-                self.digest_partial = filesdb.get(self.path, "digest_partial")
-                if self.digest_partial is None:
-                    self.digest_partial = self._calc_digest_partial()
-                    filesdb.put(self.path, "digest_partial", self.digest_partial)
+                self.md5partial = filesdb.get(self.path, "md5partial")
+                if self.md5partial is None:
+                    self.md5partial = self._calc_md5partial()
+                    filesdb.put(self.path, "md5partial", self.md5partial)
             except Exception as e:
-                logging.warning("Couldn't get digest_partial for %s: %s", self.path, e)
-        elif field == "digest":
+                logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
+        elif field == "md5":
             try:
-                self.digest = filesdb.get(self.path, "digest")
-                if self.digest is None:
-                    self.digest = self._calc_digest()
-                    filesdb.put(self.path, "digest", self.digest)
+                self.md5 = filesdb.get(self.path, "md5")
+                if self.md5 is None:
+                    self.md5 = self._calc_md5()
+                    filesdb.put(self.path, "md5", self.md5)
             except Exception as e:
-                logging.warning("Couldn't get digest for %s: %s", self.path, e)
-        elif field == "digest_samples":
-            size = self.size
-            # Might as well hash such small files entirely.
-            if size <= MIN_FILE_SIZE:
-                setattr(self, field, self.digest)
-                return
+                logging.warning("Couldn't get md5 for %s: %s", self.path, e)
+        elif field == "md5samples":
             try:
-                self.digest_samples = filesdb.get(self.path, "digest_samples")
-                if self.digest_samples is None:
-                    self.digest_samples = self._calc_digest_samples()
-                    filesdb.put(self.path, "digest_samples", self.digest_samples)
+                with self.path.open("rb") as fp:
+                    size = self.size
+                    # Might as well hash such small files entirely.
+                    if size <= MIN_FILE_SIZE:
+                        setattr(self, field, self.md5)
+                        return
+                    # Chunk at 25% of the file
+                    fp.seek(floor(size * 25 / 100), 0)
+                    filedata = fp.read(CHUNK_SIZE)
+                    md5 = hashlib.md5(filedata)
+                    # Chunk at 60% of the file
+                    fp.seek(floor(size * 60 / 100), 0)
+                    filedata = fp.read(CHUNK_SIZE)
+                    md5.update(filedata)
+                    # Last chunk of the file
+                    fp.seek(-CHUNK_SIZE, 2)
+                    filedata = fp.read(CHUNK_SIZE)
+                    md5.update(filedata)
+                    setattr(self, field, md5.digest())
             except Exception as e:
-                logging.warning(f"Couldn't get digest_samples for {self.path}: {e}")
+                logging.error(f"Error computing md5samples: {e}")

     def _read_all_info(self, attrnames=None):
         """Cache all possible info.
@@ -330,7 +314,7 @@ class File:

 class Folder(File):
     """A wrapper around a folder path.

-    It has the size/digest info of a File, but its value is the sum of its subitems.
+    It has the size/md5 info of a File, but its value is the sum of its subitems.
     """

     __slots__ = File.__slots__ + ("_subfolders",)
@@ -351,18 +335,19 @@ class Folder(File):
             self.size = size
             stats = self.path.stat()
             self.mtime = nonone(stats.st_mtime, 0)
-        elif field in {"digest", "digest_partial", "digest_samples"}:
+        elif field in {"md5", "md5partial", "md5samples"}:
             # What's sensitive here is that we must make sure that subfiles'
-            # digest are always added up in the same order, but we also want a
-            # different digest if a file gets moved in a different subdirectory.
+            # md5 are always added up in the same order, but we also want a
+            # different md5 if a file gets moved in a different subdirectory.

-            def get_dir_digest_concat():
+            def get_dir_md5_concat():
                 items = self._all_items()
                 items.sort(key=lambda f: f.path)
-                digests = [getattr(f, field) for f in items]
-                return b"".join(digests)
+                md5s = [getattr(f, field) for f in items]
+                return b"".join(md5s)

-            digest = xxhash.xxh128_digest(get_dir_digest_concat())
+            md5 = hashlib.md5(get_dir_md5_concat())
+            digest = md5.digest()
             setattr(self, field, digest)

     @property
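Both sides compute a folder's hash the same way: collect the folder's items, sort them by path so the aggregate does not depend on directory-listing order, concatenate each item's own hash (a subfolder contributes its aggregate, recursively), and hash the concatenation, so a file moving between subdirectories still changes the result. The fs test hunk further down asserts exactly this identity. As a sketch in the xxhash-era flavor:

import xxhash

def folder_digest(items):
    # items: (path, digest) pairs for the folder's contents, where a
    # subfolder's digest is itself a folder_digest (recursion).
    items = sorted(items, key=lambda item: item[0])
    return xxhash.xxh128_digest(b"".join(digest for _, digest in items))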

========================================================================

@@ -97,6 +97,11 @@ class MusicFile(fs.File):
             "dupe_count": format_dupe_count(dupe_count),
         }

+    def _get_md5partial_offset_and_size(self):
+        # No longer calculating the offset and audio size, just whole file
+        size = self.path.stat().st_size
+        return (0, size)
+
     def _read_info(self, field):
         fs.File._read_info(self, field)
         if field in TAG_FIELDS:

========================================================================

@@ -238,7 +238,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljob):
     for ref_id, other_id, percentage in myiter:
         ref = id2picture[ref_id]
         other = id2picture[other_id]
-        if percentage == 100 and ref.digest != other.digest:
+        if percentage == 100 and ref.md5 != other.md5:
             percentage = 99
         if percentage >= threshold:
             ref.dimensions  # pre-read dimensions for display in results
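The one functional line here keeps exact picture matches honest: the visual comparison can score two images 100% even when the files differ (re-encodes, metadata edits), so a perfect score is demoted to 99 unless the file hashes also agree. In isolation:

def adjusted_score(percentage, ref, other):
    # Only byte-identical pictures may report a perfect 100; a visual-only
    # match is capped at 99 (attribute name per the md5 side of this hunk).
    if percentage == 100 and ref.md5 != other.md5:
        return 99
    return percentage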

========================================================================

@@ -86,9 +86,9 @@ class NamedObject:
             folder = "basepath"
         self._folder = Path(folder)
         self.size = size
-        self.digest_partial = name
-        self.digest = name
-        self.digest_samples = name
+        self.md5partial = name
+        self.md5 = name
+        self.md5samples = name
         if with_words:
             self.words = getwords(name)
         self.is_ref = False

========================================================================

@@ -530,7 +530,7 @@ class TestCaseGetMatches:

 class TestCaseGetMatchesByContents:
-    def test_big_file_partial_hashing(self):
+    def test_big_file_partial_hashes(self):
         smallsize = 1
         bigsize = 100 * 1024 * 1024  # 100MB
         f = [
@@ -539,17 +539,17 @@ class TestCaseGetMatchesByContents:
             no("smallfoo", size=smallsize),
             no("smallbar", size=smallsize),
         ]
-        f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
-        f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
-        f[2].digest = f[2].digest_partial = "bleh"
-        f[3].digest = f[3].digest_partial = "bleh"
+        f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+        f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+        f[2].md5 = f[2].md5partial = "bleh"
+        f[3].md5 = f[3].md5partial = "bleh"
         r = getmatches_by_contents(f, bigsize=bigsize)
         eq_(len(r), 2)
-        # User disabled optimization for big files, compute digests as usual
+        # User disabled optimization for big files, compute hashes as usual
         r = getmatches_by_contents(f, bigsize=0)
         eq_(len(r), 2)
-        # Other file is now slightly different, digest_partial is still the same
-        f[1].digest = f[1].digest_samples = "foobardiff"
+        # Other file is now slightly different, md5partial is still the same
+        f[1].md5 = f[1].md5samples = "foobardiff"
         r = getmatches_by_contents(f, bigsize=bigsize)
         # Successfully filter it out
         eq_(len(r), 1)

========================================================================

@@ -6,7 +6,7 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-import xxhash
+import hashlib
 from os import urandom

 from hscommon.path import Path
@@ -52,54 +52,54 @@ def test_size_aggregates_subfiles(tmpdir):
     eq_(b.size, 12)

-def test_digest_aggregate_subfiles_sorted(tmpdir):
-    # dir.allfiles can return child in any order. Thus, bundle.digest must aggregate
-    # all files' digests it contains, but it must make sure that it does so in the
+def test_md5_aggregate_subfiles_sorted(tmpdir):
+    # dir.allfiles can return child in any order. Thus, bundle.md5 must aggregate
+    # all files' md5 it contains, but it must make sure that it does so in the
     # same order everytime.
     p = create_fake_fs_with_random_data(Path(str(tmpdir)))
     b = fs.Folder(p)
-    digest1 = fs.File(p["dir1"]["file1.test"]).digest
-    digest2 = fs.File(p["dir2"]["file2.test"]).digest
-    digest3 = fs.File(p["dir3"]["file3.test"]).digest
-    digest4 = fs.File(p["file1.test"]).digest
-    digest5 = fs.File(p["file2.test"]).digest
-    digest6 = fs.File(p["file3.test"]).digest
-    # The expected digest is the hash of digests for folders and the direct digest for files
-    folder_digest1 = xxhash.xxh128_digest(digest1)
-    folder_digest2 = xxhash.xxh128_digest(digest2)
-    folder_digest3 = xxhash.xxh128_digest(digest3)
-    digest = xxhash.xxh128_digest(folder_digest1 + folder_digest2 + folder_digest3 + digest4 + digest5 + digest6)
-    eq_(b.digest, digest)
+    md51 = fs.File(p["dir1"]["file1.test"]).md5
+    md52 = fs.File(p["dir2"]["file2.test"]).md5
+    md53 = fs.File(p["dir3"]["file3.test"]).md5
+    md54 = fs.File(p["file1.test"]).md5
+    md55 = fs.File(p["file2.test"]).md5
+    md56 = fs.File(p["file3.test"]).md5
+    # The expected md5 is the md5 of md5s for folders and the direct md5 for files
+    folder_md51 = hashlib.md5(md51).digest()
+    folder_md52 = hashlib.md5(md52).digest()
+    folder_md53 = hashlib.md5(md53).digest()
+    md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
+    eq_(b.md5, md5.digest())

-def test_partial_digest_aggregate_subfile_sorted(tmpdir):
+def test_partial_md5_aggregate_subfile_sorted(tmpdir):
     p = create_fake_fs_with_random_data(Path(str(tmpdir)))
     b = fs.Folder(p)
-    digest1 = fs.File(p["dir1"]["file1.test"]).digest_partial
-    digest2 = fs.File(p["dir2"]["file2.test"]).digest_partial
-    digest3 = fs.File(p["dir3"]["file3.test"]).digest_partial
-    digest4 = fs.File(p["file1.test"]).digest_partial
-    digest5 = fs.File(p["file2.test"]).digest_partial
-    digest6 = fs.File(p["file3.test"]).digest_partial
-    # The expected digest is the hash of digests for folders and the direct digest for files
-    folder_digest1 = xxhash.xxh128_digest(digest1)
-    folder_digest2 = xxhash.xxh128_digest(digest2)
-    folder_digest3 = xxhash.xxh128_digest(digest3)
-    digest = xxhash.xxh128_digest(folder_digest1 + folder_digest2 + folder_digest3 + digest4 + digest5 + digest6)
-    eq_(b.digest_partial, digest)
+    md51 = fs.File(p["dir1"]["file1.test"]).md5partial
+    md52 = fs.File(p["dir2"]["file2.test"]).md5partial
+    md53 = fs.File(p["dir3"]["file3.test"]).md5partial
+    md54 = fs.File(p["file1.test"]).md5partial
+    md55 = fs.File(p["file2.test"]).md5partial
+    md56 = fs.File(p["file3.test"]).md5partial
+    # The expected md5 is the md5 of md5s for folders and the direct md5 for files
+    folder_md51 = hashlib.md5(md51).digest()
+    folder_md52 = hashlib.md5(md52).digest()
+    folder_md53 = hashlib.md5(md53).digest()
+    md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
+    eq_(b.md5partial, md5.digest())

-    digest1 = fs.File(p["dir1"]["file1.test"]).digest_samples
-    digest2 = fs.File(p["dir2"]["file2.test"]).digest_samples
-    digest3 = fs.File(p["dir3"]["file3.test"]).digest_samples
-    digest4 = fs.File(p["file1.test"]).digest_samples
-    digest5 = fs.File(p["file2.test"]).digest_samples
-    digest6 = fs.File(p["file3.test"]).digest_samples
-    # The expected digest is the digest of digests for folders and the direct digest for files
-    folder_digest1 = xxhash.xxh128_digest(digest1)
-    folder_digest2 = xxhash.xxh128_digest(digest2)
-    folder_digest3 = xxhash.xxh128_digest(digest3)
-    digest = xxhash.xxh128_digest(folder_digest1 + folder_digest2 + folder_digest3 + digest4 + digest5 + digest6)
-    eq_(b.digest_samples, digest)
+    md51 = fs.File(p["dir1"]["file1.test"]).md5samples
+    md52 = fs.File(p["dir2"]["file2.test"]).md5samples
+    md53 = fs.File(p["dir3"]["file3.test"]).md5samples
+    md54 = fs.File(p["file1.test"]).md5samples
+    md55 = fs.File(p["file2.test"]).md5samples
+    md56 = fs.File(p["file3.test"]).md5samples
+    # The expected md5 is the md5 of md5s for folders and the direct md5 for files
+    folder_md51 = hashlib.md5(md51).digest()
+    folder_md52 = hashlib.md5(md52).digest()
+    folder_md53 = hashlib.md5(md53).digest()
+    md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
+    eq_(b.md5samples, md5.digest())

 def test_has_file_attrs(tmpdir):

========================================================================

@@ -123,19 +123,19 @@ def test_content_scan(fake_fileexists):
     s = Scanner()
     s.scan_type = ScanType.CONTENTS
     f = [no("foo"), no("bar"), no("bleh")]
-    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
-    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
-    f[2].digest = f[2].digest_partial = f[1].digest_samples = "bleh"
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+    f[2].md5 = f[2].md5partial = f[1].md5samples = "bleh"
     r = s.get_dupe_groups(f)
     eq_(len(r), 1)
     eq_(len(r[0]), 2)
-    eq_(s.discarded_file_count, 0)  # don't count the different digest as discarded!
+    eq_(s.discarded_file_count, 0)  # don't count the different md5 as discarded!


 def test_content_scan_compare_sizes_first(fake_fileexists):
     class MyFile(no):
         @property
-        def digest(self):
+        def md5(self):
             raise AssertionError()

     s = Scanner()
@@ -161,14 +161,14 @@ def test_ignore_file_size(fake_fileexists):
         no("largeignore1", large_size + 1),
         no("largeignore2", large_size + 1),
     ]
-    f[0].digest = f[0].digest_partial = f[0].digest_samples = "smallignore"
-    f[1].digest = f[1].digest_partial = f[1].digest_samples = "smallignore"
-    f[2].digest = f[2].digest_partial = f[2].digest_samples = "small"
-    f[3].digest = f[3].digest_partial = f[3].digest_samples = "small"
-    f[4].digest = f[4].digest_partial = f[4].digest_samples = "large"
-    f[5].digest = f[5].digest_partial = f[5].digest_samples = "large"
-    f[6].digest = f[6].digest_partial = f[6].digest_samples = "largeignore"
-    f[7].digest = f[7].digest_partial = f[7].digest_samples = "largeignore"
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "smallignore"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "smallignore"
+    f[2].md5 = f[2].md5partial = f[2].md5samples = "small"
+    f[3].md5 = f[3].md5partial = f[3].md5samples = "small"
+    f[4].md5 = f[4].md5partial = f[4].md5samples = "large"
+    f[5].md5 = f[5].md5partial = f[5].md5samples = "large"
+    f[6].md5 = f[6].md5partial = f[6].md5samples = "largeignore"
+    f[7].md5 = f[7].md5partial = f[7].md5samples = "largeignore"

     r = s.get_dupe_groups(f)
     # No ignores
@@ -197,21 +197,21 @@ def test_big_file_partial_hashes(fake_fileexists):
     s.big_file_size_threshold = bigsize

     f = [no("bigfoo", bigsize), no("bigbar", bigsize), no("smallfoo", smallsize), no("smallbar", smallsize)]
-    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
-    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
-    f[2].digest = f[2].digest_partial = "bleh"
-    f[3].digest = f[3].digest_partial = "bleh"
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+    f[2].md5 = f[2].md5partial = "bleh"
+    f[3].md5 = f[3].md5partial = "bleh"
     r = s.get_dupe_groups(f)
     eq_(len(r), 2)

-    # digest_partial is still the same, but the file is actually different
-    f[1].digest = f[1].digest_samples = "difffoobar"
-    # here we compare the full digests, as the user disabled the optimization
+    # md5partial is still the same, but the file is actually different
+    f[1].md5 = f[1].md5samples = "difffoobar"
+    # here we compare the full md5s, as the user disabled the optimization
     s.big_file_size_threshold = 0
     r = s.get_dupe_groups(f)
     eq_(len(r), 1)

-    # here we should compare the digest_samples, and see they are different
+    # here we should compare the md5samples, and see they are different
     s.big_file_size_threshold = bigsize
     r = s.get_dupe_groups(f)
     eq_(len(r), 1)
@@ -221,9 +221,9 @@ def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
     s = Scanner()
     s.scan_type = ScanType.CONTENTS
     f = [no("foo"), no("bar"), no("bleh")]
-    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
-    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
-    f[2].digest = f[2].digest_partial = f[2].digest_samples = "bleh"
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
+    f[2].md5 = f[2].md5partial = f[2].md5samples = "bleh"
     s.min_match_percentage = 101
     r = s.get_dupe_groups(f)
     eq_(len(r), 1)
@@ -234,16 +234,12 @@ def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
     eq_(len(r[0]), 2)


-def test_content_scan_doesnt_put_digest_in_words_at_the_end(fake_fileexists):
+def test_content_scan_doesnt_put_md5_in_words_at_the_end(fake_fileexists):
     s = Scanner()
     s.scan_type = ScanType.CONTENTS
     f = [no("foo"), no("bar")]
-    f[0].digest = f[0].digest_partial = f[
-        0
-    ].digest_samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
-    f[1].digest = f[1].digest_partial = f[
-        1
-    ].digest_samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    f[0].md5 = f[0].md5partial = f[0].md5samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    f[1].md5 = f[1].md5partial = f[1].md5samples = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
     r = s.get_dupe_groups(f)
     # FIXME looks like we are missing something here?
     r[0]
@@ -591,21 +587,21 @@ def test_folder_scan_exclude_subfolder_matches(fake_fileexists):
     s = Scanner()
     s.scan_type = ScanType.FOLDERS

     topf1 = no("top folder 1", size=42)
-    topf1.digest = topf1.digest_partial = topf1.digest_samples = b"some_digest__1"
+    topf1.md5 = topf1.md5partial = topf1.md5samples = b"some_md5_1"
     topf1.path = Path("/topf1")
     topf2 = no("top folder 2", size=42)
-    topf2.digest = topf2.digest_partial = topf2.digest_samples = b"some_digest__1"
+    topf2.md5 = topf2.md5partial = topf2.md5samples = b"some_md5_1"
     topf2.path = Path("/topf2")
     subf1 = no("sub folder 1", size=41)
-    subf1.digest = subf1.digest_partial = subf1.digest_samples = b"some_digest__2"
+    subf1.md5 = subf1.md5partial = subf1.md5samples = b"some_md5_2"
     subf1.path = Path("/topf1/sub")
     subf2 = no("sub folder 2", size=41)
-    subf2.digest = subf2.digest_partial = subf2.digest_samples = b"some_digest__2"
+    subf2.md5 = subf2.md5partial = subf2.md5samples = b"some_md5_2"
     subf2.path = Path("/topf2/sub")
     eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2])), 1)  # only top folders
     # however, if another folder matches a subfolder, keep in in the matches
     otherf = no("other folder", size=41)
-    otherf.digest = otherf.digest_partial = otherf.digest_samples = b"some_digest__2"
+    otherf.md5 = otherf.md5partial = otherf.md5samples = b"some_md5_2"
     otherf.path = Path("/otherfolder")
     eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2, otherf])), 2)
@@ -628,9 +624,9 @@ def test_dont_count_ref_files_as_discarded(fake_fileexists):
     o1 = no("foo", path="p1")
     o2 = no("foo", path="p2")
     o3 = no("foo", path="p3")
-    o1.digest = o1.digest_partial = o1.digest_samples = "foobar"
-    o2.digest = o2.digest_partial = o2.digest_samples = "foobar"
-    o3.digest = o3.digest_partial = o3.digest_samples = "foobar"
+    o1.md5 = o1.md5partial = o1.md5samples = "foobar"
+    o2.md5 = o2.md5partial = o2.md5samples = "foobar"
+    o3.md5 = o3.md5partial = o3.md5samples = "foobar"
     o1.is_ref = True
     o2.is_ref = True
     eq_(len(s.get_dupe_groups([o1, o2, o3])), 1)

========================================================================

@@ -1,3 +1,13 @@
+=== 4.2.1 (2022-03-25)
+
+* Default to English on unsupported system language (#976)
+* Fix image viewer zoom datatype issue (#978)
+* Fix errors from window change event (#937, #980)
+* Fix deprecation warning from SQLite
+* Enforce minimum Windows version in installer (#983)
+* Fix help path for local files
+* Drop python 3.6 support
+* VS Code project settings added, yaml validation for GitHub actions
 === 4.2.0 (2021-01-24)

 * Add Malay and Turkish

========================================================================

@@ -1,7 +1,7 @@
-distro>=1.5.0
-mutagen>=1.44.0
-PyQt5 >=5.14.1,<6.0; sys_platform != 'linux'
-pywin32>=228; sys_platform == 'win32'
 Send2Trash>=1.3.0
 sphinx>=3.0.0
-xxhash>=3.0.0,<4.0.0
+polib>=1.1.0
+mutagen>=1.44.0
+distro>=1.5.0
+PyQt5 >=5.14.1,<6.0; sys_platform != 'linux'
+pywin32>=228; sys_platform == 'win32'
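The wide xxhash pin on the removed side is defensible: the only API surface the fs module uses, the streaming hasher and the one-shot digest, should behave identically across the pinned 3.x range. A quick sanity check:

import xxhash

h = xxhash.xxh128()  # streaming form, as used by _calc_digest
h.update(b"hello ")
h.update(b"world")
# one-shot form, as used by _calc_digest_partial
assert h.digest() == xxhash.xxh128_digest(b"hello world")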