fix: Minor cleanups and fixes

- Update NullJob to subclass Job
- Remove unnecessary size pre-read in _getMatches() as file sizes are
  already loaded during the file scan via a stat call
- Skip the ref check for contents scans, as the scan itself already
  prevents ref-to-ref matches; some of the other scan types do things
  differently and need to be reviewed before this post step can be
  removed completely
- Add a guard on partial hashing to just hash the whole file if it is
  smaller than the offset plus read size, and use that value for both
  the partial digest and the digest
Andrew Senetar 6 months ago
parent 322d29a996
commit 99ec4e0f27
Signed by: arsenetar
GPG Key ID: C63300DCE48AB2F1

@@ -54,6 +54,9 @@ CHUNK_SIZE = 1024 * 1024  # 1 MiB
 # Minimum size below which partial hashing is not used
 MIN_FILE_SIZE = 3 * CHUNK_SIZE  # 3MiB, because we take 3 samples
+# Partial hashing offset and size
+PARTIAL_OFFSET_SIZE = (0x4000, 0x4000)
+
 
 class FSError(Exception):
     cls_message = "An error has occured on '{name}' in '{parent}'"
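Note (not part of the diff): the tuple is (offset, read length), so the smallest file a genuine partial hash can cover is their sum — a quick check in Python:

    offset, size = (0x4000, 0x4000)  # PARTIAL_OFFSET_SIZE
    # 0x4000 bytes = 16 KiB; reading 16 KiB at a 16 KiB offset needs 32 KiB
    assert offset + size == 32 * 1024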
@@ -243,14 +246,9 @@ class File:
     def _calc_digest_partial(self):
         # type: () -> bytes
-        # This offset is where we should start reading the file to get a partial hash
-        # For audio file, it should be where audio data starts
-        offset, size = (0x4000, 0x4000)
         with"rb") as fp:
-            fp.seek(offset)
-            partial_data =
+  [0])
+            partial_data =[1])
         return hasher(partial_data).digest()
 
     def _calc_digest_samples(self) -> bytes:
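For reference, the rewritten read path is equivalent to this standalone sketch, with hashlib.md5 standing in for the module's hasher and the path argument hypothetical:

    import hashlib

    PARTIAL_OFFSET_SIZE = (0x4000, 0x4000)

    def calc_partial_digest(path):
        # Seek to the fixed offset and hash a fixed-size window of the file.
        with open(path, "rb") as fp:
            data =[1])
        return hashlib.md5(data).digest()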
@@ -281,7 +279,11 @@ class File:
         elif field == "digest_partial":
             self.digest_partial = filesdb.get(self.path, "digest_partial")
             if self.digest_partial is None:
-                self.digest_partial = self._calc_digest_partial()
+                # If file is smaller than partial requirements just use the full digest
+                if self.size < PARTIAL_OFFSET_SIZE[0] + PARTIAL_OFFSET_SIZE[1]:
+                    self.digest_partial = self.digest
+                else:
+                    self.digest_partial = self._calc_digest_partial()
                 filesdb.put(self.path, "digest_partial", self.digest_partial)
         elif field == "digest":
             self.digest = filesdb.get(self.path, "digest")
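Illustration of when the guard fires (hypothetical sizes): anything below offset + size, 32 KiB here, reuses the full digest rather than seeking past its own end.

    PARTIAL_OFFSET_SIZE = (0x4000, 0x4000)

    for size in (20 * 1024, 64 * 1024):
        if size < PARTIAL_OFFSET_SIZE[0] + PARTIAL_OFFSET_SIZE[1]:
            print(size, "bytes -> reuse full digest")       # 20 KiB file
        else:
            print(size, "bytes -> compute partial digest")  # 64 KiB file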
@@ -292,7 +294,7 @@ class File:
             size = self.size
             # Might as well hash such small files entirely.
             if size <= MIN_FILE_SIZE:
-                setattr(self, field, self.digest)
+                self.digest_samples = self.digest
                 return
             self.digest_samples = filesdb.get(self.path, "digest_samples")
             if self.digest_samples is None:
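The samples branch follows the same shape (illustrative numbers): with MIN_FILE_SIZE at 3 MiB, a 1 MiB file is hashed whole and the three-sample read never happens.

    CHUNK_SIZE = 1024 * 1024        # 1 MiB
    MIN_FILE_SIZE = 3 * CHUNK_SIZE  # three 1 MiB samples need at least 3 MiB

    size = 1 * CHUNK_SIZE  # hypothetical 1 MiB file
    use_full_digest = size <= MIN_FILE_SIZE  # True -> digest_samples = digest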

@@ -87,8 +87,6 @@ class Scanner:
             j = j.start_subjob([2, 8])
-            for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
-                f.size  # pre-read, makes a smoother progress if read here (especially for bundles)
         if self.size_threshold:
             files = [f for f in files if f.size >= self.size_threshold]
         if self.large_size_threshold:
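Per the commit message the pre-read loop is redundant because sizes are captured during the file scan's stat call; a sketch of that pattern (not dupeGuru's actual class):

    import os

    class FileStub:
        def __init__(self, path):
            self.path = path
            # Size cached from stat() at scan time, so no later pre-read pass.
            self.size = os.stat(path).st_size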
@@ -173,7 +171,9 @@ class Scanner:
             matches = [m for m in matches if get_file_ext( == get_file_ext(]
         if self.include_exists_check:
             matches = [m for m in matches if m.first.exists() and m.second.exists()]
-        matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
+        # Contents already handles ref checks, other scan types might not catch during scan
+        if self.scan_type != ScanType.CONTENTS:
+            matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
         if ignore_list:
             matches = [m for m in matches if not ignore_list.are_ignored(str(m.first.path), str(m.second.path))]
"Grouping matches")
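Sketch of the filter's behavior with hypothetical stand-in types: a match is dropped only when both sides are reference files, and after this change only for non-contents scans.

    from collections import namedtuple

    FileStub = namedtuple("FileStub", "name is_ref")
    Match = namedtuple("Match", "first second")

    matches = [
        Match(FileStub("a", True), FileStub("b", True)),   # both refs: dropped
        Match(FileStub("c", True), FileStub("d", False)),  # kept
    ]
    matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]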

@@ -7,7 +7,7 @@
-from typing import Any, Callable, Generator, Iterator, List, Union
+from typing import Any, Callable, Generator, List, Union
 
 
 class JobCancelled(Exception):
@@ -148,7 +148,7 @@ class Job:
-class NullJob:
+class NullJob(Job):
     def __init__(self, *args, **kwargs) -> None:
         # Null job does nothing
@@ -161,9 +161,6 @@ class NullJob:
         # Null job does nothing
 
-    def iter_with_progress(self, sequence, *args, **kwargs) -> Iterator:
-        return iter(sequence)
-
     def start_job(self, *args, **kwargs) -> None:
         # Null job does nothing
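With NullJob subclassing Job, the bespoke iter_with_progress override becomes unnecessary; a minimal sketch of the idea (simplified signatures, not the real hscommon API):

    class Job:
        def iter_with_progress(self, sequence, *args, **kwargs):
            # Real implementation would report progress while iterating.
            yield from sequence

    class NullJob(Job):
        # Inherits iter_with_progress; all progress reporting is a no-op here.
        def __init__(self, *args, **kwargs):
            pass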