From 99ec4e0f27cf189618cbdb838af60db5080be117 Mon Sep 17 00:00:00 2001 From: Andrew Senetar Date: Thu, 8 Jun 2023 01:14:52 -0500 Subject: [PATCH] fix: Minor cleanups and fixes - Update NullJob to subclass Job - Remove unnecessary size pre-read in _getMatches() as file sizes are already loaded during file scan via stat call - Skip the ref check for contents scans, as that scan already prevents matches between two ref files; some of the other scans do things differently and need to be reviewed before removing this post step completely - Add guard on partial hashing to just hash the whole file if it is smaller than the offset plus size, and use that value for both the partial digest and digest --- core/fs.py | 20 +++++++++++--------- core/scanner.py | 6 +++--- hscommon/jobprogress/job.py | 7 ++----- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/core/fs.py b/core/fs.py index 11a29a20..e927fd0b 100644 --- a/core/fs.py +++ b/core/fs.py @@ -54,6 +54,9 @@ CHUNK_SIZE = 1024 * 1024 # 1 MiB # Minimum size below which partial hashing is not used MIN_FILE_SIZE = 3 * CHUNK_SIZE # 3MiB, because we take 3 samples +# Partial hashing offset and size +PARTIAL_OFFSET_SIZE = (0x4000, 0x4000) + class FSError(Exception): cls_message = "An error has occured on '{name}' in '{parent}'" @@ -243,14 +246,9 @@ class File: def _calc_digest_partial(self): # type: () -> bytes - - # This offset is where we should start reading the file to get a partial hash - # For audio file, it should be where audio data starts - offset, size = (0x4000, 0x4000) - with self.path.open("rb") as fp: - fp.seek(offset) - partial_data = fp.read(size) + fp.seek(PARTIAL_OFFSET_SIZE[0]) + partial_data = fp.read(PARTIAL_OFFSET_SIZE[1]) return hasher(partial_data).digest() def _calc_digest_samples(self) -> bytes: @@ -281,7 +279,11 @@ class File: elif field == "digest_partial": self.digest_partial = filesdb.get(self.path, "digest_partial") if self.digest_partial is None: - self.digest_partial = self._calc_digest_partial() + # If file 
is smaller than partial requirements just use the full digest + if self.size < PARTIAL_OFFSET_SIZE[0] + PARTIAL_OFFSET_SIZE[1]: + self.digest_partial = self.digest + else: + self.digest_partial = self._calc_digest_partial() filesdb.put(self.path, "digest_partial", self.digest_partial) elif field == "digest": self.digest = filesdb.get(self.path, "digest") @@ -292,7 +294,7 @@ class File: size = self.size # Might as well hash such small files entirely. if size <= MIN_FILE_SIZE: - setattr(self, field, self.digest) + self.digest_samples = self.digest return self.digest_samples = filesdb.get(self.path, "digest_samples") if self.digest_samples is None: diff --git a/core/scanner.py b/core/scanner.py index 67124f20..cd2c0f02 100644 --- a/core/scanner.py +++ b/core/scanner.py @@ -87,8 +87,6 @@ class Scanner: } ): j = j.start_subjob([2, 8]) - for f in j.iter_with_progress(files, tr("Read size of %d/%d files")): - f.size # pre-read, makes a smoother progress if read here (especially for bundles) if self.size_threshold: files = [f for f in files if f.size >= self.size_threshold] if self.large_size_threshold: @@ -173,7 +171,9 @@ class Scanner: matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)] if self.include_exists_check: matches = [m for m in matches if m.first.exists() and m.second.exists()] - matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)] + # Contents already handles ref checks, other scan types might not catch during scan + if self.scan_type != ScanType.CONTENTS: + matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)] if ignore_list: matches = [m for m in matches if not ignore_list.are_ignored(str(m.first.path), str(m.second.path))] logging.info("Grouping matches") diff --git a/hscommon/jobprogress/job.py b/hscommon/jobprogress/job.py index 5f52d2d9..eb9cca08 100644 --- a/hscommon/jobprogress/job.py +++ b/hscommon/jobprogress/job.py @@ -7,7 +7,7 @@ # 
http://www.gnu.org/licenses/gpl-3.0.html -from typing import Any, Callable, Generator, Iterator, List, Union +from typing import Any, Callable, Generator, List, Union class JobCancelled(Exception): @@ -148,7 +148,7 @@ class Job: self._do_update(desc) -class NullJob: +class NullJob(Job): def __init__(self, *args, **kwargs) -> None: # Null job does nothing pass @@ -161,9 +161,6 @@ class NullJob: # Null job does nothing pass - def iter_with_progress(self, sequence, *args, **kwargs) -> Iterator: - return iter(sequence) - def start_job(self, *args, **kwargs) -> None: # Null job does nothing pass