diff --git a/core/fs.py b/core/fs.py
index 11a29a20..e927fd0b 100644
--- a/core/fs.py
+++ b/core/fs.py
@@ -54,6 +54,9 @@ CHUNK_SIZE = 1024 * 1024  # 1 MiB
 # Minimum size below which partial hashing is not used
 MIN_FILE_SIZE = 3 * CHUNK_SIZE  # 3MiB, because we take 3 samples
 
+# Partial hashing offset and size
+PARTIAL_OFFSET_SIZE = (0x4000, 0x4000)
+
 
 class FSError(Exception):
     cls_message = "An error has occured on '{name}' in '{parent}'"
@@ -243,14 +246,9 @@ class File:
 
     def _calc_digest_partial(self):
         # type: () -> bytes
-
-        # This offset is where we should start reading the file to get a partial hash
-        # For audio file, it should be where audio data starts
-        offset, size = (0x4000, 0x4000)
-
         with self.path.open("rb") as fp:
-            fp.seek(offset)
-            partial_data = fp.read(size)
+            fp.seek(PARTIAL_OFFSET_SIZE[0])
+            partial_data = fp.read(PARTIAL_OFFSET_SIZE[1])
         return hasher(partial_data).digest()
 
     def _calc_digest_samples(self) -> bytes:
@@ -281,7 +279,11 @@ class File:
         elif field == "digest_partial":
             self.digest_partial = filesdb.get(self.path, "digest_partial")
             if self.digest_partial is None:
-                self.digest_partial = self._calc_digest_partial()
+                # If file is smaller than partial requirements just use the full digest
+                if self.size < PARTIAL_OFFSET_SIZE[0] + PARTIAL_OFFSET_SIZE[1]:
+                    self.digest_partial = self.digest
+                else:
+                    self.digest_partial = self._calc_digest_partial()
                 filesdb.put(self.path, "digest_partial", self.digest_partial)
         elif field == "digest":
             self.digest = filesdb.get(self.path, "digest")
@@ -292,7 +294,7 @@
             size = self.size
             # Might as well hash such small files entirely.
             if size <= MIN_FILE_SIZE:
-                setattr(self, field, self.digest)
+                self.digest_samples = self.digest
                 return
             self.digest_samples = filesdb.get(self.path, "digest_samples")
             if self.digest_samples is None:
diff --git a/core/scanner.py b/core/scanner.py
index 67124f20..cd2c0f02 100644
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -87,8 +87,6 @@
             }
         ):
             j = j.start_subjob([2, 8])
-            for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
-                f.size  # pre-read, makes a smoother progress if read here (especially for bundles)
         if self.size_threshold:
             files = [f for f in files if f.size >= self.size_threshold]
         if self.large_size_threshold:
@@ -173,7 +171,9 @@
         matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
         if self.include_exists_check:
             matches = [m for m in matches if m.first.exists() and m.second.exists()]
-        matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
+        # Contents already handles ref checks, other scan types might not catch during scan
+        if self.scan_type != ScanType.CONTENTS:
+            matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
         if ignore_list:
             matches = [m for m in matches if not ignore_list.are_ignored(str(m.first.path), str(m.second.path))]
         logging.info("Grouping matches")
diff --git a/hscommon/jobprogress/job.py b/hscommon/jobprogress/job.py
index 5f52d2d9..eb9cca08 100644
--- a/hscommon/jobprogress/job.py
+++ b/hscommon/jobprogress/job.py
@@ -7,7 +7,7 @@
 # http://www.gnu.org/licenses/gpl-3.0.html
 
-from typing import Any, Callable, Generator, Iterator, List, Union
+from typing import Any, Callable, Generator, List, Union
 
 
 class JobCancelled(Exception):
     "The user has cancelled the job"
@@ -148,7 +148,7 @@
         self._do_update(desc)
 
 
-class NullJob:
+class NullJob(Job):
     def __init__(self, *args, **kwargs) -> None:
         # Null job does nothing
         pass
@@ -161,9 +161,6 @@
         # Null job does nothing
         pass
 
-    def iter_with_progress(self, sequence, *args, **kwargs) -> Iterator:
-        return iter(sequence)
-
     def start_job(self, *args, **kwargs) -> None:
         # Null job does nothing
         pass