fix: Minor cleanups and fixes

- Update NullJob to subclass Job.
- Remove the unnecessary size pre-read in _getMatches(); file sizes are already loaded during the file scan via the stat call.
- Skip the post-scan ref check for contents scans, since that scan already prevents matches where both files are refs. Some of the other scan types work differently and need to be reviewed before this post-step can be removed completely.
- Add a guard on partial hashing: if the file is smaller than the partial offset plus size, hash the whole file and use that value for both the partial digest and the digest.
2025-08-12 13:33:20 +00:00 · 2023-06-08 01:14:52 -05:00 · 2023-06-08 01:14:52 -05:00 · 99ec4e0f27
commit 99ec4e0f27
parent 322d29a996
3 changed files with 16 additions and 17 deletions
--- a/core/fs.py
+++ b/core/fs.py
@@ -54,6 +54,9 @@ CHUNK_SIZE = 1024 * 1024  # 1 MiB
 # Minimum size below which partial hashing is not used
 MIN_FILE_SIZE = 3 * CHUNK_SIZE  # 3MiB, because we take 3 samples

+# Partial hashing offset and size
+PARTIAL_OFFSET_SIZE = (0x4000, 0x4000)
+

 class FSError(Exception):
    cls_message = "An error has occured on '{name}' in '{parent}'"
@@ -243,14 +246,9 @@ class File:

    def _calc_digest_partial(self):
        # type: () -> bytes
-
-        # This offset is where we should start reading the file to get a partial hash
-        # For audio file, it should be where audio data starts
-        offset, size = (0x4000, 0x4000)
-
        with self.path.open("rb") as fp:
-            fp.seek(offset)
-            partial_data = fp.read(size)
+            fp.seek(PARTIAL_OFFSET_SIZE[0])
+            partial_data = fp.read(PARTIAL_OFFSET_SIZE[1])
            return hasher(partial_data).digest()

    def _calc_digest_samples(self) -> bytes:
@@ -281,7 +279,11 @@ class File:
        elif field == "digest_partial":
            self.digest_partial = filesdb.get(self.path, "digest_partial")
            if self.digest_partial is None:
-                self.digest_partial = self._calc_digest_partial()
+                # If file is smaller than partial requirements just use the full digest
+                if self.size < PARTIAL_OFFSET_SIZE[0] + PARTIAL_OFFSET_SIZE[1]:
+                    self.digest_partial = self.digest
+                else:
+                    self.digest_partial = self._calc_digest_partial()
                filesdb.put(self.path, "digest_partial", self.digest_partial)
        elif field == "digest":
            self.digest = filesdb.get(self.path, "digest")
@@ -292,7 +294,7 @@ class File:
            size = self.size
            # Might as well hash such small files entirely.
            if size <= MIN_FILE_SIZE:
-                setattr(self, field, self.digest)
+                self.digest_samples = self.digest
                return
            self.digest_samples = filesdb.get(self.path, "digest_samples")
            if self.digest_samples is None:
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -87,8 +87,6 @@ class Scanner:
            }
        ):
            j = j.start_subjob([2, 8])
-            for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
-                f.size  # pre-read, makes a smoother progress if read here (especially for bundles)
            if self.size_threshold:
                files = [f for f in files if f.size >= self.size_threshold]
            if self.large_size_threshold:
@@ -173,7 +171,9 @@ class Scanner:
            matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
        if self.include_exists_check:
            matches = [m for m in matches if m.first.exists() and m.second.exists()]
-        matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
+        # Contents already handles ref checks, other scan types might not catch during scan
+        if self.scan_type != ScanType.CONTENTS:
+            matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
        if ignore_list:
            matches = [m for m in matches if not ignore_list.are_ignored(str(m.first.path), str(m.second.path))]
        logging.info("Grouping matches")
--- a/hscommon/jobprogress/job.py
+++ b/hscommon/jobprogress/job.py
@@ -7,7 +7,7 @@
 # http://www.gnu.org/licenses/gpl-3.0.html


-from typing import Any, Callable, Generator, Iterator, List, Union
+from typing import Any, Callable, Generator, List, Union


 class JobCancelled(Exception):
@@ -148,7 +148,7 @@ class Job:
        self._do_update(desc)


-class NullJob:
+class NullJob(Job):
    def __init__(self, *args, **kwargs) -> None:
        # Null job does nothing
        pass
@@ -161,9 +161,6 @@ class NullJob:
        # Null job does nothing
        pass

-    def iter_with_progress(self, sequence, *args, **kwargs) -> Iterator:
-        return iter(sequence)
-
    def start_job(self, *args, **kwargs) -> None:
        # Null job does nothing
        pass