1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2024-12-03 19:59:01 +00:00

fix: Minor cleanups and fixes

- Update NullJob to subclass Job
- Remove unnecessary size pre-read in _getMatches() as file sizes are
  already loaded during file scan via stat call
- Skip the ref check for contents scans, as that scan already prevents
  matches between two reference files from happening; some of the other
  scan types do things differently and need to be reviewed before this
  post-scan step can be removed completely
- Add a guard on partial hashing: if the file is smaller than the partial
  offset plus the partial size, just hash the whole file and use that
  value for both the partial digest and the full digest
This commit is contained in:
Andrew Senetar 2023-06-08 01:14:52 -05:00
parent 322d29a996
commit 99ec4e0f27
Signed by: arsenetar
GPG Key ID: C63300DCE48AB2F1
3 changed files with 16 additions and 17 deletions

View File

@ -54,6 +54,9 @@ CHUNK_SIZE = 1024 * 1024 # 1 MiB
# Minimum size below which partial hashing is not used
MIN_FILE_SIZE = 3 * CHUNK_SIZE # 3MiB, because we take 3 samples
# Partial hashing offset and size
PARTIAL_OFFSET_SIZE = (0x4000, 0x4000)
class FSError(Exception):
cls_message = "An error has occured on '{name}' in '{parent}'"
@ -243,14 +246,9 @@ class File:
def _calc_digest_partial(self):
# type: () -> bytes
# This offset is where we should start reading the file to get a partial hash
# For audio file, it should be where audio data starts
offset, size = (0x4000, 0x4000)
with self.path.open("rb") as fp:
fp.seek(offset)
partial_data = fp.read(size)
fp.seek(PARTIAL_OFFSET_SIZE[0])
partial_data = fp.read(PARTIAL_OFFSET_SIZE[1])
return hasher(partial_data).digest()
def _calc_digest_samples(self) -> bytes:
@ -281,7 +279,11 @@ class File:
elif field == "digest_partial":
self.digest_partial = filesdb.get(self.path, "digest_partial")
if self.digest_partial is None:
self.digest_partial = self._calc_digest_partial()
# If file is smaller than partial requirements just use the full digest
if self.size < PARTIAL_OFFSET_SIZE[0] + PARTIAL_OFFSET_SIZE[1]:
self.digest_partial = self.digest
else:
self.digest_partial = self._calc_digest_partial()
filesdb.put(self.path, "digest_partial", self.digest_partial)
elif field == "digest":
self.digest = filesdb.get(self.path, "digest")
@ -292,7 +294,7 @@ class File:
size = self.size
# Might as well hash such small files entirely.
if size <= MIN_FILE_SIZE:
setattr(self, field, self.digest)
self.digest_samples = self.digest
return
self.digest_samples = filesdb.get(self.path, "digest_samples")
if self.digest_samples is None:

View File

@ -87,8 +87,6 @@ class Scanner:
}
):
j = j.start_subjob([2, 8])
for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
f.size # pre-read, makes a smoother progress if read here (especially for bundles)
if self.size_threshold:
files = [f for f in files if f.size >= self.size_threshold]
if self.large_size_threshold:
@ -173,7 +171,9 @@ class Scanner:
matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
if self.include_exists_check:
matches = [m for m in matches if m.first.exists() and m.second.exists()]
matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
# Contents already handles ref checks, other scan types might not catch during scan
if self.scan_type != ScanType.CONTENTS:
matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
if ignore_list:
matches = [m for m in matches if not ignore_list.are_ignored(str(m.first.path), str(m.second.path))]
logging.info("Grouping matches")

View File

@ -7,7 +7,7 @@
# http://www.gnu.org/licenses/gpl-3.0.html
from typing import Any, Callable, Generator, Iterator, List, Union
from typing import Any, Callable, Generator, List, Union
class JobCancelled(Exception):
@ -148,7 +148,7 @@ class Job:
self._do_update(desc)
class NullJob:
class NullJob(Job):
def __init__(self, *args, **kwargs) -> None:
# Null job does nothing
pass
@ -161,9 +161,6 @@ class NullJob:
# Null job does nothing
pass
def iter_with_progress(self, sequence, *args, **kwargs) -> Iterator:
return iter(sequence)
def start_job(self, *args, **kwargs) -> None:
# Null job does nothing
pass