mirror of
https://github.com/arsenetar/dupeguru.git
synced 2024-12-21 10:59:03 +00:00
fix: Minor cleanups and fixes
- Update NullJob to subclass Job - Remove unnecessary size pre-read in _getMatches() as file sizes are already loaded during file scan via stat call - Skip ref check if contents scan as the scan already prevents this from happening, some of the other scans do things differently and need to be reviewed before removing this post step completely - Add guard on partial hashing to just hash the whole file if smaller than the offset and size and use the value for both the partial digest and digest
This commit is contained in:
parent
322d29a996
commit
99ec4e0f27
20
core/fs.py
20
core/fs.py
@ -54,6 +54,9 @@ CHUNK_SIZE = 1024 * 1024 # 1 MiB
|
||||
# Minimum size below which partial hashing is not used
|
||||
MIN_FILE_SIZE = 3 * CHUNK_SIZE # 3MiB, because we take 3 samples
|
||||
|
||||
# Partial hashing offset and size
|
||||
PARTIAL_OFFSET_SIZE = (0x4000, 0x4000)
|
||||
|
||||
|
||||
class FSError(Exception):
|
||||
cls_message = "An error has occured on '{name}' in '{parent}'"
|
||||
@ -243,14 +246,9 @@ class File:
|
||||
|
||||
def _calc_digest_partial(self):
|
||||
# type: () -> bytes
|
||||
|
||||
# This offset is where we should start reading the file to get a partial hash
|
||||
# For audio file, it should be where audio data starts
|
||||
offset, size = (0x4000, 0x4000)
|
||||
|
||||
with self.path.open("rb") as fp:
|
||||
fp.seek(offset)
|
||||
partial_data = fp.read(size)
|
||||
fp.seek(PARTIAL_OFFSET_SIZE[0])
|
||||
partial_data = fp.read(PARTIAL_OFFSET_SIZE[1])
|
||||
return hasher(partial_data).digest()
|
||||
|
||||
def _calc_digest_samples(self) -> bytes:
|
||||
@ -281,7 +279,11 @@ class File:
|
||||
elif field == "digest_partial":
|
||||
self.digest_partial = filesdb.get(self.path, "digest_partial")
|
||||
if self.digest_partial is None:
|
||||
self.digest_partial = self._calc_digest_partial()
|
||||
# If file is smaller than partial requirements just use the full digest
|
||||
if self.size < PARTIAL_OFFSET_SIZE[0] + PARTIAL_OFFSET_SIZE[1]:
|
||||
self.digest_partial = self.digest
|
||||
else:
|
||||
self.digest_partial = self._calc_digest_partial()
|
||||
filesdb.put(self.path, "digest_partial", self.digest_partial)
|
||||
elif field == "digest":
|
||||
self.digest = filesdb.get(self.path, "digest")
|
||||
@ -292,7 +294,7 @@ class File:
|
||||
size = self.size
|
||||
# Might as well hash such small files entirely.
|
||||
if size <= MIN_FILE_SIZE:
|
||||
setattr(self, field, self.digest)
|
||||
self.digest_samples = self.digest
|
||||
return
|
||||
self.digest_samples = filesdb.get(self.path, "digest_samples")
|
||||
if self.digest_samples is None:
|
||||
|
@ -87,8 +87,6 @@ class Scanner:
|
||||
}
|
||||
):
|
||||
j = j.start_subjob([2, 8])
|
||||
for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
|
||||
f.size # pre-read, makes a smoother progress if read here (especially for bundles)
|
||||
if self.size_threshold:
|
||||
files = [f for f in files if f.size >= self.size_threshold]
|
||||
if self.large_size_threshold:
|
||||
@ -173,7 +171,9 @@ class Scanner:
|
||||
matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
|
||||
if self.include_exists_check:
|
||||
matches = [m for m in matches if m.first.exists() and m.second.exists()]
|
||||
matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
|
||||
# Contents already handles ref checks, other scan types might not catch during scan
|
||||
if self.scan_type != ScanType.CONTENTS:
|
||||
matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
|
||||
if ignore_list:
|
||||
matches = [m for m in matches if not ignore_list.are_ignored(str(m.first.path), str(m.second.path))]
|
||||
logging.info("Grouping matches")
|
||||
|
@ -7,7 +7,7 @@
|
||||
# http://www.gnu.org/licenses/gpl-3.0.html
|
||||
|
||||
|
||||
from typing import Any, Callable, Generator, Iterator, List, Union
|
||||
from typing import Any, Callable, Generator, List, Union
|
||||
|
||||
|
||||
class JobCancelled(Exception):
|
||||
@ -148,7 +148,7 @@ class Job:
|
||||
self._do_update(desc)
|
||||
|
||||
|
||||
class NullJob:
|
||||
class NullJob(Job):
|
||||
def __init__(self, *args, **kwargs) -> None:
|
||||
# Null job does nothing
|
||||
pass
|
||||
@ -161,9 +161,6 @@ class NullJob:
|
||||
# Null job does nothing
|
||||
pass
|
||||
|
||||
def iter_with_progress(self, sequence, *args, **kwargs) -> Iterator:
|
||||
return iter(sequence)
|
||||
|
||||
def start_job(self, *args, **kwargs) -> None:
|
||||
# Null job does nothing
|
||||
pass
|
||||
|
Loading…
Reference in New Issue
Block a user