Mirror of https://github.com/arsenetar/dupeguru.git, synced 2025-03-10 05:34:36 +00:00
Avoid partially hashing small files
Computing 3 hash samples for files smaller than 3 MiB (3 * CHUNK_SIZE) is inefficient, since the spans of later samples would overlap earlier ones. Therefore we can simply return the hash of the entire small file instead.
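To see why the samples collide, here is a minimal sketch of the sampling geometry. Only the 25% offset is confirmed by the hunk below; the 60% and end-of-file samples are assumptions made for illustration, not dupeguru's verbatim code.

from math import floor

CHUNK_SIZE = 1024 * 1024  # 1 MiB, matching the constant referenced in the commit

def sample_spans(size):
    """Byte spans read by three CHUNK_SIZE samples of a file of `size` bytes.

    The 25% offset appears in the diff below; the 60% and end-of-file
    offsets are illustrative assumptions.
    """
    spans = []
    for pct in (25, 60):
        start = floor(size * pct / 100)
        spans.append((start, min(start + CHUNK_SIZE, size)))
    spans.append((max(size - CHUNK_SIZE, 0), size))  # last CHUNK_SIZE bytes
    return spans

# For a 2 MiB file the spans overlap heavily, so sampling rereads bytes:
print(sample_spans(2 * 1024 * 1024))
# [(524288, 1572864), (1258291, 2097152), (1048576, 2097152)]

Whatever the exact offsets, a file of at most 3 * CHUNK_SIZE bytes cannot hold three disjoint CHUNK_SIZE windows, so hashing the whole file once does no more I/O than sampling it.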
parent 718ca5b313
commit 7b764f183e
@@ -147,6 +147,11 @@ class File:
         try:
             with self.path.open("rb") as fp:
                 size = self.size
+                # Might as well hash such small files entirely.
+                if size <= CHUNK_SIZE * 3:  # 3MiB, because 3 samples
+                    setattr(self, field, self.md5)
+                    return
+
                 # Chunk at 25% of the file
                 fp.seek(floor(size * 25 / 100), 0)
                 filedata = fp.read(CHUNK_SIZE)
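For context, a self-contained sketch of the resulting control flow, assuming a hypothetical partial_md5 helper; in dupeguru this logic is a method on File that stores the digest via setattr(self, field, ...), and the samples after the 25% chunk are elided because they fall outside this hunk.

import hashlib
from math import floor

CHUNK_SIZE = 1024 * 1024  # 1 MiB

def partial_md5(path, size):
    """Small-file fast path as in the hunk above: when three samples
    would overlap anyway, hash the whole file in one pass."""
    with open(path, "rb") as fp:
        if size <= CHUNK_SIZE * 3:  # 3MiB, because 3 samples
            return hashlib.md5(fp.read()).digest()
        md5 = hashlib.md5()
        # Chunk at 25% of the file
        fp.seek(floor(size * 25 / 100), 0)
        md5.update(fp.read(CHUNK_SIZE))
        # ... remaining samples elided; only the 25% chunk is shown in this hunk
        return md5.digest()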
@@ -219,7 +224,7 @@ class File:
 class Folder(File):
     """A wrapper around a folder path.
 
-    It has the size/md5 info of a File, but it's value are the sum of its subitems.
+    It has the size/md5 info of a File, but its value is the sum of its subitems.
     """
 
     __slots__ = File.__slots__ + ("_subfolders",)