Avoid partially hashing small files

Computing 3 hash samples for files smaller than 3 MiB (3 * CHUNK_SIZE) is not efficient, since the spans of later samples would overlap earlier ones.
Therefore we can simply return the hash of the entire small file instead.
This commit is contained in:
glubsy 2021-08-13 20:38:33 +02:00
parent 718ca5b313
commit 7b764f183e
1 changed file with 6 additions and 1 deletion

View File

@ -147,6 +147,11 @@ class File:
try:
with self.path.open("rb") as fp:
size = self.size
# Might as well hash such small files entirely.
if size <= CHUNK_SIZE * 3: # 3MiB, because 3 samples
setattr(self, field, self.md5)
return
# Chunk at 25% of the file
fp.seek(floor(size * 25 / 100), 0)
filedata = fp.read(CHUNK_SIZE)
@ -219,7 +224,7 @@ class File:
class Folder(File):
"""A wrapper around a folder path.
It has the size/md5 info of a File, but it's value are the sum of its subitems.
It has the size/md5 info of a File, but its value is the sum of its subitems.
"""
__slots__ = File.__slots__ + ("_subfolders",)