From 7b764f183ef512ccd825adfde146de5ae88c5a6a Mon Sep 17 00:00:00 2001 From: glubsy Date: Fri, 13 Aug 2021 20:38:33 +0200 Subject: [PATCH] Avoid partially hashing small files Computing 3 hash samples for files of 3MiB (3 * CHUNK_SIZE) or smaller is not efficient since spans of later samples would overlap a previous one. Therefore we can simply return the hash of the entire small file instead. --- core/fs.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/fs.py b/core/fs.py index 9bb315d4..3435aba9 100644 --- a/core/fs.py +++ b/core/fs.py @@ -147,6 +147,11 @@ class File: try: with self.path.open("rb") as fp: size = self.size + # Might as well hash such small files entirely. + if size <= CHUNK_SIZE * 3: # 3MiB, because 3 samples + setattr(self, field, self.md5) + return + # Chunk at 25% of the file fp.seek(floor(size * 25 / 100), 0) filedata = fp.read(CHUNK_SIZE) @@ -219,7 +224,7 @@ class File: class Folder(File): """A wrapper around a folder path. - It has the size/md5 info of a File, but it's value are the sum of its subitems. + It has the size/md5 info of a File, but its value is the sum of its subitems. """ __slots__ = File.__slots__ + ("_subfolders",)