Mirror of https://github.com/arsenetar/dupeguru.git, synced 2025-03-10 05:34:36 +00:00
Avoid partially hashing small files
Computing 3 hash samples for files smaller than 3 MiB (3 * CHUNK_SIZE) is inefficient, since the spans of later samples would overlap earlier ones. Therefore we can simply return the hash of the entire small file instead.
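To see why the samples collide, here is a minimal sketch of the sampling geometry. Only the 25% offset is confirmed by the hunk below; the 60% and end-of-file samples are assumptions made for illustration, not dupeguru's verbatim code.

from math import floor

CHUNK_SIZE = 1024 * 1024  # 1 MiB, matching the constant referenced in the commit

def sample_spans(size):
    """Byte spans read by three CHUNK_SIZE samples of a file of `size` bytes.

    The 25% offset appears in the diff below; the 60% and end-of-file
    offsets are illustrative assumptions.
    """
    spans = []
    for pct in (25, 60):
        start = floor(size * pct / 100)
        spans.append((start, min(start + CHUNK_SIZE, size)))
    spans.append((max(size - CHUNK_SIZE, 0), size))  # last CHUNK_SIZE bytes
    return spans

# For a 2 MiB file the spans overlap heavily, so sampling rereads bytes:
print(sample_spans(2 * 1024 * 1024))
# [(524288, 1572864), (1258291, 2097152), (1048576, 2097152)]

Whatever the exact offsets, a file of at most 3 * CHUNK_SIZE bytes cannot hold three disjoint CHUNK_SIZE windows, so hashing the whole file once does no more I/O than sampling it.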
parent 718ca5b313
commit 7b764f183e
@@ -147,6 +147,11 @@ class File:
         try:
             with self.path.open("rb") as fp:
                 size = self.size
+                # Might as well hash such small files entirely.
+                if size <= CHUNK_SIZE * 3:  # 3MiB, because 3 samples
+                    setattr(self, field, self.md5)
+                    return
+
                 # Chunk at 25% of the file
                 fp.seek(floor(size * 25 / 100), 0)
                 filedata = fp.read(CHUNK_SIZE)
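For context, a self-contained sketch of the resulting control flow, assuming a hypothetical partial_md5 helper; in dupeguru this logic is a method on File that stores the digest via setattr(self, field, ...), and the samples after the 25% chunk are elided because they fall outside this hunk.

import hashlib
from math import floor

CHUNK_SIZE = 1024 * 1024  # 1 MiB

def partial_md5(path, size):
    """Small-file fast path as in the hunk above: when three samples
    would overlap anyway, hash the whole file in one pass."""
    with open(path, "rb") as fp:
        if size <= CHUNK_SIZE * 3:  # 3MiB, because 3 samples
            return hashlib.md5(fp.read()).digest()
        md5 = hashlib.md5()
        # Chunk at 25% of the file
        fp.seek(floor(size * 25 / 100), 0)
        md5.update(fp.read(CHUNK_SIZE))
        # ... remaining samples elided; only the 25% chunk is shown in this hunk
        return md5.digest()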
@@ -219,7 +224,7 @@ class File:
 class Folder(File):
     """A wrapper around a folder path.
 
-    It has the size/md5 info of a File, but it's value are the sum of its subitems.
+    It has the size/md5 info of a File, but its value is the sum of its subitems.
     """
 
     __slots__ = File.__slots__ + ("_subfolders",)