Mirror of https://github.com/arsenetar/dupeguru.git

Add unit tests for hash sample optimization

* Instead of keeping the md5 samples separate, merge them into a single hash computed from the selected chunks (sketched just below).
* We don't need to keep a boolean to know whether the user chose to optimize; we can simply compare the threshold value, since 0 means the optimization is currently disabled (see the second sketch below).
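
A minimal sketch of the first point, with the chunk positions taken from the diff below; the CHUNK_SIZE value and the standalone helper name are only illustrative, the real code lives in File._read_info:

import hashlib
from math import floor

CHUNK_SIZE = 1024 * 1024  # assumed sample size per chunk

def sampled_md5(path, size):
    # Merge three CHUNK_SIZE samples (at 25%, at 60%, and at the end of the file)
    # into a single digest instead of keeping a tuple of separate hex strings.
    # Assumes the file is at least CHUNK_SIZE bytes, as the original code does.
    md5 = hashlib.md5()
    with open(path, "rb") as fp:
        fp.seek(floor(size * 25 / 100), 0)  # chunk at 25% of the file
        md5.update(fp.read(CHUNK_SIZE))
        fp.seek(floor(size * 60 / 100), 0)  # chunk at 60% of the file
        md5.update(fp.read(CHUNK_SIZE))
        fp.seek(-CHUNK_SIZE, 2)  # last chunk of the file
        md5.update(fp.read(CHUNK_SIZE))
    return md5.digest()  # bytes, matching the new b"" default for md5samples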
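For the second point, the threshold can be checked directly; big_file_size_threshold is an assumed option name used only for illustration here:

def pick_hash(file, big_file_size_threshold):
    # A threshold of 0 disables the optimization, so a plain comparison is
    # enough and no extra boolean flag is needed.
    if big_file_size_threshold and file.size > big_file_size_threshold:
        return file.md5samples  # sampled-chunk digest for big files
    return file.md5  # full-content digest otherwise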
glubsy
2021-06-21 22:44:05 +02:00
parent e07dfd5955
commit 277bc3fbb8
7 changed files with 152 additions and 43 deletions

core/fs.py

@@ -83,9 +83,9 @@ class File:
     INITIAL_INFO = {
         "size": 0,
         "mtime": 0,
-        "md5": "",
-        "md5partial": "",
-        "md5samples": []
+        "md5": b"",
+        "md5partial": b"",
+        "md5samples": b""
     }
     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
     # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
@@ -104,7 +104,6 @@ class File:
         result = object.__getattribute__(self, attrname)
         if result is NOT_SET:
             try:
-                print(f"Try get attr for {self} {attrname}")
                 self._read_info(attrname)
             except Exception as e:
                 logging.warning(
@@ -121,12 +120,12 @@ class File:
         return (0x4000, 0x4000)  # 16Kb

     def _read_info(self, field):
+        # print(f"_read_info({field}) for {self}")
         if field in ("size", "mtime"):
             stats = self.path.stat()
             self.size = nonone(stats.st_size, 0)
             self.mtime = nonone(stats.st_mtime, 0)
         elif field == "md5partial":
-            print(f"_read_info md5partial {self}")
             try:
                 with self.path.open("rb") as fp:
                     offset, size = self._get_md5partial_offset_and_size()
@@ -146,27 +145,24 @@ class File:
             except Exception:
                 pass
         elif field == "md5samples":
-            print(f"computing md5chunks for {self}, caller: {inspect.stack()[1][3]}")
             try:
                 with self.path.open("rb") as fp:
-                    md5chunks = []
+                    size = self.size
                     # Chunk at 25% of the file
-                    fp.seek(floor(self.size * 25 / 100), 0)
+                    fp.seek(floor(size * 25 / 100), 0)
                     filedata = fp.read(CHUNK_SIZE)
-                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+                    md5 = hashlib.md5(filedata)

                     # Chunk at 60% of the file
-                    fp.seek(floor(self.size * 60 / 100), 0)
+                    fp.seek(floor(size * 60 / 100), 0)
                     filedata = fp.read(CHUNK_SIZE)
-                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+                    md5.update(filedata)

                     # Last chunk of the file
                     fp.seek(-CHUNK_SIZE, 2)
                     filedata = fp.read(CHUNK_SIZE)
-                    md5chunks.append(hashlib.md5(filedata).hexdigest())
-                    # Use setattr to avoid circular (de)reference
-                    setattr(self, field, tuple(md5chunks))
+                    md5.update(filedata)
+                    setattr(self, field, md5.digest())
             except Exception as e:
                 logging.error(f"Error computing md5samples: {e}")
                 pass
@@ -239,16 +235,16 @@ class Folder(File):
         return folders + files

     def _read_info(self, field):
+        # print(f"_read_info({field}) for Folder {self}")
         if field in {"size", "mtime"}:
             size = sum((f.size for f in self._all_items()), 0)
             self.size = size
             stats = self.path.stat()
             self.mtime = nonone(stats.st_mtime, 0)
-        elif field in {"md5", "md5partial"}:
+        elif field in {"md5", "md5partial", "md5samples"}:
             # What's sensitive here is that we must make sure that subfiles'
             # md5 are always added up in the same order, but we also want a
             # different md5 if a file gets moved in a different subdirectory.
-            print(f"Getting {field} of folder {self}...")

             def get_dir_md5_concat():
                 items = self._all_items()