1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2025-03-10 05:34:36 +00:00

Merge pull request #942 from Dobatymo/hash-cache

Implement hash cache for md5 hash based on sqlite
This commit is contained in:
Andrew Senetar 2021-11-23 21:08:22 -06:00 committed by GitHub
commit 34f41dc522
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 138 additions and 32 deletions

1
.gitignore vendored
View File

@@ -7,6 +7,7 @@ __pycache__
.lock-waf* .lock-waf*
.tox .tox
/tags /tags
*.eggs
build build
dist dist

View File

@@ -138,6 +138,8 @@ class DupeGuru(Broadcaster):
self.app_mode = AppMode.STANDARD self.app_mode = AppMode.STANDARD
self.discarded_file_count = 0 self.discarded_file_count = 0
self.exclude_list = ExcludeList() self.exclude_list = ExcludeList()
hash_cache_file = op.join(self.appdata, "hash_cache.db")
fs.filesdb.connect(hash_cache_file)
self.directories = directories.Directories(self.exclude_list) self.directories = directories.Directories(self.exclude_list)
self.results = results.Results(self) self.results = results.Results(self)
self.ignore_list = IgnoreList() self.ignore_list = IgnoreList()
@@ -293,6 +295,7 @@ class DupeGuru(Broadcaster):
def _job_completed(self, jobid): def _job_completed(self, jobid):
if jobid == JobType.SCAN: if jobid == JobType.SCAN:
self._results_changed() self._results_changed()
fs.filesdb.commit()
if not self.results.groups: if not self.results.groups:
self.view.show_message(tr("No duplicates found.")) self.view.show_message(tr("No duplicates found."))
else: else:
@@ -420,6 +423,9 @@ class DupeGuru(Broadcaster):
except FileNotFoundError: except FileNotFoundError:
pass # we don't care pass # we don't care
def clear_hash_cache(self):
fs.filesdb.clear()
def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType): def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType):
source_path = dupe.path source_path = dupe.path
location_path = first(p for p in self.directories if dupe.path in p) location_path = first(p for p in self.directories if dupe.path in p)
@@ -751,6 +757,9 @@ class DupeGuru(Broadcaster):
self.exclude_list.save_to_xml(p) self.exclude_list.save_to_xml(p)
self.notify("save_session") self.notify("save_session")
def close(self):
fs.filesdb.close()
def save_as(self, filename): def save_as(self, filename):
"""Save results in ``filename``. """Save results in ``filename``.

View File

@@ -14,7 +14,11 @@
import hashlib import hashlib
from math import floor from math import floor
import logging import logging
import sqlite3
from threading import Lock
from typing import Any
from hscommon.path import Path
from hscommon.util import nonone, get_file_ext from hscommon.util import nonone, get_file_ext
__all__ = [ __all__ = [
@@ -78,6 +82,82 @@ class OperationError(FSError):
cls_message = "Operation on '{name}' failed." cls_message = "Operation on '{name}' failed."
class FilesDB:
    """Persistent cache mapping (path, size, mtime_ns) to file hashes.

    Backed by a sqlite database. A cached hash is only returned when both
    the stored size and mtime_ns still match the file's current stat(), so
    entries for modified files are ignored automatically. All cursor access
    is serialized with a lock so the cache can be used from worker threads.
    """

    # Only these column names may be interpolated into the queries below.
    # The {key} placeholders are filled via str.format, so restricting the
    # accepted values guards against SQL injection through the key argument.
    _VALID_KEYS = ("md5", "md5partial")

    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    drop_table_query = "DROP TABLE files;"
    select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
    insert_query = """
        INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
        ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value;
    """

    def __init__(self):
        self.conn = None
        self.cur = None
        # Created here rather than in connect() so that clear()/commit()/
        # close() called before connect() fail on the missing cursor instead
        # of raising a confusing TypeError from `with None`.
        self.lock = Lock()

    def connect(self, path):
        # type: (str, ) -> None
        """Open (or create) the cache database at ``path``.

        check_same_thread=False is needed because hashing happens in worker
        threads; self.lock provides the required serialization.
        """
        self.conn = sqlite3.connect(path, check_same_thread=False)
        self.cur = self.conn.cursor()
        self.cur.execute(self.create_table_query)

    def clear(self):
        # type: () -> None
        """Drop all cached hashes and recreate an empty table."""
        with self.lock:
            self.cur.execute(self.drop_table_query)
            self.cur.execute(self.create_table_query)

    def _check_key(self, key):
        # type: (str) -> None
        """Reject column names that are not known hash columns."""
        if key not in self._VALID_KEYS:
            raise ValueError("Invalid hash cache key: %r" % (key,))

    def get(self, path, key):
        # type: (Path, str) -> Optional[bytes]
        """Return the cached ``key`` hash for ``path``, or None on a miss.

        A hit requires the stored size and mtime_ns to match the file's
        current stat(), so a modified file never yields a stale hash.
        """
        self._check_key(key)
        stat = path.stat()
        size = stat.st_size
        mtime_ns = stat.st_mtime_ns

        with self.lock:
            self.cur.execute(self.select_query.format(key=key), {"path": str(path), "size": size, "mtime_ns": mtime_ns})
            result = self.cur.fetchone()

        if result:
            return result[0]
        return None

    def put(self, path, key, value):
        # type: (Path, str, Any) -> None
        """Store ``value`` as the ``key`` hash of ``path`` (upsert)."""
        self._check_key(key)
        stat = path.stat()
        size = stat.st_size
        mtime_ns = stat.st_mtime_ns

        with self.lock:
            self.cur.execute(
                self.insert_query.format(key=key),
                {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value},
            )

    def commit(self):
        # type: () -> None
        """Flush pending writes to disk."""
        with self.lock:
            self.conn.commit()

    def close(self):
        # type: () -> None
        """Close cursor and connection; safe to call when never connected."""
        with self.lock:
            if self.cur is not None:
                self.cur.close()
            if self.conn is not None:
                self.conn.close()


filesdb = FilesDB()  # Singleton
class File: class File:
"""Represents a file and holds metadata to be used for scanning.""" """Represents a file and holds metadata to be used for scanning."""
@@ -107,10 +187,32 @@ class File:
result = self.INITIAL_INFO[attrname] result = self.INITIAL_INFO[attrname]
return result return result
# This offset is where we should start reading the file to get a partial md5 def _calc_md5(self):
# For audio file, it should be where audio data starts # type: () -> bytes
def _get_md5partial_offset_and_size(self):
return (0x4000, 0x4000) # 16Kb with self.path.open("rb") as fp:
md5 = hashlib.md5()
# The goal here is to not run out of memory on really big files. However, the chunk
# size has to be large enough so that the python loop isn't too costly in terms of
# CPU.
CHUNK_SIZE = 1024 * 1024 # 1 mb
filedata = fp.read(CHUNK_SIZE)
while filedata:
md5.update(filedata)
filedata = fp.read(CHUNK_SIZE)
return md5.digest()
def _calc_md5partial(self):
# type: () -> bytes
# This offset is where we should start reading the file to get a partial md5
# For audio file, it should be where audio data starts
offset, size = (0x4000, 0x4000)
with self.path.open("rb") as fp:
fp.seek(offset)
partialdata = fp.read(size)
return hashlib.md5(partialdata).digest()
def _read_info(self, field): def _read_info(self, field):
# print(f"_read_info({field}) for {self}") # print(f"_read_info({field}) for {self}")
@@ -120,28 +222,20 @@ class File:
self.mtime = nonone(stats.st_mtime, 0) self.mtime = nonone(stats.st_mtime, 0)
elif field == "md5partial": elif field == "md5partial":
try: try:
with self.path.open("rb") as fp: self.md5partial = filesdb.get(self.path, "md5partial")
offset, size = self._get_md5partial_offset_and_size() if self.md5partial is None:
fp.seek(offset) self.md5partial = self._calc_md5partial()
partialdata = fp.read(size) filesdb.put(self.path, "md5partial", self.md5partial)
md5 = hashlib.md5(partialdata) except Exception as e:
self.md5partial = md5.digest() logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
except Exception:
pass
elif field == "md5": elif field == "md5":
try: try:
with self.path.open("rb") as fp: self.md5 = filesdb.get(self.path, "md5")
md5 = hashlib.md5() if self.md5 is None:
filedata = fp.read(CHUNK_SIZE) self.md5 = self._calc_md5()
while filedata: filesdb.put(self.path, "md5", self.md5)
md5.update(filedata) except Exception as e:
filedata = fp.read(CHUNK_SIZE) logging.warning("Couldn't get md5 for %s: %s", self.path, e)
# FIXME For python 3.8 and later
# while filedata := fp.read(CHUNK_SIZE):
# md5.update(filedata)
self.md5 = md5.digest()
except Exception:
pass
elif field == "md5samples": elif field == "md5samples":
try: try:
with self.path.open("rb") as fp: with self.path.open("rb") as fp:

View File

@@ -129,11 +129,11 @@ class DupeGuru(QObject):
self.showDirectoriesWindow, self.showDirectoriesWindow,
), ),
( (
"actionClearPictureCache", "actionClearCache",
"Ctrl+Shift+P", "Ctrl+Shift+P",
"", "",
tr("Clear Picture Cache"), tr("Clear Cache"),
self.clearPictureCacheTriggered, self.clearCacheTriggered,
), ),
( (
"actionExcludeList", "actionExcludeList",
@@ -258,6 +258,7 @@ class DupeGuru(QObject):
self.willSavePrefs.emit() self.willSavePrefs.emit()
self.prefs.save() self.prefs.save()
self.model.save() self.model.save()
self.model.close()
# Workaround for #857, hide() or close(). # Workaround for #857, hide() or close().
if self.details_dialog is not None: if self.details_dialog is not None:
self.details_dialog.close() self.details_dialog.close()
@@ -288,13 +289,14 @@ class DupeGuru(QObject):
self.model.load_from(results) self.model.load_from(results)
self.recentResults.insertItem(results) self.recentResults.insertItem(results)
def clearPictureCacheTriggered(self): def clearCacheTriggered(self):
title = tr("Clear Picture Cache") title = tr("Clear Cache")
msg = tr("Do you really want to remove all your cached picture analysis?") msg = tr("Do you really want to clear the cache? This will remove all cached file hashes and picture analysis.")
if self.confirm(title, msg, QMessageBox.No): if self.confirm(title, msg, QMessageBox.No):
self.model.clear_picture_cache() self.model.clear_picture_cache()
self.model.clear_hash_cache()
active = QApplication.activeWindow() active = QApplication.activeWindow()
QMessageBox.information(active, title, tr("Picture cache cleared.")) QMessageBox.information(active, title, tr("Cache cleared."))
def ignoreListTriggered(self): def ignoreListTriggered(self):
if self.use_tabs: if self.use_tabs:

View File

@@ -126,7 +126,7 @@ class DirectoriesDialog(QMainWindow):
self.menuFile.addAction(self.actionLoadResults) self.menuFile.addAction(self.actionLoadResults)
self.menuFile.addAction(self.menuLoadRecent.menuAction()) self.menuFile.addAction(self.menuLoadRecent.menuAction())
self.menuFile.addSeparator() self.menuFile.addSeparator()
self.menuFile.addAction(self.app.actionClearPictureCache) self.menuFile.addAction(self.app.actionClearCache)
self.menuFile.addSeparator() self.menuFile.addSeparator()
self.menuFile.addAction(self.actionLoadDirectories) self.menuFile.addAction(self.actionLoadDirectories)
self.menuFile.addAction(self.actionSaveDirectories) self.menuFile.addAction(self.actionSaveDirectories)