implement hash cache for md5 hash based on sqlite

2025-09-11 17:58:17 +00:00 · 2020-11-27 14:49:14 +08:00 · 2020-11-27 14:49:14 +08:00 · 2f02a6010d
commit 2f02a6010d
parent b80489fd66
4 changed files with 147 additions and 41 deletions
--- a/.gitignore
+++ b/.gitignore
@ -7,6 +7,7 @@ __pycache__
 .lock-waf*
 .tox
 /tags
+*.eggs

 build
 dist
--- a/core/app.py
+++ b/core/app.py
@ -138,7 +138,8 @@ class DupeGuru(Broadcaster):
        self.app_mode = AppMode.STANDARD
        self.discarded_file_count = 0
        self.exclude_list = ExcludeList()
-        self.directories = directories.Directories(self.exclude_list)
+        hash_cache_file = op.join(self.appdata, "hash.cache")
+        self.directories = directories.Directories(self.exclude_list, hash_cache_file)
        self.results = results.Results(self)
        self.ignore_list = IgnoreList()
        # In addition to "app-level" options, this dictionary also holds options that will be
@ -293,6 +294,7 @@ class DupeGuru(Broadcaster):
    def _job_completed(self, jobid):
        if jobid == JobType.SCAN:
            self._results_changed()
+            self.directories.save_hashes()
            if not self.results.groups:
                self.view.show_message(tr("No duplicates found."))
            else:
--- a/core/directories.py
+++ b/core/directories.py
@ -5,8 +5,11 @@
 # http://www.gnu.org/licenses/gpl-3.0.html

 import os
+import hashlib
+import sqlite3
 from xml.etree import ElementTree as ET
 import logging
+from threading import Lock

 from hscommon.jobprogress import job
 from hscommon.path import Path
@ -44,6 +47,117 @@ class InvalidPathError(Exception):
    """The path being added is invalid"""


+def calc_md5(path):
+    # type: (Path, ) -> bytes
+    """Return the raw MD5 digest of the entire file at ``path``.
+
+    ``path`` only needs an ``open("rb")`` method returning a binary file
+    object. Any error raised while opening or reading propagates to the
+    caller.
+    """
+    # Stream the file so huge files never have to fit in memory, while keeping
+    # each chunk big enough that the Python-level loop stays cheap.
+    chunk_size = 1024 * 1024  # 1 MiB
+    hasher = hashlib.md5()
+    with path.open("rb") as stream:
+        # read() returns b"" at end of file, which ends the iteration.
+        for block in iter(lambda: stream.read(chunk_size), b""):
+            hasher.update(block)
+    return hasher.digest()
+
+
+def calc_md5partial(path, offset=0x4000, size=0x4000):
+    # type: (Path, int, int) -> bytes
+    """Return the raw MD5 digest of one slice of the file at ``path``.
+
+    :param path: object exposing ``open("rb")`` that returns a binary file.
+    :param int offset: byte position where the slice starts. The default
+        (0x4000, i.e. 16 KiB) skips the beginning of the file; for audio
+        files the caller should pass the position where audio data starts.
+    :param int size: maximum number of bytes to hash (default 0x4000).
+    :returns: 16-byte digest of the bytes actually read.
+
+    If the file ends before ``offset`` nothing is read, so the result is the
+    digest of empty input. Errors from opening, seeking or reading propagate
+    to the caller.
+    """
+    with path.open("rb") as fp:
+        fp.seek(offset)
+        partialdata = fp.read(size)
+    return hashlib.md5(partialdata).digest()
+
+
+class FilesDB:
+    """SQLite-backed cache of MD5 digests, keyed by file path.
+
+    A cached digest is only reused when the file's current size and
+    ``st_mtime_ns`` both match the stored row; otherwise the digest is
+    recomputed and the row is replaced. The connection is opened with
+    ``check_same_thread=False``, so every database access is serialized
+    through ``self.lock``.
+
+    ``close()`` commits and releases the connection, but the object stays
+    usable: the next lookup transparently reopens the database. This lets
+    callers flush the cache after each scan without breaking later scans.
+    """
+
+    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
+    select_query = "SELECT md5, md5partial FROM files WHERE path=? AND size=? and mtime_ns=?"
+    insert_query = "REPLACE INTO files (path, size, mtime_ns, entry_dt, md5, md5partial) VALUES (?, ?, ?, datetime('now'), ?, ?)"
+
+    def __init__(self, path):
+        # type: (str, ) -> None
+        """Open the cache database at ``path``, creating it if needed."""
+        # The lock is created first so it exists before anything touches the
+        # connection.
+        self.lock = Lock()
+        self._db_path = path
+        self.conn = None
+        self.cur = None
+        self._connect()
+
+    def _connect(self):
+        """(Re)open the connection and make sure the table exists."""
+        self.conn = sqlite3.connect(self._db_path, check_same_thread=False)
+        self.cur = self.conn.cursor()
+        self.setup()
+
+    def setup(self):
+        """Create the ``files`` table if it does not exist yet."""
+        self.cur.execute(self.create_table_query)
+
+    def _lookup(self, path, index, compute):
+        """Return the cached digest at ``index`` of the (md5, md5partial) pair for ``path``, computing and storing it on a miss."""
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            if self.conn is None:
+                # close() was called earlier (e.g. after a previous scan).
+                self._connect()
+
+            self.cur.execute(self.select_query, (str(path), size, mtime_ns))
+            row = self.cur.fetchone()
+            # Keep whichever digest is already cached so the REPLACE below
+            # does not wipe the other column.
+            hashes = list(row) if row else [None, None]
+
+            if hashes[index]:
+                return hashes[index]
+
+            hashes[index] = compute(path)
+            self.cur.execute(self.insert_query, (str(path), size, mtime_ns, hashes[0], hashes[1]))
+            return hashes[index]
+
+    def get_md5(self, path):
+        # type: (Path, ) -> bytes
+        """Return the full-file MD5 digest of ``path``, using the cache when valid.
+
+        Raises whatever ``path.stat()``, the hashing function or sqlite3 raise.
+        """
+        return self._lookup(path, 0, calc_md5)
+
+    def get_md5partial(self, path):
+        # type: (Path, ) -> bytes
+        """Return the partial MD5 digest of ``path``, using the cache when valid.
+
+        Raises whatever ``path.stat()``, the hashing function or sqlite3 raise.
+        """
+        return self._lookup(path, 1, calc_md5partial)
+
+    def close(self):
+        """Commit pending rows and release the connection.
+
+        Safe to call repeatedly; a later lookup reopens the database.
+        """
+        logging.debug("Closing FilesDB")
+
+        with self.lock:
+            if self.conn is None:
+                return
+            try:
+                self.conn.commit()
+            finally:
+                # Always release the handle, even if the commit failed.
+                self.conn.close()
+                self.conn = None
+                self.cur = None
+
+
+class FilesDBDummy:
+    """Cache-less stand-in exposing the same methods as ``FilesDB``.
+
+    Every lookup hashes the file directly; nothing is stored.
+    """
+
+    def get_md5(self, path):
+        """Return the full-file MD5 digest of ``path``, computed on every call."""
+        digest = calc_md5(path)
+        return digest
+
+    def get_md5partial(self, path):
+        """Return the partial MD5 digest of ``path``, computed on every call."""
+        digest = calc_md5partial(path)
+        return digest
+
+    def close(self):
+        """Do nothing; there is no resource to release."""
+        return None
+
+
 class Directories:
    """Holds user folder selection.

@ -55,11 +169,15 @@ class Directories:
    """

    # ---Override
-    def __init__(self, exclude_list=None):
+    def __init__(self, exclude_list=None, hash_cache_file=None):
        self._dirs = []
        # {path: state}
        self.states = {}
        self._exclude_list = exclude_list
+        if hash_cache_file:
+            self.filesdb = FilesDB(hash_cache_file)
+        else:
+            self.filesdb = FilesDBDummy()

    def __contains__(self, path):
        for p in self._dirs:
@ -103,19 +221,19 @@ class Directories:
                if state != DirectoryState.EXCLUDED:
                    # Old logic
                    if self._exclude_list is None or not self._exclude_list.mark_count:
-                        found_files = [fs.get_file(root_path + f, fileclasses=fileclasses) for f in files]
+                        found_files = [fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses) for f in files]
                    else:
                        found_files = []
                        # print(f"len of files: {len(files)} {files}")
                        for f in files:
                            if not self._exclude_list.is_excluded(root, f):
-                                found_files.append(fs.get_file(root_path + f, fileclasses=fileclasses))
+                                found_files.append(fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses))
                    found_files = [f for f in found_files if f is not None]
                    # In some cases, directories can be considered as files by dupeGuru, which is
                    # why we have this line below. In fact, there only one case: Bundle files under
                    # OS X... In other situations, this forloop will do nothing.
                    for d in dirs[:]:
-                        f = fs.get_file(root_path + d, fileclasses=fileclasses)
+                        f = fs.get_file(root_path + d, self.filesdb, fileclasses=fileclasses)
                        if f is not None:
                            found_files.append(f)
                            dirs.remove(d)
@ -200,7 +318,7 @@ class Directories:
            folderclass = fs.Folder
        folder_count = 0
        for path in self._dirs:
-            from_folder = folderclass(path)
+            from_folder = folderclass(path, self.filesdb)
            for folder in self._get_folders(from_folder, j):
                folder_count += 1
                if type(j) != job.NullJob:
@ -286,6 +404,9 @@ class Directories:
            tree = ET.ElementTree(root)
            tree.write(fp, encoding="utf-8")

+    def save_hashes(self):
+        """Flush the hash cache by calling ``close()`` on ``self.filesdb``.
+
+        With a ``FilesDB`` backend this commits the rows gathered so far;
+        with ``FilesDBDummy`` it does nothing.
+        """
+        self.filesdb.close()
+
    def set_state(self, path, state):
        """Set the state of folder at ``path``.

--- a/core/fs.py
+++ b/core/fs.py
@ -85,10 +85,11 @@ class File:
    # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
    # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
    # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
-    __slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())
+    __slots__ = ("path", "db", "is_ref", "words") + tuple(INITIAL_INFO.keys())

-    def __init__(self, path):
+    def __init__(self, path, db):
        self.path = path
+        self.db = db
        for attrname in self.INITIAL_INFO:
            setattr(self, attrname, NOT_SET)

@ -107,11 +108,6 @@ class File:
                result = self.INITIAL_INFO[attrname]
        return result

-    # This offset is where we should start reading the file to get a partial md5
-    # For audio file, it should be where audio data starts
-    def _get_md5partial_offset_and_size(self):
-        return (0x4000, 0x4000)  # 16Kb
-
    def _read_info(self, field):
        # print(f"_read_info({field}) for {self}")
        if field in ("size", "mtime"):
@ -120,28 +116,14 @@ class File:
            self.mtime = nonone(stats.st_mtime, 0)
        elif field == "md5partial":
            try:
-                with self.path.open("rb") as fp:
-                    offset, size = self._get_md5partial_offset_and_size()
-                    fp.seek(offset)
-                    partialdata = fp.read(size)
-                    md5 = hashlib.md5(partialdata)
-                    self.md5partial = md5.digest()
-            except Exception:
-                pass
+                self.md5partial = self.db.get_md5partial(self.path)
+            except Exception as e:
+                logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
        elif field == "md5":
            try:
-                with self.path.open("rb") as fp:
-                    md5 = hashlib.md5()
-                    filedata = fp.read(CHUNK_SIZE)
-                    while filedata:
-                        md5.update(filedata)
-                        filedata = fp.read(CHUNK_SIZE)
-                    # FIXME For python 3.8 and later
-                    # while filedata := fp.read(CHUNK_SIZE):
-                    #     md5.update(filedata)
-                    self.md5 = md5.digest()
-            except Exception:
-                pass
+                self.md5 = self.db.get_md5(self.path)
+            except Exception as e:
+                logging.warning("Couldn't get md5 for %s: %s", self.path, e)
        elif field == "md5samples":
            try:
                with self.path.open("rb") as fp:
@ -225,13 +207,13 @@ class Folder(File):

    __slots__ = File.__slots__ + ("_subfolders",)

-    def __init__(self, path):
-        File.__init__(self, path)
+    def __init__(self, path, db):
+        File.__init__(self, path, db)
        self._subfolders = None

    def _all_items(self):
        folders = self.subfolders
-        files = get_files(self.path)
+        files = get_files(self.path, self.db)
        return folders + files

    def _read_info(self, field):
@ -260,7 +242,7 @@ class Folder(File):
    def subfolders(self):
        if self._subfolders is None:
            subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
-            self._subfolders = [self.__class__(p) for p in subfolders]
+            self._subfolders = [self.__class__(p, self.db) for p in subfolders]
        return self._subfolders

    @classmethod
@ -268,7 +250,7 @@ class Folder(File):
        return not path.islink() and path.isdir()


-def get_file(path, fileclasses=[File]):
+def get_file(path, db, fileclasses=[File]):
    """Wraps ``path`` around its appropriate :class:`File` class.

    Whether a class is "appropriate" is decided by :meth:`File.can_handle`
@ -278,10 +260,10 @@ def get_file(path, fileclasses=[File]):
    """
    for fileclass in fileclasses:
        if fileclass.can_handle(path):
-            return fileclass(path)
+            return fileclass(path, db)


-def get_files(path, fileclasses=[File]):
+def get_files(path, db, fileclasses=[File]):
    """Returns a list of :class:`File` for each file contained in ``path``.

    :param Path path: path to scan
@ -291,7 +273,7 @@ def get_files(path, fileclasses=[File]):
    try:
        result = []
        for path in path.listdir():
-            file = get_file(path, fileclasses=fileclasses)
+            file = get_file(path, db, fileclasses=fileclasses)
            if file is not None:
                result.append(file)
        return result