From 9753afba7425ccb58a91eac1cd3d391aba6c30d0 Mon Sep 17 00:00:00 2001
From: Dobatymo
Date: Fri, 29 Oct 2021 12:22:12 +0800
Subject: [PATCH] change FilesDB to singleton class

move hash calculation back into Files class
clear cache now clears hash cache in addition to picture cache
---
 core/app.py              |   9 ++-
 core/directories.py      | 130 ++----------------------------------
 core/fs.py               | 138 +++++++++++++++++++++++++++++++++++----
 qt/app.py                |  16 +++--
 qt/directories_dialog.py |   2 +-
 5 files changed, 149 insertions(+), 146 deletions(-)

diff --git a/core/app.py b/core/app.py
index 441bfd64..dc6210fa 100644
--- a/core/app.py
+++ b/core/app.py
@@ -139,7 +139,8 @@ class DupeGuru(Broadcaster):
         self.discarded_file_count = 0
         self.exclude_list = ExcludeList()
         hash_cache_file = op.join(self.appdata, "hash_cache.db")
-        self.directories = directories.Directories(self.exclude_list, hash_cache_file)
+        fs.filesdb.connect(hash_cache_file)
+        self.directories = directories.Directories(self.exclude_list)
         self.results = results.Results(self)
         self.ignore_list = IgnoreList()
         # In addition to "app-level" options, this dictionary also holds options that will be
@@ -422,6 +423,9 @@ class DupeGuru(Broadcaster):
         except FileNotFoundError:
             pass  # we don't care

+    def clear_hash_cache(self):
+        fs.filesdb.clear()
+
     def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType):
         source_path = dupe.path
         location_path = first(p for p in self.directories if dupe.path in p)
@@ -753,6 +757,9 @@ class DupeGuru(Broadcaster):
             self.exclude_list.save_to_xml(p)
         self.notify("save_session")

+    def close(self):
+        fs.filesdb.close()
+
     def save_as(self, filename):
         """Save results in ``filename``.

diff --git a/core/directories.py b/core/directories.py
index c50c5790..2c74a4f8 100644
--- a/core/directories.py
+++ b/core/directories.py
@@ -5,11 +5,8 @@
 # http://www.gnu.org/licenses/gpl-3.0.html

 import os
-import hashlib
-import sqlite3
 from xml.etree import ElementTree as ET
 import logging
-from threading import Lock

 from hscommon.jobprogress import job
 from hscommon.path import Path
@@ -47,117 +44,6 @@ class InvalidPathError(Exception):
     """The path being added is invalid"""


-def calc_md5(path):
-    # type: (Path, ) -> bytes
-
-    with path.open("rb") as fp:
-        md5 = hashlib.md5()
-        # The goal here is to not run out of memory on really big files. However, the chunk
-        # size has to be large enough so that the python loop isn't too costly in terms of
-        # CPU.
-        CHUNK_SIZE = 1024 * 1024  # 1 mb
-        filedata = fp.read(CHUNK_SIZE)
-        while filedata:
-            md5.update(filedata)
-            filedata = fp.read(CHUNK_SIZE)
-        return md5.digest()
-
-
-def calc_md5partial(path):
-    # type: (Path, ) -> bytes
-
-    # This offset is where we should start reading the file to get a partial md5
-    # For audio file, it should be where audio data starts
-    offset, size = (0x4000, 0x4000)
-
-    with path.open("rb") as fp:
-        fp.seek(offset)
-        partialdata = fp.read(size)
-        return hashlib.md5(partialdata).digest()
-
-
-class FilesDB:
-
-    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
-    select_query = "SELECT md5, md5partial FROM files WHERE path=? AND size=? and mtime_ns=?"
-    insert_query = "REPLACE INTO files (path, size, mtime_ns, entry_dt, md5, md5partial) VALUES (?, ?, ?, datetime('now'), ?, ?)"
-
-    def __init__(self, path):
-        # type: (str, ) -> None
-
-        self.conn = sqlite3.connect(path, check_same_thread=False)
-        self.cur = self.conn.cursor()
-        self.setup()
-        self.lock = Lock()
-
-    def setup(self):
-        self.cur.execute(self.create_table_query)
-
-    def get_md5(self, path):
-        # type: (Path, ) -> bytes
-
-        stat = path.stat()
-        size = stat.st_size
-        mtime_ns = stat.st_mtime_ns
-
-        with self.lock:
-            self.cur.execute(self.select_query, (str(path), size, mtime_ns))
-            result = self.cur.fetchone()
-
-            md5 = None
-            md5partial = None
-
-            if result:
-                md5, md5partial = result
-                if md5:
-                    return md5
-
-            md5 = calc_md5(path)
-            self.cur.execute(self.insert_query, (str(path), size, mtime_ns, md5, md5partial))
-            return md5
-
-    def get_md5partial(self, path):
-        # type: (Path, ) -> bytes
-
-        stat = path.stat()
-        size = stat.st_size
-        mtime_ns = stat.st_mtime_ns
-
-        with self.lock:
-            self.cur.execute(self.select_query, (str(path), size, mtime_ns))
-            result = self.cur.fetchone()
-
-            md5 = None
-            md5partial = None
-
-            if result:
-                md5, md5partial = result
-                if md5partial:
-                    return md5partial
-
-            md5partial = calc_md5partial(path)
-            self.cur.execute(self.insert_query, (str(path), size, mtime_ns, md5, md5partial))
-            return md5partial
-
-    def close(self):
-        logging.debug("Closing FilesDB")
-
-        self.conn.commit()
-        self.conn.close()
-
-
-class FilesDBDummy:
-
-    def get_md5(self, path):
-        return calc_md5(path)
-
-    def get_md5partial(self, path):
-        return calc_md5partial(path)
-
-    def close(self):
-        pass
-
-
 class Directories:
     """Holds user folder selection.

@@ -169,15 +55,11 @@ class Directories:
     """

     # ---Override
-    def __init__(self, exclude_list=None, hash_cache_file=None):
+    def __init__(self, exclude_list=None):
         self._dirs = []
         # {path: state}
         self.states = {}
         self._exclude_list = exclude_list
-        if hash_cache_file:
-            self.filesdb = FilesDB(hash_cache_file)
-        else:
-            self.filesdb = FilesDBDummy()

     def __contains__(self, path):
         for p in self._dirs:
@@ -221,19 +103,19 @@ class Directories:
                 if state != DirectoryState.EXCLUDED:
                     # Old logic
                     if self._exclude_list is None or not self._exclude_list.mark_count:
-                        found_files = [fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses) for f in files]
+                        found_files = [fs.get_file(root_path + f, fileclasses=fileclasses) for f in files]
                     else:
                         found_files = []
                         # print(f"len of files: {len(files)} {files}")
                         for f in files:
                             if not self._exclude_list.is_excluded(root, f):
-                                found_files.append(fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses))
+                                found_files.append(fs.get_file(root_path + f, fileclasses=fileclasses))
                     found_files = [f for f in found_files if f is not None]
                     # In some cases, directories can be considered as files by dupeGuru, which is
                     # why we have this line below. In fact, there only one case: Bundle files under
                     # OS X... In other situations, this forloop will do nothing.
                     for d in dirs[:]:
-                        f = fs.get_file(root_path + d, self.filesdb, fileclasses=fileclasses)
+                        f = fs.get_file(root_path + d, fileclasses=fileclasses)
                         if f is not None:
                             found_files.append(f)
                             dirs.remove(d)
@@ -318,7 +200,7 @@ class Directories:
         folderclass = fs.Folder
        folder_count = 0
         for path in self._dirs:
-            from_folder = folderclass(path, self.filesdb)
+            from_folder = folderclass(path)
             for folder in self._get_folders(from_folder, j):
                 folder_count += 1
                 if type(j) != job.NullJob:
@@ -405,7 +287,7 @@ class Directories:
             tree.write(fp, encoding="utf-8")

     def save_hashes(self):
-        self.filesdb.close()
+        fs.filesdb.commit()

     def set_state(self, path, state):
         """Set the state of folder at ``path``.
diff --git a/core/fs.py b/core/fs.py
index c6f5734d..9a078818 100644
--- a/core/fs.py
+++ b/core/fs.py
@@ -14,7 +14,11 @@
 import hashlib
 from math import floor
 import logging
+import sqlite3
+from threading import Lock
+from typing import Any

+from hscommon.path import Path
 from hscommon.util import nonone, get_file_ext

 __all__ = [
@@ -78,6 +82,82 @@ class OperationError(FSError):
     cls_message = "Operation on '{name}' failed."


+class FilesDB:
+
+    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
+    drop_table_query = "DROP TABLE files;"
+    select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
+    insert_query = """
+        INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
+        ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value;
+    """
+
+    def __init__(self):
+        self.conn = None
+        self.cur = None
+        self.lock = None
+
+    def connect(self, path):
+        # type: (str, ) -> None
+
+        self.conn = sqlite3.connect(path, check_same_thread=False)
+        self.cur = self.conn.cursor()
+        self.cur.execute(self.create_table_query)
+        self.lock = Lock()
+
+    def clear(self):
+        # type: () -> None
+
+        with self.lock:
+            self.cur.execute(self.drop_table_query)
+            self.cur.execute(self.create_table_query)
+
+    def get(self, path, key):
+        # type: (Path, str) -> bytes
+
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            self.cur.execute(self.select_query.format(key=key), {"path": str(path), "size": size, "mtime_ns": mtime_ns})
+            result = self.cur.fetchone()
+
+        if result:
+            return result[0]
+
+        return None
+
+    def put(self, path, key, value):
+        # type: (Path, str, Any) -> None
+
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            self.cur.execute(
+                self.insert_query.format(key=key),
+                {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value},
+            )
+
+    def commit(self):
+        # type: () -> None
+
+        with self.lock:
+            self.conn.commit()
+
+    def close(self):
+        # type: () -> None
+
+        with self.lock:
+            self.cur.close()
+            self.conn.close()
+
+
+filesdb = FilesDB()  # Singleton
+
+
 class File:
     """Represents a file and holds metadata to be used for scanning."""

     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
     # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
     # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
-    __slots__ = ("path", "db", "is_ref", "words") + tuple(INITIAL_INFO.keys())
+    __slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())

-    def __init__(self, path, db):
+    def __init__(self, path):
         self.path = path
-        self.db = db
         for attrname in self.INITIAL_INFO:
             setattr(self, attrname, NOT_SET)
@@ -108,6 +187,33 @@ class File:
             result = self.INITIAL_INFO[attrname]
         return result

+    def _calc_md5(self):
+        # type: () -> bytes
+
+        with self.path.open("rb") as fp:
+            md5 = hashlib.md5()
+            # The goal here is to not run out of memory on really big files. However, the chunk
+            # size has to be large enough so that the python loop isn't too costly in terms of
+            # CPU.
+            CHUNK_SIZE = 1024 * 1024  # 1 mb
+            filedata = fp.read(CHUNK_SIZE)
+            while filedata:
+                md5.update(filedata)
+                filedata = fp.read(CHUNK_SIZE)
+            return md5.digest()
+
+    def _calc_md5partial(self):
+        # type: () -> bytes
+
+        # This offset is where we should start reading the file to get a partial md5
+        # For audio file, it should be where audio data starts
+        offset, size = (0x4000, 0x4000)
+
+        with self.path.open("rb") as fp:
+            fp.seek(offset)
+            partialdata = fp.read(size)
+            return hashlib.md5(partialdata).digest()
+
     def _read_info(self, field):
         # print(f"_read_info({field}) for {self}")
         if field in ("size", "mtime"):
             stats = self.path.stat()
             self.size = nonone(stats.st_size, 0)
             self.mtime = nonone(stats.st_mtime, 0)
         elif field == "md5partial":
             try:
-                self.md5partial = self.db.get_md5partial(self.path)
+                self.md5partial = filesdb.get(self.path, "md5partial")
+                if self.md5partial is None:
+                    self.md5partial = self._calc_md5partial()
+                    filesdb.put(self.path, "md5partial", self.md5partial)
             except Exception as e:
                 logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
         elif field == "md5":
             try:
-                self.md5 = self.db.get_md5(self.path)
+                self.md5 = filesdb.get(self.path, "md5")
+                if self.md5 is None:
+                    self.md5 = self._calc_md5()
+                    filesdb.put(self.path, "md5", self.md5)
             except Exception as e:
                 logging.warning("Couldn't get md5 for %s: %s", self.path, e)
         elif field == "md5samples":
@@ -207,13 +319,13 @@ class Folder(File):

     __slots__ = File.__slots__ + ("_subfolders",)

-    def __init__(self, path, db):
-        File.__init__(self, path, db)
+    def __init__(self, path):
+        File.__init__(self, path)
         self._subfolders = None

     def _all_items(self):
         folders = self.subfolders
-        files = get_files(self.path, self.db)
+        files = get_files(self.path)
         return folders + files

     def _read_info(self, field):
@@ -242,7 +354,7 @@ class Folder(File):
     def subfolders(self):
         if self._subfolders is None:
             subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
-            self._subfolders = [self.__class__(p, self.db) for p in subfolders]
+            self._subfolders = [self.__class__(p) for p in subfolders]
         return self._subfolders

     @classmethod
@@ -250,7 +362,7 @@ class Folder(File):
         return not path.islink() and path.isdir()


-def get_file(path, db, fileclasses=[File]):
+def get_file(path, fileclasses=[File]):
     """Wraps ``path`` around its appropriate :class:`File` class.

     Whether a class is "appropriate" is decided by :meth:`File.can_handle`

     :param Path path: path to analyze.
     :param tuple fileclasses: List of candidate :class:`File` classes
     """
     for fileclass in fileclasses:
         if fileclass.can_handle(path):
-            return fileclass(path, db)
+            return fileclass(path)


-def get_files(path, db, fileclasses=[File]):
+def get_files(path, fileclasses=[File]):
     """Returns a list of :class:`File` for each file contained in ``path``.
     :param Path path: path to scan
     :param tuple fileclasses: List of candidate :class:`File` classes
     """
     try:
         result = []
         for path in path.listdir():
-            file = get_file(path, db, fileclasses=fileclasses)
+            file = get_file(path, fileclasses=fileclasses)
             if file is not None:
                 result.append(file)
         return result
diff --git a/qt/app.py b/qt/app.py
index 1626a974..06d7eeef 100644
--- a/qt/app.py
+++ b/qt/app.py
@@ -129,11 +129,11 @@ class DupeGuru(QObject):
                 self.showDirectoriesWindow,
             ),
             (
-                "actionClearPictureCache",
+                "actionClearCache",
                 "Ctrl+Shift+P",
                 "",
-                tr("Clear Picture Cache"),
-                self.clearPictureCacheTriggered,
+                tr("Clear Cache"),
+                self.clearCacheTriggered,
             ),
             (
                 "actionExcludeList",
@@ -258,6 +258,7 @@ class DupeGuru(QObject):
         self.willSavePrefs.emit()
         self.prefs.save()
         self.model.save()
+        self.model.close()
         # Workaround for #857, hide() or close().
         if self.details_dialog is not None:
             self.details_dialog.close()
@@ -288,13 +289,14 @@ class DupeGuru(QObject):
         self.model.load_from(results)
         self.recentResults.insertItem(results)

-    def clearPictureCacheTriggered(self):
-        title = tr("Clear Picture Cache")
-        msg = tr("Do you really want to remove all your cached picture analysis?")
+    def clearCacheTriggered(self):
+        title = tr("Clear Cache")
+        msg = tr("Do you really want to clear the cache? This will remove all cached file hashes and picture analysis.")
         if self.confirm(title, msg, QMessageBox.No):
             self.model.clear_picture_cache()
+            self.model.clear_hash_cache()
             active = QApplication.activeWindow()
-            QMessageBox.information(active, title, tr("Picture cache cleared."))
+            QMessageBox.information(active, title, tr("Cache cleared."))

     def ignoreListTriggered(self):
         if self.use_tabs:
diff --git a/qt/directories_dialog.py b/qt/directories_dialog.py
index 07b9a276..56a938ba 100644
--- a/qt/directories_dialog.py
+++ b/qt/directories_dialog.py
@@ -126,7 +126,7 @@ class DirectoriesDialog(QMainWindow):
         self.menuFile.addAction(self.actionLoadResults)
         self.menuFile.addAction(self.menuLoadRecent.menuAction())
         self.menuFile.addSeparator()
-        self.menuFile.addAction(self.app.actionClearPictureCache)
+        self.menuFile.addAction(self.app.actionClearCache)
         self.menuFile.addSeparator()
         self.menuFile.addAction(self.actionLoadDirectories)
         self.menuFile.addAction(self.actionSaveDirectories)
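
The caching flow this patch introduces is easiest to see in isolation. The sketch below models what the new FilesDB singleton in core/fs.py does: one shared SQLite connection guarded by a Lock, cache lookups keyed on (path, size, mtime_ns), and a per-column upsert. It is a minimal illustration using only the Python standard library and assumes SQLite 3.24+ for ON CONFLICT ... DO UPDATE; the class name HashCache, its method signatures, and the demo paths are hypothetical, not dupeGuru's.

import hashlib
import sqlite3
from threading import Lock


class HashCache:
    """Illustrative stand-in for the FilesDB singleton in core/fs.py."""

    CREATE = (
        "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, "
        "mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    )
    SELECT = "SELECT {key} FROM files WHERE path=:path AND size=:size AND mtime_ns=:mtime_ns"
    UPSERT = (
        "INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) "
        "VALUES (:path, :size, :mtime_ns, datetime('now'), :value) "
        "ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, "
        "entry_dt=datetime('now'), {key}=:value"
    )

    def __init__(self, db_path=":memory:"):
        # Same threading model as the patch: one shared connection
        # (check_same_thread=False), serialized through a Lock, because
        # hashes are computed from worker threads.
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.cur = self.conn.cursor()
        self.cur.execute(self.CREATE)
        self.lock = Lock()

    def get(self, path, size, mtime_ns, key):
        # A row only matches while size and mtime_ns are unchanged, so a
        # modified file silently misses the cache and gets re-hashed.
        with self.lock:
            self.cur.execute(self.SELECT.format(key=key),
                             {"path": path, "size": size, "mtime_ns": mtime_ns})
            row = self.cur.fetchone()
        return row[0] if row else None

    def put(self, path, size, mtime_ns, key, value):
        # Per-column upsert: only the named hash column is written, so the
        # sibling column survives; the old REPLACE INTO in directories.py
        # had to carry both columns through every write instead.
        with self.lock:
            self.cur.execute(self.UPSERT.format(key=key),
                             {"path": path, "size": size, "mtime_ns": mtime_ns,
                              "value": value})


cache = HashCache()  # module-level singleton, as in core/fs.py

# Writing md5partial first and md5 later keeps both columns populated.
cache.put("/tmp/example", 10, 1, "md5partial", hashlib.md5(b"head").digest())
cache.put("/tmp/example", 10, 1, "md5", hashlib.md5(b"whole file").digest())
assert cache.get("/tmp/example", 10, 1, "md5partial") is not None
assert cache.get("/tmp/example", 10, 1, "md5") is not None

# A changed size (or mtime_ns) is a cache miss, forcing a re-hash upstream.
assert cache.get("/tmp/example", 11, 1, "md5") is None

The last assert is the behavioural point: because path is the table's primary key and lookups also compare size and mtime_ns, a stale row is never returned and is simply overwritten in place the next time that path is hashed. The clear() method added by the patch therefore only matters for the explicit "Clear Cache" menu action, not for correctness.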