Change FilesDB to a singleton class

Move hash calculation back into the File class. Clear cache now clears the hash cache in addition to the picture cache.
2026-01-22 14:41:39 +00:00 · 2021-10-29 12:22:12 +08:00
parent 1ea108fc2b
commit 9753afba74
5 changed files with 149 additions and 146 deletions
--- a/core/fs.py
+++ b/core/fs.py
@@ -14,7 +14,11 @@
 import hashlib
 from math import floor
 import logging
+import sqlite3
+from threading import Lock
+from typing import Any

+from hscommon.path import Path
 from hscommon.util import nonone, get_file_ext

 __all__ = [
@@ -78,6 +82,82 @@ class OperationError(FSError):
    cls_message = "Operation on '{name}' failed."


+class FilesDB:
+
+    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
+    drop_table_query = "DROP TABLE files;"
+    select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
+    insert_query = """
+        INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
+        ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value;
+    """
+
+    def __init__(self):
+        self.conn = None
+        self.cur = None
+        self.lock = None
+
+    def connect(self, path):
+        # type: (str, ) -> None
+
+        self.conn = sqlite3.connect(path, check_same_thread=False)
+        self.cur = self.conn.cursor()
+        self.cur.execute(self.create_table_query)
+        self.lock = Lock()
+
+    def clear(self):
+        # type: () -> None
+
+        with self.lock:
+            self.cur.execute(self.drop_table_query)
+            self.cur.execute(self.create_table_query)
+
+    def get(self, path, key):
+        # type: (Path, str) -> bytes
+
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            self.cur.execute(self.select_query.format(key=key), {"path": str(path), "size": size, "mtime_ns": mtime_ns})
+            result = self.cur.fetchone()
+
+        if result:
+            return result[0]
+
+        return None
+
+    def put(self, path, key, value):
+        # type: (Path, str, Any) -> None
+
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            self.cur.execute(
+                self.insert_query.format(key=key),
+                {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value},
+            )
+
+    def commit(self):
+        # type: () -> None
+
+        with self.lock:
+            self.conn.commit()
+
+    def close(self):
+        # type: () -> None
+
+        with self.lock:
+            self.cur.close()
+            self.conn.close()
+
+
+filesdb = FilesDB()  # Singleton
+
+
 class File:
    """Represents a file and holds metadata to be used for scanning."""

@@ -85,11 +165,10 @@ class File:
    # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
    # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
    # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
-    __slots__ = ("path", "db", "is_ref", "words") + tuple(INITIAL_INFO.keys())
+    __slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())

-    def __init__(self, path, db):
+    def __init__(self, path):
        self.path = path
-        self.db = db
        for attrname in self.INITIAL_INFO:
            setattr(self, attrname, NOT_SET)

@@ -108,6 +187,33 @@ class File:
                result = self.INITIAL_INFO[attrname]
        return result

+    def _calc_md5(self):
+        # type: () -> bytes
+
+        with self.path.open("rb") as fp:
+            md5 = hashlib.md5()
+            # The goal here is to not run out of memory on really big files. However, the chunk
+            # size has to be large enough so that the python loop isn't too costly in terms of
+            # CPU.
+            CHUNK_SIZE = 1024 * 1024  # 1 mb
+            filedata = fp.read(CHUNK_SIZE)
+            while filedata:
+                md5.update(filedata)
+                filedata = fp.read(CHUNK_SIZE)
+            return md5.digest()
+
+    def _calc_md5partial(self):
+        # type: () -> bytes
+
+        # This offset is where we should start reading the file to get a partial md5
+        # For audio file, it should be where audio data starts
+        offset, size = (0x4000, 0x4000)
+
+        with self.path.open("rb") as fp:
+            fp.seek(offset)
+            partialdata = fp.read(size)
+            return hashlib.md5(partialdata).digest()
+
    def _read_info(self, field):
        # print(f"_read_info({field}) for {self}")
        if field in ("size", "mtime"):
@@ -116,12 +222,18 @@ class File:
            self.mtime = nonone(stats.st_mtime, 0)
        elif field == "md5partial":
            try:
-                self.md5partial = self.db.get_md5partial(self.path)
+                self.md5partial = filesdb.get(self.path, "md5partial")
+                if self.md5partial is None:
+                    self.md5partial = self._calc_md5partial()
+                    filesdb.put(self.path, "md5partial", self.md5partial)
            except Exception as e:
                logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
        elif field == "md5":
            try:
-                self.md5 = self.db.get_md5(self.path)
+                self.md5 = filesdb.get(self.path, "md5")
+                if self.md5 is None:
+                    self.md5 = self._calc_md5()
+                    filesdb.put(self.path, "md5", self.md5)
            except Exception as e:
                logging.warning("Couldn't get md5 for %s: %s", self.path, e)
        elif field == "md5samples":
@@ -207,13 +319,13 @@ class Folder(File):

    __slots__ = File.__slots__ + ("_subfolders",)

-    def __init__(self, path, db):
-        File.__init__(self, path, db)
+    def __init__(self, path):
+        File.__init__(self, path)
        self._subfolders = None

    def _all_items(self):
        folders = self.subfolders
-        files = get_files(self.path, self.db)
+        files = get_files(self.path)
        return folders + files

    def _read_info(self, field):
@@ -242,7 +354,7 @@ class Folder(File):
    def subfolders(self):
        if self._subfolders is None:
            subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
-            self._subfolders = [self.__class__(p, self.db) for p in subfolders]
+            self._subfolders = [self.__class__(p) for p in subfolders]
        return self._subfolders

    @classmethod
@@ -250,7 +362,7 @@ class Folder(File):
        return not path.islink() and path.isdir()


-def get_file(path, db, fileclasses=[File]):
+def get_file(path, fileclasses=[File]):
    """Wraps ``path`` around its appropriate :class:`File` class.

    Whether a class is "appropriate" is decided by :meth:`File.can_handle`
@@ -260,10 +372,10 @@ def get_file(path, db, fileclasses=[File]):
    """
    for fileclass in fileclasses:
        if fileclass.can_handle(path):
-            return fileclass(path, db)
+            return fileclass(path)


-def get_files(path, db, fileclasses=[File]):
+def get_files(path, fileclasses=[File]):
    """Returns a list of :class:`File` for each file contained in ``path``.

    :param Path path: path to scan
@@ -273,7 +385,7 @@ def get_files(path, db, fileclasses=[File]):
    try:
        result = []
        for path in path.listdir():
-            file = get_file(path, db, fileclasses=fileclasses)
+            file = get_file(path, fileclasses=fileclasses)
            if file is not None:
                result.append(file)
        return result