From 9753afba7425ccb58a91eac1cd3d391aba6c30d0 Mon Sep 17 00:00:00 2001
From: Dobatymo
Date: Fri, 29 Oct 2021 12:22:12 +0800
Subject: [PATCH] change FilesDB to singleton class

move hash calculation back into Files class
clear cache now clears hash cache in addition to picture cache
---
 core/app.py              |   9 ++-
 core/directories.py      | 130 ++----------------------------------
 core/fs.py               | 138 +++++++++++++++++++++++++++++++++++----
 qt/app.py                |  16 +++--
 qt/directories_dialog.py |   2 +-
 5 files changed, 149 insertions(+), 146 deletions(-)

diff --git a/core/app.py b/core/app.py
index 441bfd64..dc6210fa 100644
--- a/core/app.py
+++ b/core/app.py
@@ -139,7 +139,8 @@ class DupeGuru(Broadcaster):
         self.discarded_file_count = 0
         self.exclude_list = ExcludeList()
         hash_cache_file = op.join(self.appdata, "hash_cache.db")
-        self.directories = directories.Directories(self.exclude_list, hash_cache_file)
+        fs.filesdb.connect(hash_cache_file)
+        self.directories = directories.Directories(self.exclude_list)
         self.results = results.Results(self)
         self.ignore_list = IgnoreList()
         # In addition to "app-level" options, this dictionary also holds options that will be
@@ -422,6 +423,9 @@ class DupeGuru(Broadcaster):
         except FileNotFoundError:
             pass  # we don't care

+    def clear_hash_cache(self):
+        fs.filesdb.clear()
+
     def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType):
         source_path = dupe.path
         location_path = first(p for p in self.directories if dupe.path in p)
@@ -753,6 +757,9 @@ class DupeGuru(Broadcaster):
             self.exclude_list.save_to_xml(p)
         self.notify("save_session")

+    def close(self):
+        fs.filesdb.close()
+
     def save_as(self, filename):
         """Save results in ``filename``.

diff --git a/core/directories.py b/core/directories.py
index c50c5790..2c74a4f8 100644
--- a/core/directories.py
+++ b/core/directories.py
@@ -5,11 +5,8 @@
 # http://www.gnu.org/licenses/gpl-3.0.html

 import os
-import hashlib
-import sqlite3
 from xml.etree import ElementTree as ET
 import logging
-from threading import Lock

 from hscommon.jobprogress import job
 from hscommon.path import Path
@@ -47,117 +44,6 @@ class InvalidPathError(Exception):
     """The path being added is invalid"""


-def calc_md5(path):
-    # type: (Path, ) -> bytes
-
-    with path.open("rb") as fp:
-        md5 = hashlib.md5()
-        # The goal here is to not run out of memory on really big files. However, the chunk
-        # size has to be large enough so that the python loop isn't too costly in terms of
-        # CPU.
-        CHUNK_SIZE = 1024 * 1024  # 1 mb
-        filedata = fp.read(CHUNK_SIZE)
-        while filedata:
-            md5.update(filedata)
-            filedata = fp.read(CHUNK_SIZE)
-        return md5.digest()
-
-
-def calc_md5partial(path):
-    # type: (Path, ) -> bytes
-
-    # This offset is where we should start reading the file to get a partial md5
-    # For audio file, it should be where audio data starts
-    offset, size = (0x4000, 0x4000)
-
-    with path.open("rb") as fp:
-        fp.seek(offset)
-        partialdata = fp.read(size)
-        return hashlib.md5(partialdata).digest()
-
-
-class FilesDB:
-
-    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
-    select_query = "SELECT md5, md5partial FROM files WHERE path=? AND size=? and mtime_ns=?"
-    insert_query = "REPLACE INTO files (path, size, mtime_ns, entry_dt, md5, md5partial) VALUES (?, ?, ?, datetime('now'), ?, ?)"
-
-    def __init__(self, path):
-        # type: (str, ) -> None
-
-        self.conn = sqlite3.connect(path, check_same_thread=False)
-        self.cur = self.conn.cursor()
-        self.setup()
-        self.lock = Lock()
-
-    def setup(self):
-        self.cur.execute(self.create_table_query)
-
-    def get_md5(self, path):
-        # type: (Path, ) -> bytes
-
-        stat = path.stat()
-        size = stat.st_size
-        mtime_ns = stat.st_mtime_ns
-
-        with self.lock:
-            self.cur.execute(self.select_query, (str(path), size, mtime_ns))
-            result = self.cur.fetchone()
-
-            md5 = None
-            md5partial = None
-
-            if result:
-                md5, md5partial = result
-                if md5:
-                    return md5
-
-            md5 = calc_md5(path)
-            self.cur.execute(self.insert_query, (str(path), size, mtime_ns, md5, md5partial))
-            return md5
-
-    def get_md5partial(self, path):
-        # type: (Path, ) -> bytes
-
-        stat = path.stat()
-        size = stat.st_size
-        mtime_ns = stat.st_mtime_ns
-
-        with self.lock:
-            self.cur.execute(self.select_query, (str(path), size, mtime_ns))
-            result = self.cur.fetchone()
-
-            md5 = None
-            md5partial = None
-
-            if result:
-                md5, md5partial = result
-                if md5partial:
-                    return md5partial
-
-            md5partial = calc_md5partial(path)
-            self.cur.execute(self.insert_query, (str(path), size, mtime_ns, md5, md5partial))
-            return md5partial
-
-    def close(self):
-        logging.debug("Closing FilesDB")
-
-        self.conn.commit()
-        self.conn.close()
-
-
-class FilesDBDummy:
-
-    def get_md5(self, path):
-        return calc_md5(path)
-
-    def get_md5partial(self, path):
-        return calc_md5partial(path)
-
-    def close(self):
-        pass
-
-
 class Directories:
     """Holds user folder selection.

@@ -169,15 +55,11 @@ class Directories:
     """

     # ---Override
-    def __init__(self, exclude_list=None, hash_cache_file=None):
+    def __init__(self, exclude_list=None):
         self._dirs = []
         # {path: state}
         self.states = {}
         self._exclude_list = exclude_list
-        if hash_cache_file:
-            self.filesdb = FilesDB(hash_cache_file)
-        else:
-            self.filesdb = FilesDBDummy()

     def __contains__(self, path):
         for p in self._dirs:
@@ -221,19 +103,19 @@ class Directories:
                 if state != DirectoryState.EXCLUDED:
                     # Old logic
                     if self._exclude_list is None or not self._exclude_list.mark_count:
-                        found_files = [fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses) for f in files]
+                        found_files = [fs.get_file(root_path + f, fileclasses=fileclasses) for f in files]
                     else:
                         found_files = []
                         # print(f"len of files: {len(files)} {files}")
                         for f in files:
                             if not self._exclude_list.is_excluded(root, f):
-                                found_files.append(fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses))
+                                found_files.append(fs.get_file(root_path + f, fileclasses=fileclasses))
                     found_files = [f for f in found_files if f is not None]
                     # In some cases, directories can be considered as files by dupeGuru, which is
                     # why we have this line below. In fact, there only one case: Bundle files under
                     # OS X... In other situations, this forloop will do nothing.
                     for d in dirs[:]:
-                        f = fs.get_file(root_path + d, self.filesdb, fileclasses=fileclasses)
+                        f = fs.get_file(root_path + d, fileclasses=fileclasses)
                         if f is not None:
                             found_files.append(f)
                             dirs.remove(d)
@@ -318,7 +200,7 @@ class Directories:
         folderclass = fs.Folder
        folder_count = 0
         for path in self._dirs:
-            from_folder = folderclass(path, self.filesdb)
+            from_folder = folderclass(path)
             for folder in self._get_folders(from_folder, j):
                 folder_count += 1
                 if type(j) != job.NullJob:
@@ -405,7 +287,7 @@ class Directories:
             tree.write(fp, encoding="utf-8")

     def save_hashes(self):
-        self.filesdb.close()
+        fs.filesdb.commit()

     def set_state(self, path, state):
         """Set the state of folder at ``path``.
diff --git a/core/fs.py b/core/fs.py
index c6f5734d..9a078818 100644
--- a/core/fs.py
+++ b/core/fs.py
@@ -14,7 +14,11 @@
 import hashlib
 from math import floor
 import logging
+import sqlite3
+from threading import Lock
+from typing import Any

+from hscommon.path import Path
 from hscommon.util import nonone, get_file_ext

 __all__ = [
@@ -78,6 +82,82 @@ class OperationError(FSError):
     cls_message = "Operation on '{name}' failed."


+class FilesDB:
+
+    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
+    drop_table_query = "DROP TABLE files;"
+    select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
+    insert_query = """
+        INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
+        ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value;
+    """
+
+    def __init__(self):
+        self.conn = None
+        self.cur = None
+        self.lock = None
+
+    def connect(self, path):
+        # type: (str, ) -> None
+
+        self.conn = sqlite3.connect(path, check_same_thread=False)
+        self.cur = self.conn.cursor()
+        self.cur.execute(self.create_table_query)
+        self.lock = Lock()
+
+    def clear(self):
+        # type: () -> None
+
+        with self.lock:
+            self.cur.execute(self.drop_table_query)
+            self.cur.execute(self.create_table_query)
+
+    def get(self, path, key):
+        # type: (Path, str) -> bytes
+
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            self.cur.execute(self.select_query.format(key=key), {"path": str(path), "size": size, "mtime_ns": mtime_ns})
+            result = self.cur.fetchone()
+
+        if result:
+            return result[0]
+
+        return None
+
+    def put(self, path, key, value):
+        # type: (Path, str, Any) -> None
+
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            self.cur.execute(
+                self.insert_query.format(key=key),
+                {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value},
+            )
+
+    def commit(self):
+        # type: () -> None
+
+        with self.lock:
+            self.conn.commit()
+
+    def close(self):
+        # type: () -> None
+
+        with self.lock:
+            self.cur.close()
+            self.conn.close()
+
+
+filesdb = FilesDB()  # Singleton
+
+
 class File:
     """Represents a file and holds metadata to be used for scanning."""

     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
     # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
     # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
-    __slots__ = ("path", "db", "is_ref", "words") + tuple(INITIAL_INFO.keys())
+    __slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())

-    def __init__(self, path, db):
+    def __init__(self, path):
         self.path = path
-        self.db = db
         for attrname in self.INITIAL_INFO:
             setattr(self, attrname, NOT_SET)
@@ -108,6 +187,33 @@ class File:
             result = self.INITIAL_INFO[attrname]
         return result

+    def _calc_md5(self):
+        # type: () -> bytes
+
+        with self.path.open("rb") as fp:
+            md5 = hashlib.md5()
+            # The goal here is to not run out of memory on really big files. However, the chunk
+            # size has to be large enough so that the python loop isn't too costly in terms of
+            # CPU.
+            CHUNK_SIZE = 1024 * 1024  # 1 mb
+            filedata = fp.read(CHUNK_SIZE)
+            while filedata:
+                md5.update(filedata)
+                filedata = fp.read(CHUNK_SIZE)
+            return md5.digest()
+
+    def _calc_md5partial(self):
+        # type: () -> bytes
+
+        # This offset is where we should start reading the file to get a partial md5
+        # For audio file, it should be where audio data starts
+        offset, size = (0x4000, 0x4000)
+
+        with self.path.open("rb") as fp:
+            fp.seek(offset)
+            partialdata = fp.read(size)
+            return hashlib.md5(partialdata).digest()
+
     def _read_info(self, field):
         # print(f"_read_info({field}) for {self}")
         if field in ("size", "mtime"):
             stats = self.path.stat()
             self.size = nonone(stats.st_size, 0)
             self.mtime = nonone(stats.st_mtime, 0)
         elif field == "md5partial":
             try:
-                self.md5partial = self.db.get_md5partial(self.path)
+                self.md5partial = filesdb.get(self.path, "md5partial")
+                if self.md5partial is None:
+                    self.md5partial = self._calc_md5partial()
+                    filesdb.put(self.path, "md5partial", self.md5partial)
             except Exception as e:
                 logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
         elif field == "md5":
             try:
-                self.md5 = self.db.get_md5(self.path)
+                self.md5 = filesdb.get(self.path, "md5")
+                if self.md5 is None:
+                    self.md5 = self._calc_md5()
+                    filesdb.put(self.path, "md5", self.md5)
             except Exception as e:
                 logging.warning("Couldn't get md5 for %s: %s", self.path, e)
         elif field == "md5samples":
@@ -207,13 +319,13 @@ class Folder(File):

     __slots__ = File.__slots__ + ("_subfolders",)

-    def __init__(self, path, db):
-        File.__init__(self, path, db)
+    def __init__(self, path):
+        File.__init__(self, path)
         self._subfolders = None

     def _all_items(self):
         folders = self.subfolders
-        files = get_files(self.path, self.db)
+        files = get_files(self.path)
         return folders + files

     def _read_info(self, field):
@@ -242,7 +354,7 @@ class Folder(File):
     def subfolders(self):
         if self._subfolders is None:
             subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
-            self._subfolders = [self.__class__(p, self.db) for p in subfolders]
+            self._subfolders = [self.__class__(p) for p in subfolders]
         return self._subfolders

     @classmethod
@@ -250,7 +362,7 @@ class Folder(File):
         return not path.islink() and path.isdir()


-def get_file(path, db, fileclasses=[File]):
+def get_file(path, fileclasses=[File]):
     """Wraps ``path`` around its appropriate :class:`File` class.

     Whether a class is "appropriate" is decided by :meth:`File.can_handle`

     :param Path path: path to analyze.
     :param tuple fileclasses: List of candidate :class:`File` classes
     """
     for fileclass in fileclasses:
         if fileclass.can_handle(path):
-            return fileclass(path, db)
+            return fileclass(path)


-def get_files(path, db, fileclasses=[File]):
+def get_files(path, fileclasses=[File]):
     """Returns a list of :class:`File` for each file contained in ``path``.
     :param Path path: path to scan
     :param tuple fileclasses: List of candidate :class:`File` classes
     """
     try:
         result = []
         for path in path.listdir():
-            file = get_file(path, db, fileclasses=fileclasses)
+            file = get_file(path, fileclasses=fileclasses)
             if file is not None:
                 result.append(file)
         return result
diff --git a/qt/app.py b/qt/app.py
index 1626a974..06d7eeef 100644
--- a/qt/app.py
+++ b/qt/app.py
@@ -129,11 +129,11 @@ class DupeGuru(QObject):
                 self.showDirectoriesWindow,
             ),
             (
-                "actionClearPictureCache",
+                "actionClearCache",
                 "Ctrl+Shift+P",
                 "",
-                tr("Clear Picture Cache"),
-                self.clearPictureCacheTriggered,
+                tr("Clear Cache"),
+                self.clearCacheTriggered,
             ),
             (
                 "actionExcludeList",
@@ -258,6 +258,7 @@ class DupeGuru(QObject):
         self.willSavePrefs.emit()
         self.prefs.save()
         self.model.save()
+        self.model.close()
         # Workaround for #857, hide() or close().
         if self.details_dialog is not None:
             self.details_dialog.close()
@@ -288,13 +289,14 @@ class DupeGuru(QObject):
         self.model.load_from(results)
         self.recentResults.insertItem(results)

-    def clearPictureCacheTriggered(self):
-        title = tr("Clear Picture Cache")
-        msg = tr("Do you really want to remove all your cached picture analysis?")
+    def clearCacheTriggered(self):
+        title = tr("Clear Cache")
+        msg = tr("Do you really want to clear the cache? This will remove all cached file hashes and picture analysis.")
         if self.confirm(title, msg, QMessageBox.No):
             self.model.clear_picture_cache()
+            self.model.clear_hash_cache()
             active = QApplication.activeWindow()
-            QMessageBox.information(active, title, tr("Picture cache cleared."))
+            QMessageBox.information(active, title, tr("Cache cleared."))

     def ignoreListTriggered(self):
         if self.use_tabs:
diff --git a/qt/directories_dialog.py b/qt/directories_dialog.py
index 07b9a276..56a938ba 100644
--- a/qt/directories_dialog.py
+++ b/qt/directories_dialog.py
@@ -126,7 +126,7 @@ class DirectoriesDialog(QMainWindow):
         self.menuFile.addAction(self.actionLoadResults)
         self.menuFile.addAction(self.menuLoadRecent.menuAction())
         self.menuFile.addSeparator()
-        self.menuFile.addAction(self.app.actionClearPictureCache)
+        self.menuFile.addAction(self.app.actionClearCache)
         self.menuFile.addSeparator()
         self.menuFile.addAction(self.actionLoadDirectories)
         self.menuFile.addAction(self.actionSaveDirectories)
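
The caching flow this patch introduces is easiest to see in isolation. The sketch below models what the new FilesDB singleton in core/fs.py does: one shared SQLite connection guarded by a Lock, cache lookups keyed on (path, size, mtime_ns), and a per-column upsert. It is a minimal illustration using only the Python standard library and assumes SQLite 3.24+ for ON CONFLICT ... DO UPDATE; the class name HashCache, its method signatures, and the demo paths are hypothetical, not dupeGuru's.

import hashlib
import sqlite3
from threading import Lock


class HashCache:
    """Illustrative stand-in for the FilesDB singleton in core/fs.py."""

    CREATE = (
        "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, "
        "mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    )
    SELECT = "SELECT {key} FROM files WHERE path=:path AND size=:size AND mtime_ns=:mtime_ns"
    UPSERT = (
        "INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) "
        "VALUES (:path, :size, :mtime_ns, datetime('now'), :value) "
        "ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, "
        "entry_dt=datetime('now'), {key}=:value"
    )

    def __init__(self, db_path=":memory:"):
        # Same threading model as the patch: one shared connection
        # (check_same_thread=False), serialized through a Lock, because
        # hashes are computed from worker threads.
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.cur = self.conn.cursor()
        self.cur.execute(self.CREATE)
        self.lock = Lock()

    def get(self, path, size, mtime_ns, key):
        # A row only matches while size and mtime_ns are unchanged, so a
        # modified file silently misses the cache and gets re-hashed.
        with self.lock:
            self.cur.execute(self.SELECT.format(key=key),
                             {"path": path, "size": size, "mtime_ns": mtime_ns})
            row = self.cur.fetchone()
        return row[0] if row else None

    def put(self, path, size, mtime_ns, key, value):
        # Per-column upsert: only the named hash column is written, so the
        # sibling column survives; the old REPLACE INTO in directories.py
        # had to carry both columns through every write instead.
        with self.lock:
            self.cur.execute(self.UPSERT.format(key=key),
                             {"path": path, "size": size, "mtime_ns": mtime_ns,
                              "value": value})


cache = HashCache()  # module-level singleton, as in core/fs.py

# Writing md5partial first and md5 later keeps both columns populated.
cache.put("/tmp/example", 10, 1, "md5partial", hashlib.md5(b"head").digest())
cache.put("/tmp/example", 10, 1, "md5", hashlib.md5(b"whole file").digest())
assert cache.get("/tmp/example", 10, 1, "md5partial") is not None
assert cache.get("/tmp/example", 10, 1, "md5") is not None

# A changed size (or mtime_ns) is a cache miss, forcing a re-hash upstream.
assert cache.get("/tmp/example", 11, 1, "md5") is None

The last assert is the behavioural point: because path is the table's primary key and lookups also compare size and mtime_ns, a stale row is never returned and is simply overwritten in place the next time that path is hashed. The clear() method added by the patch therefore only matters for the explicit "Clear Cache" menu action, not for correctness.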