change FilesDB to singleton class

move hash calculation back into the Files class
clear cache now clears hash cache in addition to picture cache
This commit is contained in:
Dobatymo 2021-10-29 12:22:12 +08:00
parent 1ea108fc2b
commit 9753afba74
5 changed files with 149 additions and 146 deletions

View File

@ -139,7 +139,8 @@ class DupeGuru(Broadcaster):
self.discarded_file_count = 0 self.discarded_file_count = 0
self.exclude_list = ExcludeList() self.exclude_list = ExcludeList()
hash_cache_file = op.join(self.appdata, "hash_cache.db") hash_cache_file = op.join(self.appdata, "hash_cache.db")
self.directories = directories.Directories(self.exclude_list, hash_cache_file) fs.filesdb.connect(hash_cache_file)
self.directories = directories.Directories(self.exclude_list)
self.results = results.Results(self) self.results = results.Results(self)
self.ignore_list = IgnoreList() self.ignore_list = IgnoreList()
# In addition to "app-level" options, this dictionary also holds options that will be # In addition to "app-level" options, this dictionary also holds options that will be
@ -422,6 +423,9 @@ class DupeGuru(Broadcaster):
except FileNotFoundError: except FileNotFoundError:
pass # we don't care pass # we don't care
def clear_hash_cache(self):
fs.filesdb.clear()
def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType): def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType):
source_path = dupe.path source_path = dupe.path
location_path = first(p for p in self.directories if dupe.path in p) location_path = first(p for p in self.directories if dupe.path in p)
@ -753,6 +757,9 @@ class DupeGuru(Broadcaster):
self.exclude_list.save_to_xml(p) self.exclude_list.save_to_xml(p)
self.notify("save_session") self.notify("save_session")
def close(self):
fs.filesdb.close()
def save_as(self, filename): def save_as(self, filename):
"""Save results in ``filename``. """Save results in ``filename``.

View File

@ -5,11 +5,8 @@
# http://www.gnu.org/licenses/gpl-3.0.html # http://www.gnu.org/licenses/gpl-3.0.html
import os import os
import hashlib
import sqlite3
from xml.etree import ElementTree as ET from xml.etree import ElementTree as ET
import logging import logging
from threading import Lock
from hscommon.jobprogress import job from hscommon.jobprogress import job
from hscommon.path import Path from hscommon.path import Path
@ -47,117 +44,6 @@ class InvalidPathError(Exception):
"""The path being added is invalid""" """The path being added is invalid"""
def calc_md5(path):
    # type: (Path, ) -> bytes
    """Compute the full MD5 digest of the file at *path*.

    Reads in 1 MiB chunks so arbitrarily large files never need to fit in
    memory, while the chunk is large enough that the Python-level loop cost
    stays negligible.
    """
    CHUNK_SIZE = 1024 * 1024  # 1 mb
    hasher = hashlib.md5()
    with path.open("rb") as fp:
        for chunk in iter(lambda: fp.read(CHUNK_SIZE), b""):
            hasher.update(chunk)
    return hasher.digest()
def calc_md5partial(path):
    # type: (Path, ) -> bytes
    """MD5 of a small window of the file, used as a cheap pre-filter.

    Hashes 0x4000 bytes starting at offset 0x4000; for audio files the
    offset is meant to land past the header, where audio data starts.
    """
    OFFSET = 0x4000
    SIZE = 0x4000
    with path.open("rb") as fp:
        fp.seek(OFFSET)
        window = fp.read(SIZE)
    return hashlib.md5(window).digest()
class FilesDB:
    """SQLite-backed cache of md5/md5partial digests keyed by (path, size, mtime_ns).

    A cached row is only reused when size and mtime still match, so a file
    that changed on disk is transparently re-hashed. A single lock serializes
    cursor access (the connection is opened with check_same_thread=False).
    """

    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    select_query = "SELECT md5, md5partial FROM files WHERE path=? AND size=? and mtime_ns=?"
    insert_query = "REPLACE INTO files (path, size, mtime_ns, entry_dt, md5, md5partial) VALUES (?, ?, ?, datetime('now'), ?, ?)"

    def __init__(self, path):
        # type: (str, ) -> None
        self.conn = sqlite3.connect(path, check_same_thread=False)
        self.cur = self.conn.cursor()
        self.setup()
        self.lock = Lock()

    def setup(self):
        """Create the cache table if this is a fresh database."""
        self.cur.execute(self.create_table_query)

    def get_md5(self, path):
        # type: (Path, ) -> bytes
        """Return the full digest for *path*, computing and caching it on a miss."""
        st = path.stat()
        row_key = (str(path), st.st_size, st.st_mtime_ns)
        with self.lock:
            self.cur.execute(self.select_query, row_key)
            row = self.cur.fetchone()
            md5, md5partial = row if row else (None, None)
            if md5:
                return md5
            md5 = calc_md5(path)
            self.cur.execute(self.insert_query, row_key + (md5, md5partial))
            return md5

    def get_md5partial(self, path):
        # type: (Path, ) -> bytes
        """Return the partial digest for *path*, computing and caching it on a miss."""
        st = path.stat()
        row_key = (str(path), st.st_size, st.st_mtime_ns)
        with self.lock:
            self.cur.execute(self.select_query, row_key)
            row = self.cur.fetchone()
            md5, md5partial = row if row else (None, None)
            if md5partial:
                return md5partial
            md5partial = calc_md5partial(path)
            self.cur.execute(self.insert_query, row_key + (md5, md5partial))
            return md5partial

    def close(self):
        """Flush pending writes, then shut down the connection."""
        logging.debug("Closing FilesDB")
        self.conn.commit()
        self.conn.close()
class FilesDBDummy:
    """No-op stand-in used when no cache file is configured.

    Presents the same interface as FilesDB but never persists anything:
    every call recomputes the digest from disk.
    """

    def get_md5(self, path):
        """Always recompute the full digest; nothing is cached."""
        return calc_md5(path)

    def get_md5partial(self, path):
        """Always recompute the partial digest; nothing is cached."""
        return calc_md5partial(path)

    def close(self):
        """Nothing to flush or release."""
        pass
class Directories: class Directories:
"""Holds user folder selection. """Holds user folder selection.
@ -169,15 +55,11 @@ class Directories:
""" """
# ---Override # ---Override
def __init__(self, exclude_list=None, hash_cache_file=None): def __init__(self, exclude_list=None):
self._dirs = [] self._dirs = []
# {path: state} # {path: state}
self.states = {} self.states = {}
self._exclude_list = exclude_list self._exclude_list = exclude_list
if hash_cache_file:
self.filesdb = FilesDB(hash_cache_file)
else:
self.filesdb = FilesDBDummy()
def __contains__(self, path): def __contains__(self, path):
for p in self._dirs: for p in self._dirs:
@ -221,19 +103,19 @@ class Directories:
if state != DirectoryState.EXCLUDED: if state != DirectoryState.EXCLUDED:
# Old logic # Old logic
if self._exclude_list is None or not self._exclude_list.mark_count: if self._exclude_list is None or not self._exclude_list.mark_count:
found_files = [fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses) for f in files] found_files = [fs.get_file(root_path + f, fileclasses=fileclasses) for f in files]
else: else:
found_files = [] found_files = []
# print(f"len of files: {len(files)} {files}") # print(f"len of files: {len(files)} {files}")
for f in files: for f in files:
if not self._exclude_list.is_excluded(root, f): if not self._exclude_list.is_excluded(root, f):
found_files.append(fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses)) found_files.append(fs.get_file(root_path + f, fileclasses=fileclasses))
found_files = [f for f in found_files if f is not None] found_files = [f for f in found_files if f is not None]
# In some cases, directories can be considered as files by dupeGuru, which is # In some cases, directories can be considered as files by dupeGuru, which is
# why we have this line below. In fact, there only one case: Bundle files under # why we have this line below. In fact, there only one case: Bundle files under
# OS X... In other situations, this forloop will do nothing. # OS X... In other situations, this forloop will do nothing.
for d in dirs[:]: for d in dirs[:]:
f = fs.get_file(root_path + d, self.filesdb, fileclasses=fileclasses) f = fs.get_file(root_path + d, fileclasses=fileclasses)
if f is not None: if f is not None:
found_files.append(f) found_files.append(f)
dirs.remove(d) dirs.remove(d)
@ -318,7 +200,7 @@ class Directories:
folderclass = fs.Folder folderclass = fs.Folder
folder_count = 0 folder_count = 0
for path in self._dirs: for path in self._dirs:
from_folder = folderclass(path, self.filesdb) from_folder = folderclass(path)
for folder in self._get_folders(from_folder, j): for folder in self._get_folders(from_folder, j):
folder_count += 1 folder_count += 1
if type(j) != job.NullJob: if type(j) != job.NullJob:
@ -405,7 +287,7 @@ class Directories:
tree.write(fp, encoding="utf-8") tree.write(fp, encoding="utf-8")
def save_hashes(self): def save_hashes(self):
self.filesdb.close() fs.filesdb.commit()
def set_state(self, path, state): def set_state(self, path, state):
"""Set the state of folder at ``path``. """Set the state of folder at ``path``.

View File

@ -14,7 +14,11 @@
import hashlib import hashlib
from math import floor from math import floor
import logging import logging
import sqlite3
from threading import Lock
from typing import Any
from hscommon.path import Path
from hscommon.util import nonone, get_file_ext from hscommon.util import nonone, get_file_ext
__all__ = [ __all__ = [
@ -78,6 +82,82 @@ class OperationError(FSError):
cls_message = "Operation on '{name}' failed." cls_message = "Operation on '{name}' failed."
class FilesDB:
    """Process-wide singleton cache mapping (path, size, mtime_ns) to file hashes.

    Backed by SQLite. A cached value is only returned while size and mtime
    still match, so files changed on disk are transparently re-hashed by the
    caller. All cursor access is serialized by one lock (the connection is
    opened with check_same_thread=False so worker threads may use it).
    """

    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    drop_table_query = "DROP TABLE files;"
    select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
    insert_query = """
    INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
    ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value;
    """

    def __init__(self):
        # The connection is opened lazily via connect(). The lock exists from
        # construction (bug fix: it used to be None until connect(), so any
        # early call died with a confusing ``TypeError`` on ``with None``).
        self.conn = None
        self.cur = None
        self.lock = Lock()

    def connect(self, path):
        # type: (str, ) -> None
        """Open (or create) the cache database at *path* and ensure the table exists."""
        with self.lock:
            if self.conn is not None:
                # Bug fix: reconnecting used to leak the previous connection.
                self.conn.close()
            self.conn = sqlite3.connect(path, check_same_thread=False)
            self.cur = self.conn.cursor()
            self.cur.execute(self.create_table_query)

    def clear(self):
        # type: () -> None
        """Drop every cached hash and recreate an empty table."""
        with self.lock:
            self.cur.execute(self.drop_table_query)
            self.cur.execute(self.create_table_query)

    def get(self, path, key):
        # type: (Path, str) -> bytes
        """Return the cached *key* column ("md5" or "md5partial") for *path*.

        Returns None on a cache miss, including when size/mtime no longer
        match the stored row (i.e. the file changed since it was hashed).
        """
        stat = path.stat()
        size = stat.st_size
        mtime_ns = stat.st_mtime_ns
        with self.lock:
            # key is an internal column name chosen by our own callers, never
            # user input, so building the SQL with .format() is safe here.
            self.cur.execute(self.select_query.format(key=key), {"path": str(path), "size": size, "mtime_ns": mtime_ns})
            result = self.cur.fetchone()
        if result:
            return result[0]
        return None

    def put(self, path, key, value):
        # type: (Path, str, Any) -> None
        """Store *value* in column *key* for *path*, refreshing size/mtime to current."""
        stat = path.stat()
        size = stat.st_size
        mtime_ns = stat.st_mtime_ns
        with self.lock:
            self.cur.execute(
                self.insert_query.format(key=key),
                {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value},
            )

    def commit(self):
        # type: () -> None
        """Flush pending writes to disk."""
        with self.lock:
            self.conn.commit()

    def close(self):
        # type: () -> None
        """Commit pending writes, then close cursor and connection.

        Bug fix: close() previously closed without committing, silently
        discarding every hash cached since the last explicit commit()
        (the pre-singleton implementation committed on close).
        """
        with self.lock:
            self.conn.commit()
            self.cur.close()
            self.conn.close()
            self.conn = None
            self.cur = None


filesdb = FilesDB()  # Singleton
class File: class File:
"""Represents a file and holds metadata to be used for scanning.""" """Represents a file and holds metadata to be used for scanning."""
@ -85,11 +165,10 @@ class File:
# Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
# files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
# even greater when we take into account read attributes (70%!). Yeah, it's worth it. # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
__slots__ = ("path", "db", "is_ref", "words") + tuple(INITIAL_INFO.keys()) __slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())
def __init__(self, path, db): def __init__(self, path):
self.path = path self.path = path
self.db = db
for attrname in self.INITIAL_INFO: for attrname in self.INITIAL_INFO:
setattr(self, attrname, NOT_SET) setattr(self, attrname, NOT_SET)
@ -108,6 +187,33 @@ class File:
result = self.INITIAL_INFO[attrname] result = self.INITIAL_INFO[attrname]
return result return result
def _calc_md5(self):
# type: () -> bytes
with self.path.open("rb") as fp:
md5 = hashlib.md5()
# The goal here is to not run out of memory on really big files. However, the chunk
# size has to be large enough so that the python loop isn't too costly in terms of
# CPU.
CHUNK_SIZE = 1024 * 1024 # 1 mb
filedata = fp.read(CHUNK_SIZE)
while filedata:
md5.update(filedata)
filedata = fp.read(CHUNK_SIZE)
return md5.digest()
def _calc_md5partial(self):
# type: () -> bytes
# This offset is where we should start reading the file to get a partial md5
# For audio file, it should be where audio data starts
offset, size = (0x4000, 0x4000)
with self.path.open("rb") as fp:
fp.seek(offset)
partialdata = fp.read(size)
return hashlib.md5(partialdata).digest()
def _read_info(self, field): def _read_info(self, field):
# print(f"_read_info({field}) for {self}") # print(f"_read_info({field}) for {self}")
if field in ("size", "mtime"): if field in ("size", "mtime"):
@ -116,12 +222,18 @@ class File:
self.mtime = nonone(stats.st_mtime, 0) self.mtime = nonone(stats.st_mtime, 0)
elif field == "md5partial": elif field == "md5partial":
try: try:
self.md5partial = self.db.get_md5partial(self.path) self.md5partial = filesdb.get(self.path, "md5partial")
if self.md5partial is None:
self.md5partial = self._calc_md5partial()
filesdb.put(self.path, "md5partial", self.md5partial)
except Exception as e: except Exception as e:
logging.warning("Couldn't get md5partial for %s: %s", self.path, e) logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
elif field == "md5": elif field == "md5":
try: try:
self.md5 = self.db.get_md5(self.path) self.md5 = filesdb.get(self.path, "md5")
if self.md5 is None:
self.md5 = self._calc_md5()
filesdb.put(self.path, "md5", self.md5)
except Exception as e: except Exception as e:
logging.warning("Couldn't get md5 for %s: %s", self.path, e) logging.warning("Couldn't get md5 for %s: %s", self.path, e)
elif field == "md5samples": elif field == "md5samples":
@ -207,13 +319,13 @@ class Folder(File):
__slots__ = File.__slots__ + ("_subfolders",) __slots__ = File.__slots__ + ("_subfolders",)
def __init__(self, path, db): def __init__(self, path):
File.__init__(self, path, db) File.__init__(self, path)
self._subfolders = None self._subfolders = None
def _all_items(self): def _all_items(self):
folders = self.subfolders folders = self.subfolders
files = get_files(self.path, self.db) files = get_files(self.path)
return folders + files return folders + files
def _read_info(self, field): def _read_info(self, field):
@ -242,7 +354,7 @@ class Folder(File):
def subfolders(self): def subfolders(self):
if self._subfolders is None: if self._subfolders is None:
subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()] subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
self._subfolders = [self.__class__(p, self.db) for p in subfolders] self._subfolders = [self.__class__(p) for p in subfolders]
return self._subfolders return self._subfolders
@classmethod @classmethod
@ -250,7 +362,7 @@ class Folder(File):
return not path.islink() and path.isdir() return not path.islink() and path.isdir()
def get_file(path, db, fileclasses=[File]): def get_file(path, fileclasses=[File]):
"""Wraps ``path`` around its appropriate :class:`File` class. """Wraps ``path`` around its appropriate :class:`File` class.
Whether a class is "appropriate" is decided by :meth:`File.can_handle` Whether a class is "appropriate" is decided by :meth:`File.can_handle`
@ -260,10 +372,10 @@ def get_file(path, db, fileclasses=[File]):
""" """
for fileclass in fileclasses: for fileclass in fileclasses:
if fileclass.can_handle(path): if fileclass.can_handle(path):
return fileclass(path, db) return fileclass(path)
def get_files(path, db, fileclasses=[File]): def get_files(path, fileclasses=[File]):
"""Returns a list of :class:`File` for each file contained in ``path``. """Returns a list of :class:`File` for each file contained in ``path``.
:param Path path: path to scan :param Path path: path to scan
@ -273,7 +385,7 @@ def get_files(path, db, fileclasses=[File]):
try: try:
result = [] result = []
for path in path.listdir(): for path in path.listdir():
file = get_file(path, db, fileclasses=fileclasses) file = get_file(path, fileclasses=fileclasses)
if file is not None: if file is not None:
result.append(file) result.append(file)
return result return result

View File

@ -129,11 +129,11 @@ class DupeGuru(QObject):
self.showDirectoriesWindow, self.showDirectoriesWindow,
), ),
( (
"actionClearPictureCache", "actionClearCache",
"Ctrl+Shift+P", "Ctrl+Shift+P",
"", "",
tr("Clear Picture Cache"), tr("Clear Cache"),
self.clearPictureCacheTriggered, self.clearCacheTriggered,
), ),
( (
"actionExcludeList", "actionExcludeList",
@ -258,6 +258,7 @@ class DupeGuru(QObject):
self.willSavePrefs.emit() self.willSavePrefs.emit()
self.prefs.save() self.prefs.save()
self.model.save() self.model.save()
self.model.close()
# Workaround for #857, hide() or close(). # Workaround for #857, hide() or close().
if self.details_dialog is not None: if self.details_dialog is not None:
self.details_dialog.close() self.details_dialog.close()
@ -288,13 +289,14 @@ class DupeGuru(QObject):
self.model.load_from(results) self.model.load_from(results)
self.recentResults.insertItem(results) self.recentResults.insertItem(results)
def clearPictureCacheTriggered(self): def clearCacheTriggered(self):
title = tr("Clear Picture Cache") title = tr("Clear Cache")
msg = tr("Do you really want to remove all your cached picture analysis?") msg = tr("Do you really want to clear the cache? This will remove all cached file hashes and picture analysis.")
if self.confirm(title, msg, QMessageBox.No): if self.confirm(title, msg, QMessageBox.No):
self.model.clear_picture_cache() self.model.clear_picture_cache()
self.model.clear_hash_cache()
active = QApplication.activeWindow() active = QApplication.activeWindow()
QMessageBox.information(active, title, tr("Picture cache cleared.")) QMessageBox.information(active, title, tr("Cache cleared."))
def ignoreListTriggered(self): def ignoreListTriggered(self):
if self.use_tabs: if self.use_tabs:

View File

@ -126,7 +126,7 @@ class DirectoriesDialog(QMainWindow):
self.menuFile.addAction(self.actionLoadResults) self.menuFile.addAction(self.actionLoadResults)
self.menuFile.addAction(self.menuLoadRecent.menuAction()) self.menuFile.addAction(self.menuLoadRecent.menuAction())
self.menuFile.addSeparator() self.menuFile.addSeparator()
self.menuFile.addAction(self.app.actionClearPictureCache) self.menuFile.addAction(self.app.actionClearCache)
self.menuFile.addSeparator() self.menuFile.addSeparator()
self.menuFile.addAction(self.actionLoadDirectories) self.menuFile.addAction(self.actionLoadDirectories)
self.menuFile.addAction(self.actionSaveDirectories) self.menuFile.addAction(self.actionSaveDirectories)