mirror of
https://github.com/arsenetar/dupeguru.git
synced 2025-03-10 05:34:36 +00:00
change FilesDB to singleton class
Move hash calculation back into the File class. Clear cache now clears the hash cache in addition to the picture cache.
This commit is contained in:
parent
1ea108fc2b
commit
9753afba74
@ -139,7 +139,8 @@ class DupeGuru(Broadcaster):
|
||||
self.discarded_file_count = 0
|
||||
self.exclude_list = ExcludeList()
|
||||
hash_cache_file = op.join(self.appdata, "hash_cache.db")
|
||||
self.directories = directories.Directories(self.exclude_list, hash_cache_file)
|
||||
fs.filesdb.connect(hash_cache_file)
|
||||
self.directories = directories.Directories(self.exclude_list)
|
||||
self.results = results.Results(self)
|
||||
self.ignore_list = IgnoreList()
|
||||
# In addition to "app-level" options, this dictionary also holds options that will be
|
||||
@ -422,6 +423,9 @@ class DupeGuru(Broadcaster):
|
||||
except FileNotFoundError:
|
||||
pass # we don't care
|
||||
|
||||
def clear_hash_cache(self):
    """Empty the shared file-hash cache by delegating to ``fs.filesdb.clear()``."""
    fs.filesdb.clear()
|
||||
|
||||
def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType):
|
||||
source_path = dupe.path
|
||||
location_path = first(p for p in self.directories if dupe.path in p)
|
||||
@ -753,6 +757,9 @@ class DupeGuru(Broadcaster):
|
||||
self.exclude_list.save_to_xml(p)
|
||||
self.notify("save_session")
|
||||
|
||||
def close(self):
    """Shut down the shared file-hash cache by delegating to ``fs.filesdb.close()``."""
    fs.filesdb.close()
|
||||
|
||||
def save_as(self, filename):
|
||||
"""Save results in ``filename``.
|
||||
|
||||
|
@ -5,11 +5,8 @@
|
||||
# http://www.gnu.org/licenses/gpl-3.0.html
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
import sqlite3
|
||||
from xml.etree import ElementTree as ET
|
||||
import logging
|
||||
from threading import Lock
|
||||
|
||||
from hscommon.jobprogress import job
|
||||
from hscommon.path import Path
|
||||
@ -47,117 +44,6 @@ class InvalidPathError(Exception):
|
||||
"""The path being added is invalid"""
|
||||
|
||||
|
||||
def calc_md5(path):
    # type: (Path, ) -> bytes
    """Return the MD5 digest of the entire content of the file at ``path``.

    ``path`` must provide an ``open("rb")`` method returning a binary file object.
    """
    # Reading 1 MiB at a time keeps memory bounded on very large files while keeping
    # the number of Python-level iterations (and so the CPU overhead) small.
    chunk_size = 1024 * 1024
    digest = hashlib.md5()
    with path.open("rb") as stream:
        for piece in iter(lambda: stream.read(chunk_size), b""):
            digest.update(piece)
    return digest.digest()
|
||||
|
||||
|
||||
def calc_md5partial(path):
    # type: (Path, ) -> bytes
    """Return the MD5 digest of a fixed 16 KiB window of the file at ``path``.

    The window starts 0x4000 bytes into the file. A file shorter than that yields the
    digest of whatever bytes (possibly none) could be read from the offset.
    """
    # Where the partial hash starts reading. For audio files this should be where the
    # audio data begins.
    start = 0x4000
    length = 0x4000
    with path.open("rb") as stream:
        stream.seek(start)
        window = stream.read(length)
        return hashlib.md5(window).digest()
|
||||
|
||||
|
||||
class FilesDB:
    """SQLite-backed cache of full and partial MD5 hashes.

    Rows are keyed by path and record the ``size`` and ``mtime_ns`` the hashes were
    computed for; a cached hash is only returned while both still match the file.

    The connection is opened with ``check_same_thread=False`` and a single cursor is
    shared by all callers, so every statement executed on that cursor is serialized
    through ``self.lock``.
    """

    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    select_query = "SELECT md5, md5partial FROM files WHERE path=? AND size=? and mtime_ns=?"
    insert_query = "REPLACE INTO files (path, size, mtime_ns, entry_dt, md5, md5partial) VALUES (?, ?, ?, datetime('now'), ?, ?)"

    def __init__(self, path):
        # type: (str, ) -> None
        """Open (or create) the cache database stored at ``path``."""
        # The lock exists before the cursor is first used.
        self.lock = Lock()
        self.conn = sqlite3.connect(path, check_same_thread=False)
        self.cur = self.conn.cursor()
        self.setup()

    def setup(self):
        """Create the ``files`` table when it does not exist yet."""
        with self.lock:
            self.cur.execute(self.create_table_query)

    def _lookup(self, path):
        """Return ``(row_key, md5, md5partial)`` for ``path``.

        ``row_key`` is the ``(path, size, mtime_ns)`` tuple identifying the current
        state of the file; both hashes are ``None`` when no matching row is cached.
        """
        stat = path.stat()
        row_key = (str(path), stat.st_size, stat.st_mtime_ns)
        with self.lock:
            self.cur.execute(self.select_query, row_key)
            row = self.cur.fetchone()
        md5, md5partial = row if row else (None, None)
        return row_key, md5, md5partial

    def _store(self, row_key, md5, md5partial):
        """Write both hash columns for ``row_key``, replacing any existing row."""
        # Must hold the lock: the cursor is shared, and an unsynchronized execute()
        # here could reset it between another thread's execute() and fetchone().
        with self.lock:
            self.cur.execute(self.insert_query, row_key + (md5, md5partial))

    def get_md5(self, path):
        # type: (Path, ) -> bytes
        """Return the full MD5 of ``path``, computing and caching it on a miss."""
        row_key, md5, md5partial = self._lookup(path)
        if md5:
            return md5
        # Hashing happens outside the lock so slow reads do not block other threads.
        md5 = calc_md5(path)
        self._store(row_key, md5, md5partial)
        return md5

    def get_md5partial(self, path):
        # type: (Path, ) -> bytes
        """Return the partial MD5 of ``path``, computing and caching it on a miss."""
        row_key, md5, md5partial = self._lookup(path)
        if md5partial:
            return md5partial
        md5partial = calc_md5partial(path)
        self._store(row_key, md5, md5partial)
        return md5partial

    def close(self):
        """Commit pending cache writes and close the database connection."""
        logging.debug("Closing FilesDB")

        with self.lock:
            self.conn.commit()
            self.conn.close()
|
||||
|
||||
|
||||
class FilesDBDummy:
    """Cache-less stand-in exposing the ``get_md5``/``get_md5partial``/``close`` methods.

    Every lookup hashes the file directly; nothing is stored.
    """

    def get_md5(self, path):
        """Compute the full MD5 of ``path`` with ``calc_md5``."""
        digest = calc_md5(path)
        return digest

    def get_md5partial(self, path):
        """Compute the partial MD5 of ``path`` with ``calc_md5partial``."""
        digest = calc_md5partial(path)
        return digest

    def close(self):
        """Do nothing; there is no resource to release."""
        return None
|
||||
|
||||
|
||||
class Directories:
|
||||
"""Holds user folder selection.
|
||||
|
||||
@ -169,15 +55,11 @@ class Directories:
|
||||
"""
|
||||
|
||||
# ---Override
|
||||
def __init__(self, exclude_list=None, hash_cache_file=None):
|
||||
def __init__(self, exclude_list=None):
|
||||
self._dirs = []
|
||||
# {path: state}
|
||||
self.states = {}
|
||||
self._exclude_list = exclude_list
|
||||
if hash_cache_file:
|
||||
self.filesdb = FilesDB(hash_cache_file)
|
||||
else:
|
||||
self.filesdb = FilesDBDummy()
|
||||
|
||||
def __contains__(self, path):
|
||||
for p in self._dirs:
|
||||
@ -221,19 +103,19 @@ class Directories:
|
||||
if state != DirectoryState.EXCLUDED:
|
||||
# Old logic
|
||||
if self._exclude_list is None or not self._exclude_list.mark_count:
|
||||
found_files = [fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses) for f in files]
|
||||
found_files = [fs.get_file(root_path + f, fileclasses=fileclasses) for f in files]
|
||||
else:
|
||||
found_files = []
|
||||
# print(f"len of files: {len(files)} {files}")
|
||||
for f in files:
|
||||
if not self._exclude_list.is_excluded(root, f):
|
||||
found_files.append(fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses))
|
||||
found_files.append(fs.get_file(root_path + f, fileclasses=fileclasses))
|
||||
found_files = [f for f in found_files if f is not None]
|
||||
# In some cases, directories can be considered as files by dupeGuru, which is
|
||||
# why we have this line below. In fact, there only one case: Bundle files under
|
||||
# OS X... In other situations, this forloop will do nothing.
|
||||
for d in dirs[:]:
|
||||
f = fs.get_file(root_path + d, self.filesdb, fileclasses=fileclasses)
|
||||
f = fs.get_file(root_path + d, fileclasses=fileclasses)
|
||||
if f is not None:
|
||||
found_files.append(f)
|
||||
dirs.remove(d)
|
||||
@ -318,7 +200,7 @@ class Directories:
|
||||
folderclass = fs.Folder
|
||||
folder_count = 0
|
||||
for path in self._dirs:
|
||||
from_folder = folderclass(path, self.filesdb)
|
||||
from_folder = folderclass(path)
|
||||
for folder in self._get_folders(from_folder, j):
|
||||
folder_count += 1
|
||||
if type(j) != job.NullJob:
|
||||
@ -405,7 +287,7 @@ class Directories:
|
||||
tree.write(fp, encoding="utf-8")
|
||||
|
||||
def save_hashes(self):
|
||||
self.filesdb.close()
|
||||
fs.filesdb.commit()
|
||||
|
||||
def set_state(self, path, state):
|
||||
"""Set the state of folder at ``path``.
|
||||
|
138
core/fs.py
138
core/fs.py
@ -14,7 +14,11 @@
|
||||
import hashlib
|
||||
from math import floor
|
||||
import logging
|
||||
import sqlite3
|
||||
from threading import Lock
|
||||
from typing import Any
|
||||
|
||||
from hscommon.path import Path
|
||||
from hscommon.util import nonone, get_file_ext
|
||||
|
||||
__all__ = [
|
||||
@ -78,6 +82,82 @@ class OperationError(FSError):
|
||||
cls_message = "Operation on '{name}' failed."
|
||||
|
||||
|
||||
class FilesDB:
    """SQLite-backed cache of file hashes, meant to be used as one shared instance.

    Each row is keyed by path and records the ``size`` and ``mtime_ns`` its hashes were
    computed for; a lookup only hits while both still match the file on disk.

    Until :meth:`connect` is called (and again after :meth:`close`) the cache is inert:
    :meth:`get` returns ``None`` and :meth:`put`, :meth:`clear` and :meth:`commit` do
    nothing, so callers simply fall back to computing hashes themselves.

    ``key`` arguments are interpolated into the SQL text as column names and must
    therefore be trusted identifiers (``"md5"`` or ``"md5partial"``), never user input.
    """

    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    drop_table_query = "DROP TABLE files;"
    select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
    # Removes a row that describes an older version of the file (size or mtime differ).
    invalidate_query = "DELETE FROM files WHERE path=:path AND (size!=:size OR mtime_ns!=:mtime_ns)"
    insert_query = """
        INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
        ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value;
    """

    def __init__(self):
        self.conn = None
        self.cur = None
        # Created eagerly so every method can take the lock even before connect().
        self.lock = Lock()

    @staticmethod
    def _identity(path):
        """Return the named SQL parameters identifying the current state of ``path``."""
        stat = path.stat()
        return {"path": str(path), "size": stat.st_size, "mtime_ns": stat.st_mtime_ns}

    def connect(self, path):
        # type: (str, ) -> None
        """Open the database at ``path`` and make sure the ``files`` table exists."""
        with self.lock:
            # check_same_thread=False: the connection is shared across threads and
            # all access to it is serialized through self.lock instead.
            self.conn = sqlite3.connect(path, check_same_thread=False)
            self.cur = self.conn.cursor()
            self.cur.execute(self.create_table_query)

    def clear(self):
        # type: () -> None
        """Drop every cached hash by recreating the ``files`` table."""
        with self.lock:
            if self.cur is None:
                return
            self.cur.execute(self.drop_table_query)
            self.cur.execute(self.create_table_query)

    def get(self, path, key):
        # type: (Path, str) -> Optional[bytes]
        """Return the cached ``key`` hash for ``path``.

        Returns ``None`` when the cache is not connected, when no row matches the
        file's current size and mtime, or when that column has not been stored yet.
        """
        params = self._identity(path)
        with self.lock:
            if self.cur is None:
                return None
            self.cur.execute(self.select_query.format(key=key), params)
            row = self.cur.fetchone()
        return row[0] if row else None

    def put(self, path, key, value):
        # type: (Path, str, Any) -> None
        """Store ``value`` in column ``key`` for the current state of ``path``."""
        params = self._identity(path)
        with self.lock:
            if self.cur is None:
                return
            # The upsert below only rewrites the ``key`` column. Without this delete,
            # a row left over from a previous version of the file would keep its other
            # hash column while adopting the new size/mtime, and that stale hash would
            # then be served as if it belonged to the modified file.
            self.cur.execute(self.invalidate_query, params)
            params["value"] = value
            self.cur.execute(self.insert_query.format(key=key), params)

    def commit(self):
        # type: () -> None
        """Persist pending cache writes."""
        with self.lock:
            if self.conn is not None:
                self.conn.commit()

    def close(self):
        # type: () -> None
        """Close the cursor and connection; call :meth:`commit` first to keep writes."""
        with self.lock:
            if self.cur is not None:
                self.cur.close()
            if self.conn is not None:
                self.conn.close()
            # Back to the inert state so late lookups degrade to cache misses.
            self.cur = None
            self.conn = None
|
||||
|
||||
filesdb = FilesDB() # Singleton
|
||||
|
||||
|
||||
class File:
|
||||
"""Represents a file and holds metadata to be used for scanning."""
|
||||
|
||||
@ -85,11 +165,10 @@ class File:
|
||||
# Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
|
||||
# files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
|
||||
# even greater when we take into account read attributes (70%!). Yeah, it's worth it.
|
||||
__slots__ = ("path", "db", "is_ref", "words") + tuple(INITIAL_INFO.keys())
|
||||
__slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())
|
||||
|
||||
def __init__(self, path, db):
|
||||
def __init__(self, path):
|
||||
self.path = path
|
||||
self.db = db
|
||||
for attrname in self.INITIAL_INFO:
|
||||
setattr(self, attrname, NOT_SET)
|
||||
|
||||
@ -108,6 +187,33 @@ class File:
|
||||
result = self.INITIAL_INFO[attrname]
|
||||
return result
|
||||
|
||||
def _calc_md5(self):
    # type: () -> bytes
    """Return the MD5 digest of the entire content of the file at ``self.path``."""
    # Reading 1 MiB at a time keeps memory bounded on very large files while keeping
    # the number of Python-level iterations (and so the CPU overhead) small.
    chunk_size = 1024 * 1024
    digest = hashlib.md5()
    with self.path.open("rb") as stream:
        for piece in iter(lambda: stream.read(chunk_size), b""):
            digest.update(piece)
    return digest.digest()
|
||||
|
||||
def _calc_md5partial(self):
    # type: () -> bytes
    """Return the MD5 digest of a fixed 16 KiB window of the file at ``self.path``.

    The window starts 0x4000 bytes into the file. A file shorter than that yields the
    digest of whatever bytes (possibly none) could be read from the offset.
    """
    # Where the partial hash starts reading. For audio files this should be where the
    # audio data begins.
    start = 0x4000
    length = 0x4000
    with self.path.open("rb") as stream:
        stream.seek(start)
        window = stream.read(length)
        return hashlib.md5(window).digest()
|
||||
|
||||
def _read_info(self, field):
|
||||
# print(f"_read_info({field}) for {self}")
|
||||
if field in ("size", "mtime"):
|
||||
@ -116,12 +222,18 @@ class File:
|
||||
self.mtime = nonone(stats.st_mtime, 0)
|
||||
elif field == "md5partial":
|
||||
try:
|
||||
self.md5partial = self.db.get_md5partial(self.path)
|
||||
self.md5partial = filesdb.get(self.path, "md5partial")
|
||||
if self.md5partial is None:
|
||||
self.md5partial = self._calc_md5partial()
|
||||
filesdb.put(self.path, "md5partial", self.md5partial)
|
||||
except Exception as e:
|
||||
logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
|
||||
elif field == "md5":
|
||||
try:
|
||||
self.md5 = self.db.get_md5(self.path)
|
||||
self.md5 = filesdb.get(self.path, "md5")
|
||||
if self.md5 is None:
|
||||
self.md5 = self._calc_md5()
|
||||
filesdb.put(self.path, "md5", self.md5)
|
||||
except Exception as e:
|
||||
logging.warning("Couldn't get md5 for %s: %s", self.path, e)
|
||||
elif field == "md5samples":
|
||||
@ -207,13 +319,13 @@ class Folder(File):
|
||||
|
||||
__slots__ = File.__slots__ + ("_subfolders",)
|
||||
|
||||
def __init__(self, path, db):
|
||||
File.__init__(self, path, db)
|
||||
def __init__(self, path):
|
||||
File.__init__(self, path)
|
||||
self._subfolders = None
|
||||
|
||||
def _all_items(self):
|
||||
folders = self.subfolders
|
||||
files = get_files(self.path, self.db)
|
||||
files = get_files(self.path)
|
||||
return folders + files
|
||||
|
||||
def _read_info(self, field):
|
||||
@ -242,7 +354,7 @@ class Folder(File):
|
||||
def subfolders(self):
|
||||
if self._subfolders is None:
|
||||
subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
|
||||
self._subfolders = [self.__class__(p, self.db) for p in subfolders]
|
||||
self._subfolders = [self.__class__(p) for p in subfolders]
|
||||
return self._subfolders
|
||||
|
||||
@classmethod
|
||||
@ -250,7 +362,7 @@ class Folder(File):
|
||||
return not path.islink() and path.isdir()
|
||||
|
||||
|
||||
def get_file(path, db, fileclasses=[File]):
|
||||
def get_file(path, fileclasses=[File]):
|
||||
"""Wraps ``path`` around its appropriate :class:`File` class.
|
||||
|
||||
Whether a class is "appropriate" is decided by :meth:`File.can_handle`
|
||||
@ -260,10 +372,10 @@ def get_file(path, db, fileclasses=[File]):
|
||||
"""
|
||||
for fileclass in fileclasses:
|
||||
if fileclass.can_handle(path):
|
||||
return fileclass(path, db)
|
||||
return fileclass(path)
|
||||
|
||||
|
||||
def get_files(path, db, fileclasses=[File]):
|
||||
def get_files(path, fileclasses=[File]):
|
||||
"""Returns a list of :class:`File` for each file contained in ``path``.
|
||||
|
||||
:param Path path: path to scan
|
||||
@ -273,7 +385,7 @@ def get_files(path, db, fileclasses=[File]):
|
||||
try:
|
||||
result = []
|
||||
for path in path.listdir():
|
||||
file = get_file(path, db, fileclasses=fileclasses)
|
||||
file = get_file(path, fileclasses=fileclasses)
|
||||
if file is not None:
|
||||
result.append(file)
|
||||
return result
|
||||
|
16
qt/app.py
16
qt/app.py
@ -129,11 +129,11 @@ class DupeGuru(QObject):
|
||||
self.showDirectoriesWindow,
|
||||
),
|
||||
(
|
||||
"actionClearPictureCache",
|
||||
"actionClearCache",
|
||||
"Ctrl+Shift+P",
|
||||
"",
|
||||
tr("Clear Picture Cache"),
|
||||
self.clearPictureCacheTriggered,
|
||||
tr("Clear Cache"),
|
||||
self.clearCacheTriggered,
|
||||
),
|
||||
(
|
||||
"actionExcludeList",
|
||||
@ -258,6 +258,7 @@ class DupeGuru(QObject):
|
||||
self.willSavePrefs.emit()
|
||||
self.prefs.save()
|
||||
self.model.save()
|
||||
self.model.close()
|
||||
# Workaround for #857, hide() or close().
|
||||
if self.details_dialog is not None:
|
||||
self.details_dialog.close()
|
||||
@ -288,13 +289,14 @@ class DupeGuru(QObject):
|
||||
self.model.load_from(results)
|
||||
self.recentResults.insertItem(results)
|
||||
|
||||
def clearPictureCacheTriggered(self):
|
||||
title = tr("Clear Picture Cache")
|
||||
msg = tr("Do you really want to remove all your cached picture analysis?")
|
||||
def clearCacheTriggered(self):
|
||||
title = tr("Clear Cache")
|
||||
msg = tr("Do you really want to clear the cache? This will remove all cached file hashes and picture analysis.")
|
||||
if self.confirm(title, msg, QMessageBox.No):
|
||||
self.model.clear_picture_cache()
|
||||
self.model.clear_hash_cache()
|
||||
active = QApplication.activeWindow()
|
||||
QMessageBox.information(active, title, tr("Picture cache cleared."))
|
||||
QMessageBox.information(active, title, tr("Cache cleared."))
|
||||
|
||||
def ignoreListTriggered(self):
|
||||
if self.use_tabs:
|
||||
|
@ -126,7 +126,7 @@ class DirectoriesDialog(QMainWindow):
|
||||
self.menuFile.addAction(self.actionLoadResults)
|
||||
self.menuFile.addAction(self.menuLoadRecent.menuAction())
|
||||
self.menuFile.addSeparator()
|
||||
self.menuFile.addAction(self.app.actionClearPictureCache)
|
||||
self.menuFile.addAction(self.app.actionClearCache)
|
||||
self.menuFile.addSeparator()
|
||||
self.menuFile.addAction(self.actionLoadDirectories)
|
||||
self.menuFile.addAction(self.actionSaveDirectories)
|
||||
|
Loading…
x
Reference in New Issue
Block a user