1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2025-03-10 05:34:36 +00:00

Merge pull request #942 from Dobatymo/hash-cache

Implement hash cache for md5 hash based on sqlite
This commit is contained in:
Andrew Senetar 2021-11-23 21:08:22 -06:00 committed by GitHub
commit 34f41dc522
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 138 additions and 32 deletions

1
.gitignore vendored
View File

@@ -7,6 +7,7 @@ __pycache__
.lock-waf* .lock-waf*
.tox .tox
/tags /tags
*.eggs
build build
dist dist

View File

@@ -138,6 +138,8 @@ class DupeGuru(Broadcaster):
self.app_mode = AppMode.STANDARD self.app_mode = AppMode.STANDARD
self.discarded_file_count = 0 self.discarded_file_count = 0
self.exclude_list = ExcludeList() self.exclude_list = ExcludeList()
hash_cache_file = op.join(self.appdata, "hash_cache.db")
fs.filesdb.connect(hash_cache_file)
self.directories = directories.Directories(self.exclude_list) self.directories = directories.Directories(self.exclude_list)
self.results = results.Results(self) self.results = results.Results(self)
self.ignore_list = IgnoreList() self.ignore_list = IgnoreList()
@@ -293,6 +295,7 @@ class DupeGuru(Broadcaster):
def _job_completed(self, jobid): def _job_completed(self, jobid):
if jobid == JobType.SCAN: if jobid == JobType.SCAN:
self._results_changed() self._results_changed()
fs.filesdb.commit()
if not self.results.groups: if not self.results.groups:
self.view.show_message(tr("No duplicates found.")) self.view.show_message(tr("No duplicates found."))
else: else:
@@ -420,6 +423,9 @@ class DupeGuru(Broadcaster):
except FileNotFoundError: except FileNotFoundError:
pass # we don't care pass # we don't care
def clear_hash_cache(self):
fs.filesdb.clear()
def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType): def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType):
source_path = dupe.path source_path = dupe.path
location_path = first(p for p in self.directories if dupe.path in p) location_path = first(p for p in self.directories if dupe.path in p)
@@ -751,6 +757,9 @@ class DupeGuru(Broadcaster):
self.exclude_list.save_to_xml(p) self.exclude_list.save_to_xml(p)
self.notify("save_session") self.notify("save_session")
def close(self):
fs.filesdb.close()
def save_as(self, filename): def save_as(self, filename):
"""Save results in ``filename``. """Save results in ``filename``.

View File

@@ -14,7 +14,11 @@
import hashlib import hashlib
from math import floor from math import floor
import logging import logging
import sqlite3
from threading import Lock
from typing import Any
from hscommon.path import Path
from hscommon.util import nonone, get_file_ext from hscommon.util import nonone, get_file_ext
__all__ = [ __all__ = [
@@ -78,6 +82,82 @@ class OperationError(FSError):
cls_message = "Operation on '{name}' failed." cls_message = "Operation on '{name}' failed."
class FilesDB:
    """Persistent cache mapping (path, size, mtime_ns) to file hashes.

    Backed by a sqlite database. A cached hash is only returned when both
    the stored size and mtime_ns still match the file's current stat(), so
    entries for modified files are ignored automatically. All cursor access
    is serialized with a lock so the cache can be used from worker threads.
    """

    # Only these column names may be interpolated into the queries below.
    # The {key} placeholders are filled via str.format, so restricting the
    # accepted values guards against SQL injection through the key argument.
    _VALID_KEYS = ("md5", "md5partial")

    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    drop_table_query = "DROP TABLE files;"
    select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
    insert_query = """
        INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
        ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value;
    """

    def __init__(self):
        self.conn = None
        self.cur = None
        # Created here rather than in connect() so that clear()/commit()/
        # close() called before connect() fail on the missing cursor instead
        # of raising a confusing TypeError from `with None`.
        self.lock = Lock()

    def connect(self, path):
        # type: (str, ) -> None
        """Open (or create) the cache database at ``path``.

        check_same_thread=False is needed because hashing happens in worker
        threads; self.lock provides the required serialization.
        """
        self.conn = sqlite3.connect(path, check_same_thread=False)
        self.cur = self.conn.cursor()
        self.cur.execute(self.create_table_query)

    def clear(self):
        # type: () -> None
        """Drop all cached hashes and recreate an empty table."""
        with self.lock:
            self.cur.execute(self.drop_table_query)
            self.cur.execute(self.create_table_query)

    def _check_key(self, key):
        # type: (str) -> None
        """Reject column names that are not known hash columns."""
        if key not in self._VALID_KEYS:
            raise ValueError("Invalid hash cache key: %r" % (key,))

    def get(self, path, key):
        # type: (Path, str) -> Optional[bytes]
        """Return the cached ``key`` hash for ``path``, or None on a miss.

        A hit requires the stored size and mtime_ns to match the file's
        current stat(), so a modified file never yields a stale hash.
        """
        self._check_key(key)
        stat = path.stat()
        size = stat.st_size
        mtime_ns = stat.st_mtime_ns

        with self.lock:
            self.cur.execute(self.select_query.format(key=key), {"path": str(path), "size": size, "mtime_ns": mtime_ns})
            result = self.cur.fetchone()

        if result:
            return result[0]
        return None

    def put(self, path, key, value):
        # type: (Path, str, Any) -> None
        """Store ``value`` as the ``key`` hash of ``path`` (upsert)."""
        self._check_key(key)
        stat = path.stat()
        size = stat.st_size
        mtime_ns = stat.st_mtime_ns

        with self.lock:
            self.cur.execute(
                self.insert_query.format(key=key),
                {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value},
            )

    def commit(self):
        # type: () -> None
        """Flush pending writes to disk."""
        with self.lock:
            self.conn.commit()

    def close(self):
        # type: () -> None
        """Close cursor and connection; safe to call when never connected."""
        with self.lock:
            if self.cur is not None:
                self.cur.close()
            if self.conn is not None:
                self.conn.close()


filesdb = FilesDB()  # Singleton
class File: class File:
"""Represents a file and holds metadata to be used for scanning.""" """Represents a file and holds metadata to be used for scanning."""
@@ -107,10 +187,32 @@ class File:
result = self.INITIAL_INFO[attrname] result = self.INITIAL_INFO[attrname]
return result return result
# This offset is where we should start reading the file to get a partial md5 def _calc_md5(self):
# For audio file, it should be where audio data starts # type: () -> bytes
def _get_md5partial_offset_and_size(self):
return (0x4000, 0x4000) # 16Kb with self.path.open("rb") as fp:
md5 = hashlib.md5()
# The goal here is to not run out of memory on really big files. However, the chunk
# size has to be large enough so that the python loop isn't too costly in terms of
# CPU.
CHUNK_SIZE = 1024 * 1024 # 1 mb
filedata = fp.read(CHUNK_SIZE)
while filedata:
md5.update(filedata)
filedata = fp.read(CHUNK_SIZE)
return md5.digest()
def _calc_md5partial(self):
# type: () -> bytes
# This offset is where we should start reading the file to get a partial md5
# For audio file, it should be where audio data starts
offset, size = (0x4000, 0x4000)
with self.path.open("rb") as fp:
fp.seek(offset)
partialdata = fp.read(size)
return hashlib.md5(partialdata).digest()
def _read_info(self, field): def _read_info(self, field):
# print(f"_read_info({field}) for {self}") # print(f"_read_info({field}) for {self}")
@@ -120,28 +222,20 @@ class File:
self.mtime = nonone(stats.st_mtime, 0) self.mtime = nonone(stats.st_mtime, 0)
elif field == "md5partial": elif field == "md5partial":
try: try:
with self.path.open("rb") as fp: self.md5partial = filesdb.get(self.path, "md5partial")
offset, size = self._get_md5partial_offset_and_size() if self.md5partial is None:
fp.seek(offset) self.md5partial = self._calc_md5partial()
partialdata = fp.read(size) filesdb.put(self.path, "md5partial", self.md5partial)
md5 = hashlib.md5(partialdata) except Exception as e:
self.md5partial = md5.digest() logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
except Exception:
pass
elif field == "md5": elif field == "md5":
try: try:
with self.path.open("rb") as fp: self.md5 = filesdb.get(self.path, "md5")
md5 = hashlib.md5() if self.md5 is None:
filedata = fp.read(CHUNK_SIZE) self.md5 = self._calc_md5()
while filedata: filesdb.put(self.path, "md5", self.md5)
md5.update(filedata) except Exception as e:
filedata = fp.read(CHUNK_SIZE) logging.warning("Couldn't get md5 for %s: %s", self.path, e)
# FIXME For python 3.8 and later
# while filedata := fp.read(CHUNK_SIZE):
# md5.update(filedata)
self.md5 = md5.digest()
except Exception:
pass
elif field == "md5samples": elif field == "md5samples":
try: try:
with self.path.open("rb") as fp: with self.path.open("rb") as fp:

View File

@@ -129,11 +129,11 @@ class DupeGuru(QObject):
self.showDirectoriesWindow, self.showDirectoriesWindow,
), ),
( (
"actionClearPictureCache", "actionClearCache",
"Ctrl+Shift+P", "Ctrl+Shift+P",
"", "",
tr("Clear Picture Cache"), tr("Clear Cache"),
self.clearPictureCacheTriggered, self.clearCacheTriggered,
), ),
( (
"actionExcludeList", "actionExcludeList",
@@ -258,6 +258,7 @@ class DupeGuru(QObject):
self.willSavePrefs.emit() self.willSavePrefs.emit()
self.prefs.save() self.prefs.save()
self.model.save() self.model.save()
self.model.close()
# Workaround for #857, hide() or close(). # Workaround for #857, hide() or close().
if self.details_dialog is not None: if self.details_dialog is not None:
self.details_dialog.close() self.details_dialog.close()
@@ -288,13 +289,14 @@ class DupeGuru(QObject):
self.model.load_from(results) self.model.load_from(results)
self.recentResults.insertItem(results) self.recentResults.insertItem(results)
def clearPictureCacheTriggered(self): def clearCacheTriggered(self):
title = tr("Clear Picture Cache") title = tr("Clear Cache")
msg = tr("Do you really want to remove all your cached picture analysis?") msg = tr("Do you really want to clear the cache? This will remove all cached file hashes and picture analysis.")
if self.confirm(title, msg, QMessageBox.No): if self.confirm(title, msg, QMessageBox.No):
self.model.clear_picture_cache() self.model.clear_picture_cache()
self.model.clear_hash_cache()
active = QApplication.activeWindow() active = QApplication.activeWindow()
QMessageBox.information(active, title, tr("Picture cache cleared.")) QMessageBox.information(active, title, tr("Cache cleared."))
def ignoreListTriggered(self): def ignoreListTriggered(self):
if self.use_tabs: if self.use_tabs:

View File

@@ -126,7 +126,7 @@ class DirectoriesDialog(QMainWindow):
self.menuFile.addAction(self.actionLoadResults) self.menuFile.addAction(self.actionLoadResults)
self.menuFile.addAction(self.menuLoadRecent.menuAction()) self.menuFile.addAction(self.menuLoadRecent.menuAction())
self.menuFile.addSeparator() self.menuFile.addSeparator()
self.menuFile.addAction(self.app.actionClearPictureCache) self.menuFile.addAction(self.app.actionClearCache)
self.menuFile.addSeparator() self.menuFile.addSeparator()
self.menuFile.addAction(self.actionLoadDirectories) self.menuFile.addAction(self.actionLoadDirectories)
self.menuFile.addAction(self.actionSaveDirectories) self.menuFile.addAction(self.actionSaveDirectories)