1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2025-05-08 17:59:50 +00:00

Compare commits

..

6 Commits

Author SHA1 Message Date
eb57d269fc
Update translation source files 2021-11-23 21:11:30 -06:00
34f41dc522
Merge pull request #942 from Dobatymo/hash-cache
Implement hash cache for md5 hash based on sqlite
2021-11-23 21:08:22 -06:00
Dobatymo
77460045c4 clean up abstraction 2021-10-29 15:24:47 +08:00
Dobatymo
9753afba74 change FilesDB to singleton class
move hash calculation back in to Files class
clear cache now clears hash cache in addition to picture cache
2021-10-29 15:12:40 +08:00
Dobatymo
1ea108fc2b changed cache filename 2021-10-29 15:12:40 +08:00
Dobatymo
2f02a6010d implement hash cache for md5 hash based on sqlite 2021-10-29 15:12:40 +08:00
7 changed files with 172 additions and 52 deletions

1
.gitignore vendored
View File

@ -7,6 +7,7 @@ __pycache__
.lock-waf* .lock-waf*
.tox .tox
/tags /tags
*.eggs
build build
dist dist

View File

@ -138,6 +138,8 @@ class DupeGuru(Broadcaster):
self.app_mode = AppMode.STANDARD self.app_mode = AppMode.STANDARD
self.discarded_file_count = 0 self.discarded_file_count = 0
self.exclude_list = ExcludeList() self.exclude_list = ExcludeList()
hash_cache_file = op.join(self.appdata, "hash_cache.db")
fs.filesdb.connect(hash_cache_file)
self.directories = directories.Directories(self.exclude_list) self.directories = directories.Directories(self.exclude_list)
self.results = results.Results(self) self.results = results.Results(self)
self.ignore_list = IgnoreList() self.ignore_list = IgnoreList()
@ -293,6 +295,7 @@ class DupeGuru(Broadcaster):
def _job_completed(self, jobid): def _job_completed(self, jobid):
if jobid == JobType.SCAN: if jobid == JobType.SCAN:
self._results_changed() self._results_changed()
fs.filesdb.commit()
if not self.results.groups: if not self.results.groups:
self.view.show_message(tr("No duplicates found.")) self.view.show_message(tr("No duplicates found."))
else: else:
@ -420,6 +423,9 @@ class DupeGuru(Broadcaster):
except FileNotFoundError: except FileNotFoundError:
pass # we don't care pass # we don't care
def clear_hash_cache(self):
fs.filesdb.clear()
def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType): def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType):
source_path = dupe.path source_path = dupe.path
location_path = first(p for p in self.directories if dupe.path in p) location_path = first(p for p in self.directories if dupe.path in p)
@ -751,6 +757,9 @@ class DupeGuru(Broadcaster):
self.exclude_list.save_to_xml(p) self.exclude_list.save_to_xml(p)
self.notify("save_session") self.notify("save_session")
def close(self):
fs.filesdb.close()
def save_as(self, filename): def save_as(self, filename):
"""Save results in ``filename``. """Save results in ``filename``.

View File

@ -14,7 +14,11 @@
import hashlib import hashlib
from math import floor from math import floor
import logging import logging
import sqlite3
from threading import Lock
from typing import Any
from hscommon.path import Path
from hscommon.util import nonone, get_file_ext from hscommon.util import nonone, get_file_ext
__all__ = [ __all__ = [
@ -78,6 +82,82 @@ class OperationError(FSError):
cls_message = "Operation on '{name}' failed." cls_message = "Operation on '{name}' failed."
class FilesDB:
    """Process-wide sqlite-backed cache of file hashes ("md5" / "md5partial").

    Rows are keyed by path and validated against the file's current size and
    mtime in nanoseconds, so a stale hash is never returned for a file that
    changed on disk. The connection is created with check_same_thread=False
    and every operation is serialized through a Lock, because the scanner
    touches the cache from worker threads. Use the module-level ``filesdb``
    singleton instead of instantiating this class.
    """

    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    drop_table_query = "DROP TABLE files;"
    select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
    insert_query = """
    INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
    ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value;
    """

    def __init__(self):
        # All three attributes are populated by connect(); the cache is
        # unusable (and get/put/clear will fail) until it is called.
        self.conn = None
        self.cur = None
        self.lock = None

    def connect(self, path):
        # type: (str, ) -> None
        """Open (or create) the cache database at *path* and ensure the table exists."""
        self.conn = sqlite3.connect(path, check_same_thread=False)
        self.cur = self.conn.cursor()
        self.cur.execute(self.create_table_query)
        self.lock = Lock()

    def clear(self):
        # type: () -> None
        """Drop every cached hash and recreate an empty table."""
        with self.lock:
            self.cur.execute(self.drop_table_query)
            self.cur.execute(self.create_table_query)
            # Commit right away: a user-requested cache wipe must survive
            # even if the app exits before the next scan-end commit.
            self.conn.commit()

    def get(self, path, key):
        # type: (Path, str) -> bytes
        """Return the cached *key* column for *path*, or None on a cache miss.

        A row only counts as a hit when the stored size and mtime_ns still
        match the file's current stat(), i.e. the file is unchanged.
        *key* is interpolated into the SQL as a column name — it must be a
        trusted literal ("md5" or "md5partial"), never user input.
        """
        stat = path.stat()
        size = stat.st_size
        mtime_ns = stat.st_mtime_ns

        with self.lock:
            self.cur.execute(self.select_query.format(key=key), {"path": str(path), "size": size, "mtime_ns": mtime_ns})
            result = self.cur.fetchone()

            if result:
                return result[0]

        return None

    def put(self, path, key, value):
        # type: (Path, str, Any) -> None
        """Upsert *value* into the *key* column for *path*, stamped with current stat()."""
        stat = path.stat()
        size = stat.st_size
        mtime_ns = stat.st_mtime_ns

        with self.lock:
            self.cur.execute(
                self.insert_query.format(key=key),
                {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value},
            )

    def commit(self):
        # type: () -> None
        """Flush pending puts to disk; called at the end of a scan job."""
        with self.lock:
            self.conn.commit()

    def close(self):
        # type: () -> None
        """Commit pending work and close the database.

        Safe to call even when connect() was never invoked (e.g. app shuts
        down before any scan), in which case this is a no-op.
        """
        if self.lock is None:
            return
        with self.lock:
            # Don't lose hashes computed since the last explicit commit().
            self.conn.commit()
            self.cur.close()
            self.conn.close()


filesdb = FilesDB()  # Singleton
class File: class File:
"""Represents a file and holds metadata to be used for scanning.""" """Represents a file and holds metadata to be used for scanning."""
@ -107,10 +187,32 @@ class File:
result = self.INITIAL_INFO[attrname] result = self.INITIAL_INFO[attrname]
return result return result
def _calc_md5(self):
# type: () -> bytes
with self.path.open("rb") as fp:
md5 = hashlib.md5()
# The goal here is to not run out of memory on really big files. However, the chunk
# size has to be large enough so that the python loop isn't too costly in terms of
# CPU.
CHUNK_SIZE = 1024 * 1024 # 1 mb
filedata = fp.read(CHUNK_SIZE)
while filedata:
md5.update(filedata)
filedata = fp.read(CHUNK_SIZE)
return md5.digest()
def _calc_md5partial(self):
# type: () -> bytes
# This offset is where we should start reading the file to get a partial md5 # This offset is where we should start reading the file to get a partial md5
# For audio file, it should be where audio data starts # For audio file, it should be where audio data starts
def _get_md5partial_offset_and_size(self): offset, size = (0x4000, 0x4000)
return (0x4000, 0x4000) # 16Kb
with self.path.open("rb") as fp:
fp.seek(offset)
partialdata = fp.read(size)
return hashlib.md5(partialdata).digest()
def _read_info(self, field): def _read_info(self, field):
# print(f"_read_info({field}) for {self}") # print(f"_read_info({field}) for {self}")
@ -120,28 +222,20 @@ class File:
self.mtime = nonone(stats.st_mtime, 0) self.mtime = nonone(stats.st_mtime, 0)
elif field == "md5partial": elif field == "md5partial":
try: try:
with self.path.open("rb") as fp: self.md5partial = filesdb.get(self.path, "md5partial")
offset, size = self._get_md5partial_offset_and_size() if self.md5partial is None:
fp.seek(offset) self.md5partial = self._calc_md5partial()
partialdata = fp.read(size) filesdb.put(self.path, "md5partial", self.md5partial)
md5 = hashlib.md5(partialdata) except Exception as e:
self.md5partial = md5.digest() logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
except Exception:
pass
elif field == "md5": elif field == "md5":
try: try:
with self.path.open("rb") as fp: self.md5 = filesdb.get(self.path, "md5")
md5 = hashlib.md5() if self.md5 is None:
filedata = fp.read(CHUNK_SIZE) self.md5 = self._calc_md5()
while filedata: filesdb.put(self.path, "md5", self.md5)
md5.update(filedata) except Exception as e:
filedata = fp.read(CHUNK_SIZE) logging.warning("Couldn't get md5 for %s: %s", self.path, e)
# FIXME For python 3.8 and later
# while filedata := fp.read(CHUNK_SIZE):
# md5.update(filedata)
self.md5 = md5.digest()
except Exception:
pass
elif field == "md5samples": elif field == "md5samples":
try: try:
with self.path.open("rb") as fp: with self.path.open("rb") as fp:

View File

@ -36,83 +36,83 @@ msgstr ""
msgid "Sending to Trash" msgid "Sending to Trash"
msgstr "" msgstr ""
#: core\app.py:287 #: core\app.py:289
msgid "A previous action is still hanging in there. You can't start a new one yet. Wait a few seconds, then try again." msgid "A previous action is still hanging in there. You can't start a new one yet. Wait a few seconds, then try again."
msgstr "" msgstr ""
#: core\app.py:297 #: core\app.py:300
msgid "No duplicates found." msgid "No duplicates found."
msgstr "" msgstr ""
#: core\app.py:312 #: core\app.py:315
msgid "All marked files were copied successfully." msgid "All marked files were copied successfully."
msgstr "" msgstr ""
#: core\app.py:314 #: core\app.py:317
msgid "All marked files were moved successfully." msgid "All marked files were moved successfully."
msgstr "" msgstr ""
#: core\app.py:316 #: core\app.py:319
msgid "All marked files were deleted successfully." msgid "All marked files were deleted successfully."
msgstr "" msgstr ""
#: core\app.py:318 #: core\app.py:321
msgid "All marked files were successfully sent to Trash." msgid "All marked files were successfully sent to Trash."
msgstr "" msgstr ""
#: core\app.py:323 #: core\app.py:326
msgid "Could not load file: {}" msgid "Could not load file: {}"
msgstr "" msgstr ""
#: core\app.py:379 #: core\app.py:382
msgid "'{}' already is in the list." msgid "'{}' already is in the list."
msgstr "" msgstr ""
#: core\app.py:381 #: core\app.py:384
msgid "'{}' does not exist." msgid "'{}' does not exist."
msgstr "" msgstr ""
#: core\app.py:389 #: core\app.py:392
msgid "All selected %d matches are going to be ignored in all subsequent scans. Continue?" msgid "All selected %d matches are going to be ignored in all subsequent scans. Continue?"
msgstr "" msgstr ""
#: core\app.py:463 #: core\app.py:469
msgid "Select a directory to copy marked files to" msgid "Select a directory to copy marked files to"
msgstr "" msgstr ""
#: core\app.py:465 #: core\app.py:471
msgid "Select a directory to move marked files to" msgid "Select a directory to move marked files to"
msgstr "" msgstr ""
#: core\app.py:504 #: core\app.py:510
msgid "Select a destination for your exported CSV" msgid "Select a destination for your exported CSV"
msgstr "" msgstr ""
#: core\app.py:510 core\app.py:762 core\app.py:772 #: core\app.py:516 core\app.py:771 core\app.py:781
msgid "Couldn't write to file: {}" msgid "Couldn't write to file: {}"
msgstr "" msgstr ""
#: core\app.py:533 #: core\app.py:539
msgid "You have no custom command set up. Set it up in your preferences." msgid "You have no custom command set up. Set it up in your preferences."
msgstr "" msgstr ""
#: core\app.py:689 core\app.py:701 #: core\app.py:695 core\app.py:707
msgid "You are about to remove %d files from results. Continue?" msgid "You are about to remove %d files from results. Continue?"
msgstr "" msgstr ""
#: core\app.py:737 #: core\app.py:743
msgid "{} duplicate groups were changed by the re-prioritization." msgid "{} duplicate groups were changed by the re-prioritization."
msgstr "" msgstr ""
#: core\app.py:781 #: core\app.py:790
msgid "The selected directories contain no scannable file." msgid "The selected directories contain no scannable file."
msgstr "" msgstr ""
#: core\app.py:794 #: core\app.py:803
msgid "Collecting files to scan" msgid "Collecting files to scan"
msgstr "" msgstr ""
#: core\app.py:841 #: core\app.py:850
msgid "%s (%d discarded)" msgid "%s (%d discarded)"
msgstr "" msgstr ""

View File

@ -927,3 +927,17 @@ msgstr ""
#: qt\se\preferences_dialog.py:68 #: qt\se\preferences_dialog.py:68
msgid "Ignore files larger than" msgid "Ignore files larger than"
msgstr "" msgstr ""
#: qt\app.py:135 qt\app.py:293
msgid "Clear Cache"
msgstr ""
#: qt\app.py:294
msgid ""
"Do you really want to clear the cache? This will remove all cached file "
"hashes and picture analysis."
msgstr ""
#: qt\app.py:299
msgid "Cache cleared."
msgstr ""

View File

@ -129,11 +129,11 @@ class DupeGuru(QObject):
self.showDirectoriesWindow, self.showDirectoriesWindow,
), ),
( (
"actionClearPictureCache", "actionClearCache",
"Ctrl+Shift+P", "Ctrl+Shift+P",
"", "",
tr("Clear Picture Cache"), tr("Clear Cache"),
self.clearPictureCacheTriggered, self.clearCacheTriggered,
), ),
( (
"actionExcludeList", "actionExcludeList",
@ -258,6 +258,7 @@ class DupeGuru(QObject):
self.willSavePrefs.emit() self.willSavePrefs.emit()
self.prefs.save() self.prefs.save()
self.model.save() self.model.save()
self.model.close()
# Workaround for #857, hide() or close(). # Workaround for #857, hide() or close().
if self.details_dialog is not None: if self.details_dialog is not None:
self.details_dialog.close() self.details_dialog.close()
@ -288,13 +289,14 @@ class DupeGuru(QObject):
self.model.load_from(results) self.model.load_from(results)
self.recentResults.insertItem(results) self.recentResults.insertItem(results)
def clearPictureCacheTriggered(self): def clearCacheTriggered(self):
title = tr("Clear Picture Cache") title = tr("Clear Cache")
msg = tr("Do you really want to remove all your cached picture analysis?") msg = tr("Do you really want to clear the cache? This will remove all cached file hashes and picture analysis.")
if self.confirm(title, msg, QMessageBox.No): if self.confirm(title, msg, QMessageBox.No):
self.model.clear_picture_cache() self.model.clear_picture_cache()
self.model.clear_hash_cache()
active = QApplication.activeWindow() active = QApplication.activeWindow()
QMessageBox.information(active, title, tr("Picture cache cleared.")) QMessageBox.information(active, title, tr("Cache cleared."))
def ignoreListTriggered(self): def ignoreListTriggered(self):
if self.use_tabs: if self.use_tabs:

View File

@ -126,7 +126,7 @@ class DirectoriesDialog(QMainWindow):
self.menuFile.addAction(self.actionLoadResults) self.menuFile.addAction(self.actionLoadResults)
self.menuFile.addAction(self.menuLoadRecent.menuAction()) self.menuFile.addAction(self.menuLoadRecent.menuAction())
self.menuFile.addSeparator() self.menuFile.addSeparator()
self.menuFile.addAction(self.app.actionClearPictureCache) self.menuFile.addAction(self.app.actionClearCache)
self.menuFile.addSeparator() self.menuFile.addSeparator()
self.menuFile.addAction(self.actionLoadDirectories) self.menuFile.addAction(self.actionLoadDirectories)
self.menuFile.addAction(self.actionSaveDirectories) self.menuFile.addAction(self.actionSaveDirectories)