Implement an SQLite-based hash cache for md5 digests

This commit is contained in:
Dobatymo 2020-11-27 14:49:14 +08:00
parent b80489fd66
commit 2f02a6010d
4 changed files with 147 additions and 41 deletions

1
.gitignore vendored
View File

@ -7,6 +7,7 @@ __pycache__
.lock-waf* .lock-waf*
.tox .tox
/tags /tags
*.eggs
build build
dist dist

View File

@ -138,7 +138,8 @@ class DupeGuru(Broadcaster):
self.app_mode = AppMode.STANDARD self.app_mode = AppMode.STANDARD
self.discarded_file_count = 0 self.discarded_file_count = 0
self.exclude_list = ExcludeList() self.exclude_list = ExcludeList()
self.directories = directories.Directories(self.exclude_list) hash_cache_file = op.join(self.appdata, "hash.cache")
self.directories = directories.Directories(self.exclude_list, hash_cache_file)
self.results = results.Results(self) self.results = results.Results(self)
self.ignore_list = IgnoreList() self.ignore_list = IgnoreList()
# In addition to "app-level" options, this dictionary also holds options that will be # In addition to "app-level" options, this dictionary also holds options that will be
@ -293,6 +294,7 @@ class DupeGuru(Broadcaster):
def _job_completed(self, jobid): def _job_completed(self, jobid):
if jobid == JobType.SCAN: if jobid == JobType.SCAN:
self._results_changed() self._results_changed()
self.directories.save_hashes()
if not self.results.groups: if not self.results.groups:
self.view.show_message(tr("No duplicates found.")) self.view.show_message(tr("No duplicates found."))
else: else:

View File

@ -5,8 +5,11 @@
# http://www.gnu.org/licenses/gpl-3.0.html # http://www.gnu.org/licenses/gpl-3.0.html
import os import os
import hashlib
import sqlite3
from xml.etree import ElementTree as ET from xml.etree import ElementTree as ET
import logging import logging
from threading import Lock
from hscommon.jobprogress import job from hscommon.jobprogress import job
from hscommon.path import Path from hscommon.path import Path
@ -44,6 +47,117 @@ class InvalidPathError(Exception):
"""The path being added is invalid""" """The path being added is invalid"""
def calc_md5(path):
    # type: (Path, ) -> bytes
    """Return the md5 digest of the entire file at *path*.

    Reads in 1 MiB chunks: small enough to bound memory on very large
    files, large enough that the Python-level loop cost is negligible.
    """
    chunk_size = 1024 * 1024  # 1 MiB
    hasher = hashlib.md5()
    with path.open("rb") as fp:
        # iter() with a sentinel stops on the first empty read (EOF).
        for chunk in iter(lambda: fp.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.digest()
def calc_md5partial(path):
    # type: (Path, ) -> bytes
    """Return the md5 digest of a fixed 16 KiB slice of *path*.

    The slice starts 16 KiB into the file so that, for audio files,
    hashing begins past typical header/tag data.  A file shorter than
    the offset yields the digest of the empty string.
    """
    PARTIAL_OFFSET = 0x4000  # 16 KiB
    PARTIAL_SIZE = 0x4000  # 16 KiB
    with path.open("rb") as fp:
        fp.seek(PARTIAL_OFFSET)
        chunk = fp.read(PARTIAL_SIZE)
    return hashlib.md5(chunk).digest()
class FilesDB:
    """Persistent cache of file md5 digests backed by SQLite.

    A row stores the full and partial md5 digests keyed by the file's
    path, with its size and mtime (in ns); a cache hit requires all
    three to match, so a modified file is automatically re-hashed.
    One connection/cursor is shared across threads
    (``check_same_thread=False``), so every statement runs under a
    single lock.  Nothing is committed until :meth:`close`.
    """

    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    select_query = "SELECT md5, md5partial FROM files WHERE path=? AND size=? and mtime_ns=?"
    insert_query = "REPLACE INTO files (path, size, mtime_ns, entry_dt, md5, md5partial) VALUES (?, ?, ?, datetime('now'), ?, ?)"

    def __init__(self, path):
        # type: (str, ) -> None
        """Open (creating if necessary) the cache database at *path*."""
        self.conn = sqlite3.connect(path, check_same_thread=False)
        self.cur = self.conn.cursor()
        self.setup()
        self.lock = Lock()

    def setup(self):
        """Create the cache table on first use."""
        self.cur.execute(self.create_table_query)

    def _get_digest(self, path, partial):
        # type: (Path, bool) -> bytes
        """Return the requested digest for *path*, computing and caching
        it on a miss.

        The cached value is used only when (path, size, mtime_ns) all
        match.  On a miss the REPLACE re-inserts whichever companion
        digest was already cached for the same row, so computing one
        digest never discards the other.
        """
        stat = path.stat()
        size = stat.st_size
        mtime_ns = stat.st_mtime_ns
        with self.lock:
            self.cur.execute(self.select_query, (str(path), size, mtime_ns))
            row = self.cur.fetchone()
            md5, md5partial = row if row else (None, None)
            if partial:
                if md5partial:
                    return md5partial
                md5partial = calc_md5partial(path)
                result = md5partial
            else:
                if md5:
                    return md5
                md5 = calc_md5(path)
                result = md5
            self.cur.execute(self.insert_query, (str(path), size, mtime_ns, md5, md5partial))
            return result

    def get_md5(self, path):
        # type: (Path, ) -> bytes
        """Return the full-file md5 digest of *path*, using the cache."""
        return self._get_digest(path, partial=False)

    def get_md5partial(self, path):
        # type: (Path, ) -> bytes
        """Return the partial md5 digest of *path*, using the cache."""
        return self._get_digest(path, partial=True)

    def close(self):
        """Commit pending rows and close the connection.

        Must be called to persist digests computed during the session.
        """
        logging.debug("Closing FilesDB")
        self.conn.commit()
        self.conn.close()
class FilesDBDummy:
    """No-op stand-in for :class:`FilesDB` used when no cache file is
    configured: every digest request falls through to a fresh
    computation and nothing is ever stored.
    """

    def get_md5(self, path):
        """Compute the full md5 of *path* (never cached)."""
        return calc_md5(path)

    def get_md5partial(self, path):
        """Compute the partial md5 of *path* (never cached)."""
        return calc_md5partial(path)

    def close(self):
        """Nothing to flush or close."""
        pass
class Directories: class Directories:
"""Holds user folder selection. """Holds user folder selection.
@ -55,11 +169,15 @@ class Directories:
""" """
# ---Override # ---Override
def __init__(self, exclude_list=None): def __init__(self, exclude_list=None, hash_cache_file=None):
self._dirs = [] self._dirs = []
# {path: state} # {path: state}
self.states = {} self.states = {}
self._exclude_list = exclude_list self._exclude_list = exclude_list
if hash_cache_file:
self.filesdb = FilesDB(hash_cache_file)
else:
self.filesdb = FilesDBDummy()
def __contains__(self, path): def __contains__(self, path):
for p in self._dirs: for p in self._dirs:
@ -103,19 +221,19 @@ class Directories:
if state != DirectoryState.EXCLUDED: if state != DirectoryState.EXCLUDED:
# Old logic # Old logic
if self._exclude_list is None or not self._exclude_list.mark_count: if self._exclude_list is None or not self._exclude_list.mark_count:
found_files = [fs.get_file(root_path + f, fileclasses=fileclasses) for f in files] found_files = [fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses) for f in files]
else: else:
found_files = [] found_files = []
# print(f"len of files: {len(files)} {files}") # print(f"len of files: {len(files)} {files}")
for f in files: for f in files:
if not self._exclude_list.is_excluded(root, f): if not self._exclude_list.is_excluded(root, f):
found_files.append(fs.get_file(root_path + f, fileclasses=fileclasses)) found_files.append(fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses))
found_files = [f for f in found_files if f is not None] found_files = [f for f in found_files if f is not None]
# In some cases, directories can be considered as files by dupeGuru, which is # In some cases, directories can be considered as files by dupeGuru, which is
# why we have this line below. In fact, there only one case: Bundle files under # why we have this line below. In fact, there only one case: Bundle files under
# OS X... In other situations, this forloop will do nothing. # OS X... In other situations, this forloop will do nothing.
for d in dirs[:]: for d in dirs[:]:
f = fs.get_file(root_path + d, fileclasses=fileclasses) f = fs.get_file(root_path + d, self.filesdb, fileclasses=fileclasses)
if f is not None: if f is not None:
found_files.append(f) found_files.append(f)
dirs.remove(d) dirs.remove(d)
@ -200,7 +318,7 @@ class Directories:
folderclass = fs.Folder folderclass = fs.Folder
folder_count = 0 folder_count = 0
for path in self._dirs: for path in self._dirs:
from_folder = folderclass(path) from_folder = folderclass(path, self.filesdb)
for folder in self._get_folders(from_folder, j): for folder in self._get_folders(from_folder, j):
folder_count += 1 folder_count += 1
if type(j) != job.NullJob: if type(j) != job.NullJob:
@ -286,6 +404,9 @@ class Directories:
tree = ET.ElementTree(root) tree = ET.ElementTree(root)
tree.write(fp, encoding="utf-8") tree.write(fp, encoding="utf-8")
def save_hashes(self):
    """Flush the hash cache to disk after a scan completes.

    NOTE(review): despite the name, this closes the underlying cache
    (``close()``), so the cache cannot be reused afterwards without
    reopening — confirm a new scan recreates the DB object, or
    consider committing without closing instead.
    """
    self.filesdb.close()
def set_state(self, path, state): def set_state(self, path, state):
"""Set the state of folder at ``path``. """Set the state of folder at ``path``.

View File

@ -85,10 +85,11 @@ class File:
# Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
# files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
# even greater when we take into account read attributes (70%!). Yeah, it's worth it. # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
__slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys()) __slots__ = ("path", "db", "is_ref", "words") + tuple(INITIAL_INFO.keys())
def __init__(self, path): def __init__(self, path, db):
self.path = path self.path = path
self.db = db
for attrname in self.INITIAL_INFO: for attrname in self.INITIAL_INFO:
setattr(self, attrname, NOT_SET) setattr(self, attrname, NOT_SET)
@ -107,11 +108,6 @@ class File:
result = self.INITIAL_INFO[attrname] result = self.INITIAL_INFO[attrname]
return result return result
# This offset is where we should start reading the file to get a partial md5
# For audio file, it should be where audio data starts
def _get_md5partial_offset_and_size(self):
return (0x4000, 0x4000) # 16Kb
def _read_info(self, field): def _read_info(self, field):
# print(f"_read_info({field}) for {self}") # print(f"_read_info({field}) for {self}")
if field in ("size", "mtime"): if field in ("size", "mtime"):
@ -120,28 +116,14 @@ class File:
self.mtime = nonone(stats.st_mtime, 0) self.mtime = nonone(stats.st_mtime, 0)
elif field == "md5partial": elif field == "md5partial":
try: try:
with self.path.open("rb") as fp: self.md5partial = self.db.get_md5partial(self.path)
offset, size = self._get_md5partial_offset_and_size() except Exception as e:
fp.seek(offset) logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
partialdata = fp.read(size)
md5 = hashlib.md5(partialdata)
self.md5partial = md5.digest()
except Exception:
pass
elif field == "md5": elif field == "md5":
try: try:
with self.path.open("rb") as fp: self.md5 = self.db.get_md5(self.path)
md5 = hashlib.md5() except Exception as e:
filedata = fp.read(CHUNK_SIZE) logging.warning("Couldn't get md5 for %s: %s", self.path, e)
while filedata:
md5.update(filedata)
filedata = fp.read(CHUNK_SIZE)
# FIXME For python 3.8 and later
# while filedata := fp.read(CHUNK_SIZE):
# md5.update(filedata)
self.md5 = md5.digest()
except Exception:
pass
elif field == "md5samples": elif field == "md5samples":
try: try:
with self.path.open("rb") as fp: with self.path.open("rb") as fp:
@ -225,13 +207,13 @@ class Folder(File):
__slots__ = File.__slots__ + ("_subfolders",) __slots__ = File.__slots__ + ("_subfolders",)
def __init__(self, path): def __init__(self, path, db):
File.__init__(self, path) File.__init__(self, path, db)
self._subfolders = None self._subfolders = None
def _all_items(self): def _all_items(self):
folders = self.subfolders folders = self.subfolders
files = get_files(self.path) files = get_files(self.path, self.db)
return folders + files return folders + files
def _read_info(self, field): def _read_info(self, field):
@ -260,7 +242,7 @@ class Folder(File):
def subfolders(self): def subfolders(self):
if self._subfolders is None: if self._subfolders is None:
subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()] subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
self._subfolders = [self.__class__(p) for p in subfolders] self._subfolders = [self.__class__(p, self.db) for p in subfolders]
return self._subfolders return self._subfolders
@classmethod @classmethod
@ -268,7 +250,7 @@ class Folder(File):
return not path.islink() and path.isdir() return not path.islink() and path.isdir()
def get_file(path, fileclasses=[File]): def get_file(path, db, fileclasses=[File]):
"""Wraps ``path`` around its appropriate :class:`File` class. """Wraps ``path`` around its appropriate :class:`File` class.
Whether a class is "appropriate" is decided by :meth:`File.can_handle` Whether a class is "appropriate" is decided by :meth:`File.can_handle`
@ -278,10 +260,10 @@ def get_file(path, fileclasses=[File]):
""" """
for fileclass in fileclasses: for fileclass in fileclasses:
if fileclass.can_handle(path): if fileclass.can_handle(path):
return fileclass(path) return fileclass(path, db)
def get_files(path, fileclasses=[File]): def get_files(path, db, fileclasses=[File]):
"""Returns a list of :class:`File` for each file contained in ``path``. """Returns a list of :class:`File` for each file contained in ``path``.
:param Path path: path to scan :param Path path: path to scan
@ -291,7 +273,7 @@ def get_files(path, fileclasses=[File]):
try: try:
result = [] result = []
for path in path.listdir(): for path in path.listdir():
file = get_file(path, fileclasses=fileclasses) file = get_file(path, db, fileclasses=fileclasses)
if file is not None: if file is not None:
result.append(file) result.append(file)
return result return result