mirror of
https://github.com/arsenetar/dupeguru.git
synced 2025-03-10 05:34:36 +00:00
implement hash cache for md5 hash based on sqlite
This commit is contained in:
parent
b80489fd66
commit
2f02a6010d
1
.gitignore
vendored
1
.gitignore
vendored
@ -7,6 +7,7 @@ __pycache__
|
|||||||
.lock-waf*
|
.lock-waf*
|
||||||
.tox
|
.tox
|
||||||
/tags
|
/tags
|
||||||
|
*.eggs
|
||||||
|
|
||||||
build
|
build
|
||||||
dist
|
dist
|
||||||
|
@ -138,7 +138,8 @@ class DupeGuru(Broadcaster):
|
|||||||
self.app_mode = AppMode.STANDARD
|
self.app_mode = AppMode.STANDARD
|
||||||
self.discarded_file_count = 0
|
self.discarded_file_count = 0
|
||||||
self.exclude_list = ExcludeList()
|
self.exclude_list = ExcludeList()
|
||||||
self.directories = directories.Directories(self.exclude_list)
|
hash_cache_file = op.join(self.appdata, "hash.cache")
|
||||||
|
self.directories = directories.Directories(self.exclude_list, hash_cache_file)
|
||||||
self.results = results.Results(self)
|
self.results = results.Results(self)
|
||||||
self.ignore_list = IgnoreList()
|
self.ignore_list = IgnoreList()
|
||||||
# In addition to "app-level" options, this dictionary also holds options that will be
|
# In addition to "app-level" options, this dictionary also holds options that will be
|
||||||
@ -293,6 +294,7 @@ class DupeGuru(Broadcaster):
|
|||||||
def _job_completed(self, jobid):
|
def _job_completed(self, jobid):
|
||||||
if jobid == JobType.SCAN:
|
if jobid == JobType.SCAN:
|
||||||
self._results_changed()
|
self._results_changed()
|
||||||
|
self.directories.save_hashes()
|
||||||
if not self.results.groups:
|
if not self.results.groups:
|
||||||
self.view.show_message(tr("No duplicates found."))
|
self.view.show_message(tr("No duplicates found."))
|
||||||
else:
|
else:
|
||||||
|
@ -5,8 +5,11 @@
|
|||||||
# http://www.gnu.org/licenses/gpl-3.0.html
|
# http://www.gnu.org/licenses/gpl-3.0.html
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import hashlib
|
||||||
|
import sqlite3
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
import logging
|
import logging
|
||||||
|
from threading import Lock
|
||||||
|
|
||||||
from hscommon.jobprogress import job
|
from hscommon.jobprogress import job
|
||||||
from hscommon.path import Path
|
from hscommon.path import Path
|
||||||
@ -44,6 +47,117 @@ class InvalidPathError(Exception):
|
|||||||
"""The path being added is invalid"""
|
"""The path being added is invalid"""
|
||||||
|
|
||||||
|
|
||||||
|
def calc_md5(path):
    # type: (Path, ) -> bytes
    """Return the MD5 digest of the full contents of the file at ``path``.

    :param path: object exposing ``open("rb")`` that yields a binary stream.
    :returns: the raw 16-byte MD5 digest.
    """
    # Reading piecewise keeps memory bounded on very large files; the piece
    # has to stay big enough that the Python-level loop is not the CPU
    # bottleneck.
    chunk_size = 1024 * 1024  # 1 mb
    hasher = hashlib.md5()
    with path.open("rb") as stream:
        # ``read`` returns b"" at end of file, which ends the iteration.
        for block in iter(lambda: stream.read(chunk_size), b""):
            hasher.update(block)
    return hasher.digest()
|
||||||
|
|
||||||
|
|
||||||
|
def calc_md5partial(path):
    # type: (Path, ) -> bytes
    """Return the MD5 digest of a fixed 16 KiB window of the file at ``path``.

    The window starts 0x4000 bytes into the file. A file that ends before or
    inside the window contributes only the bytes that exist there, so a file
    shorter than the start offset hashes as empty data.

    :param path: object exposing ``open("rb")`` that yields a seekable stream.
    :returns: the raw 16-byte MD5 digest of the window.
    """
    # Where the partial hash starts reading. For audio files this should be
    # where the audio data starts.
    start = 0x4000
    length = 0x4000

    with path.open("rb") as stream:
        stream.seek(start)
        window = stream.read(length)
    return hashlib.md5(window).digest()
|
||||||
|
|
||||||
|
|
||||||
|
class FilesDB:
    """SQLite-backed cache of ``md5`` / ``md5partial`` digests, keyed by path.

    Each row stores the file's size and ``st_mtime_ns`` alongside the digests.
    A cached digest is returned only when both values still match the file on
    disk; otherwise the digest is recomputed and the row is replaced.

    The connection is opened with ``check_same_thread=False`` and every use of
    the shared connection/cursor is serialized through ``self.lock``.

    ``close()`` commits and releases the connection, but the instance stays
    usable: the next lookup transparently reopens the database. This matters
    because the hashes are saved (via ``close()``) after each scan while the
    same instance keeps being used for later scans.
    """

    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    select_query = "SELECT md5, md5partial FROM files WHERE path=? AND size=? and mtime_ns=?"
    insert_query = "REPLACE INTO files (path, size, mtime_ns, entry_dt, md5, md5partial) VALUES (?, ?, ?, datetime('now'), ?, ?)"

    def __init__(self, path):
        # type: (str, ) -> None
        """Open (creating it if needed) the cache database stored at ``path``."""
        self._db_path = path
        self.conn = None
        self.cur = None
        # Created before connecting so the lock exists for every later use.
        self.lock = Lock()
        self._connect()

    def _connect(self):
        """Open the connection and cursor and make sure the table exists."""
        self.conn = sqlite3.connect(self._db_path, check_same_thread=False)
        self.cur = self.conn.cursor()
        self.setup()

    def setup(self):
        """Create the ``files`` table if it does not exist yet."""
        self.cur.execute(self.create_table_query)

    def _select(self, key):
        """Return ``(md5, md5partial)`` cached for ``key``, or ``(None, None)``.

        ``key`` is the ``(path, size, mtime_ns)`` tuple bound to
        ``select_query``. The caller must hold ``self.lock``.
        """
        if self.conn is None:
            # close() was called earlier; reopen on demand.
            self._connect()
        self.cur.execute(self.select_query, key)
        return self.cur.fetchone() or (None, None)

    def _get_hash(self, path, partial):
        """Return the cached digest for ``path``, computing and storing it on a miss.

        :param path: file to hash; must provide ``stat()`` and be convertible
            with ``str()``.
        :param bool partial: ``True`` for ``md5partial``, ``False`` for ``md5``.
        :returns: the digest as bytes.
        :raises OSError: if the file cannot be stat'ed or read.
        """
        stat = path.stat()
        key = (str(path), stat.st_size, stat.st_mtime_ns)
        column = 1 if partial else 0

        with self.lock:
            cached = self._select(key)[column]
        if cached:
            return cached

        # Hash outside the lock so slow file reads don't block other threads.
        digest = calc_md5partial(path) if partial else calc_md5(path)

        with self.lock:
            # Re-read the row: the *other* digest may have been stored while
            # we were hashing, and REPLACE would otherwise reset it to NULL.
            row = list(self._select(key))
            row[column] = digest
            self.cur.execute(self.insert_query, key + tuple(row))
        return digest

    def get_md5(self, path):
        # type: (Path, ) -> bytes
        """Return the MD5 digest of the whole file at ``path``, using the cache."""
        return self._get_hash(path, partial=False)

    def get_md5partial(self, path):
        # type: (Path, ) -> bytes
        """Return the partial MD5 digest of the file at ``path``, using the cache."""
        return self._get_hash(path, partial=True)

    def close(self):
        """Commit pending rows and close the connection.

        Safe to call repeatedly; a later lookup reopens the database.
        """
        logging.debug("Closing FilesDB")

        with self.lock:
            if self.conn is None:
                return
            try:
                self.conn.commit()
            finally:
                # Always release the handle, even if the commit failed.
                self.conn.close()
                self.conn = None
                self.cur = None
|
||||||
|
|
||||||
|
|
||||||
|
class FilesDBDummy:
    """Cache-less stand-in exposing the same methods as the hash database.

    Every request is answered by hashing the file directly; nothing is stored.
    """

    def get_md5(self, path):
        """Compute the full MD5 digest of ``path`` without any caching."""
        digest = calc_md5(path)
        return digest

    def get_md5partial(self, path):
        """Compute the partial MD5 digest of ``path`` without any caching."""
        digest = calc_md5partial(path)
        return digest

    def close(self):
        """Do nothing: there is no underlying resource to release."""
        return None
|
||||||
|
|
||||||
|
|
||||||
class Directories:
|
class Directories:
|
||||||
"""Holds user folder selection.
|
"""Holds user folder selection.
|
||||||
|
|
||||||
@ -55,11 +169,15 @@ class Directories:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# ---Override
|
# ---Override
|
||||||
def __init__(self, exclude_list=None):
|
def __init__(self, exclude_list=None, hash_cache_file=None):
|
||||||
self._dirs = []
|
self._dirs = []
|
||||||
# {path: state}
|
# {path: state}
|
||||||
self.states = {}
|
self.states = {}
|
||||||
self._exclude_list = exclude_list
|
self._exclude_list = exclude_list
|
||||||
|
if hash_cache_file:
|
||||||
|
self.filesdb = FilesDB(hash_cache_file)
|
||||||
|
else:
|
||||||
|
self.filesdb = FilesDBDummy()
|
||||||
|
|
||||||
def __contains__(self, path):
|
def __contains__(self, path):
|
||||||
for p in self._dirs:
|
for p in self._dirs:
|
||||||
@ -103,19 +221,19 @@ class Directories:
|
|||||||
if state != DirectoryState.EXCLUDED:
|
if state != DirectoryState.EXCLUDED:
|
||||||
# Old logic
|
# Old logic
|
||||||
if self._exclude_list is None or not self._exclude_list.mark_count:
|
if self._exclude_list is None or not self._exclude_list.mark_count:
|
||||||
found_files = [fs.get_file(root_path + f, fileclasses=fileclasses) for f in files]
|
found_files = [fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses) for f in files]
|
||||||
else:
|
else:
|
||||||
found_files = []
|
found_files = []
|
||||||
# print(f"len of files: {len(files)} {files}")
|
# print(f"len of files: {len(files)} {files}")
|
||||||
for f in files:
|
for f in files:
|
||||||
if not self._exclude_list.is_excluded(root, f):
|
if not self._exclude_list.is_excluded(root, f):
|
||||||
found_files.append(fs.get_file(root_path + f, fileclasses=fileclasses))
|
found_files.append(fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses))
|
||||||
found_files = [f for f in found_files if f is not None]
|
found_files = [f for f in found_files if f is not None]
|
||||||
# In some cases, directories can be considered as files by dupeGuru, which is
|
# In some cases, directories can be considered as files by dupeGuru, which is
|
||||||
# why we have this line below. In fact, there only one case: Bundle files under
|
# why we have this line below. In fact, there only one case: Bundle files under
|
||||||
# OS X... In other situations, this forloop will do nothing.
|
# OS X... In other situations, this forloop will do nothing.
|
||||||
for d in dirs[:]:
|
for d in dirs[:]:
|
||||||
f = fs.get_file(root_path + d, fileclasses=fileclasses)
|
f = fs.get_file(root_path + d, self.filesdb, fileclasses=fileclasses)
|
||||||
if f is not None:
|
if f is not None:
|
||||||
found_files.append(f)
|
found_files.append(f)
|
||||||
dirs.remove(d)
|
dirs.remove(d)
|
||||||
@ -200,7 +318,7 @@ class Directories:
|
|||||||
folderclass = fs.Folder
|
folderclass = fs.Folder
|
||||||
folder_count = 0
|
folder_count = 0
|
||||||
for path in self._dirs:
|
for path in self._dirs:
|
||||||
from_folder = folderclass(path)
|
from_folder = folderclass(path, self.filesdb)
|
||||||
for folder in self._get_folders(from_folder, j):
|
for folder in self._get_folders(from_folder, j):
|
||||||
folder_count += 1
|
folder_count += 1
|
||||||
if type(j) != job.NullJob:
|
if type(j) != job.NullJob:
|
||||||
@ -286,6 +404,9 @@ class Directories:
|
|||||||
tree = ET.ElementTree(root)
|
tree = ET.ElementTree(root)
|
||||||
tree.write(fp, encoding="utf-8")
|
tree.write(fp, encoding="utf-8")
|
||||||
|
|
||||||
|
def save_hashes(self):
|
||||||
|
self.filesdb.close()
|
||||||
|
|
||||||
def set_state(self, path, state):
|
def set_state(self, path, state):
|
||||||
"""Set the state of folder at ``path``.
|
"""Set the state of folder at ``path``.
|
||||||
|
|
||||||
|
52
core/fs.py
52
core/fs.py
@ -85,10 +85,11 @@ class File:
|
|||||||
# Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
|
# Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
|
||||||
# files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
|
# files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
|
||||||
# even greater when we take into account read attributes (70%!). Yeah, it's worth it.
|
# even greater when we take into account read attributes (70%!). Yeah, it's worth it.
|
||||||
__slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())
|
__slots__ = ("path", "db", "is_ref", "words") + tuple(INITIAL_INFO.keys())
|
||||||
|
|
||||||
def __init__(self, path):
|
def __init__(self, path, db):
|
||||||
self.path = path
|
self.path = path
|
||||||
|
self.db = db
|
||||||
for attrname in self.INITIAL_INFO:
|
for attrname in self.INITIAL_INFO:
|
||||||
setattr(self, attrname, NOT_SET)
|
setattr(self, attrname, NOT_SET)
|
||||||
|
|
||||||
@ -107,11 +108,6 @@ class File:
|
|||||||
result = self.INITIAL_INFO[attrname]
|
result = self.INITIAL_INFO[attrname]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# This offset is where we should start reading the file to get a partial md5
|
|
||||||
# For audio file, it should be where audio data starts
|
|
||||||
def _get_md5partial_offset_and_size(self):
|
|
||||||
return (0x4000, 0x4000) # 16Kb
|
|
||||||
|
|
||||||
def _read_info(self, field):
|
def _read_info(self, field):
|
||||||
# print(f"_read_info({field}) for {self}")
|
# print(f"_read_info({field}) for {self}")
|
||||||
if field in ("size", "mtime"):
|
if field in ("size", "mtime"):
|
||||||
@ -120,28 +116,14 @@ class File:
|
|||||||
self.mtime = nonone(stats.st_mtime, 0)
|
self.mtime = nonone(stats.st_mtime, 0)
|
||||||
elif field == "md5partial":
|
elif field == "md5partial":
|
||||||
try:
|
try:
|
||||||
with self.path.open("rb") as fp:
|
self.md5partial = self.db.get_md5partial(self.path)
|
||||||
offset, size = self._get_md5partial_offset_and_size()
|
except Exception as e:
|
||||||
fp.seek(offset)
|
logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
|
||||||
partialdata = fp.read(size)
|
|
||||||
md5 = hashlib.md5(partialdata)
|
|
||||||
self.md5partial = md5.digest()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
elif field == "md5":
|
elif field == "md5":
|
||||||
try:
|
try:
|
||||||
with self.path.open("rb") as fp:
|
self.md5 = self.db.get_md5(self.path)
|
||||||
md5 = hashlib.md5()
|
except Exception as e:
|
||||||
filedata = fp.read(CHUNK_SIZE)
|
logging.warning("Couldn't get md5 for %s: %s", self.path, e)
|
||||||
while filedata:
|
|
||||||
md5.update(filedata)
|
|
||||||
filedata = fp.read(CHUNK_SIZE)
|
|
||||||
# FIXME For python 3.8 and later
|
|
||||||
# while filedata := fp.read(CHUNK_SIZE):
|
|
||||||
# md5.update(filedata)
|
|
||||||
self.md5 = md5.digest()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
elif field == "md5samples":
|
elif field == "md5samples":
|
||||||
try:
|
try:
|
||||||
with self.path.open("rb") as fp:
|
with self.path.open("rb") as fp:
|
||||||
@ -225,13 +207,13 @@ class Folder(File):
|
|||||||
|
|
||||||
__slots__ = File.__slots__ + ("_subfolders",)
|
__slots__ = File.__slots__ + ("_subfolders",)
|
||||||
|
|
||||||
def __init__(self, path):
|
def __init__(self, path, db):
|
||||||
File.__init__(self, path)
|
File.__init__(self, path, db)
|
||||||
self._subfolders = None
|
self._subfolders = None
|
||||||
|
|
||||||
def _all_items(self):
|
def _all_items(self):
|
||||||
folders = self.subfolders
|
folders = self.subfolders
|
||||||
files = get_files(self.path)
|
files = get_files(self.path, self.db)
|
||||||
return folders + files
|
return folders + files
|
||||||
|
|
||||||
def _read_info(self, field):
|
def _read_info(self, field):
|
||||||
@ -260,7 +242,7 @@ class Folder(File):
|
|||||||
def subfolders(self):
|
def subfolders(self):
|
||||||
if self._subfolders is None:
|
if self._subfolders is None:
|
||||||
subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
|
subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
|
||||||
self._subfolders = [self.__class__(p) for p in subfolders]
|
self._subfolders = [self.__class__(p, self.db) for p in subfolders]
|
||||||
return self._subfolders
|
return self._subfolders
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -268,7 +250,7 @@ class Folder(File):
|
|||||||
return not path.islink() and path.isdir()
|
return not path.islink() and path.isdir()
|
||||||
|
|
||||||
|
|
||||||
def get_file(path, fileclasses=[File]):
|
def get_file(path, db, fileclasses=[File]):
|
||||||
"""Wraps ``path`` around its appropriate :class:`File` class.
|
"""Wraps ``path`` around its appropriate :class:`File` class.
|
||||||
|
|
||||||
Whether a class is "appropriate" is decided by :meth:`File.can_handle`
|
Whether a class is "appropriate" is decided by :meth:`File.can_handle`
|
||||||
@ -278,10 +260,10 @@ def get_file(path, fileclasses=[File]):
|
|||||||
"""
|
"""
|
||||||
for fileclass in fileclasses:
|
for fileclass in fileclasses:
|
||||||
if fileclass.can_handle(path):
|
if fileclass.can_handle(path):
|
||||||
return fileclass(path)
|
return fileclass(path, db)
|
||||||
|
|
||||||
|
|
||||||
def get_files(path, fileclasses=[File]):
|
def get_files(path, db, fileclasses=[File]):
|
||||||
"""Returns a list of :class:`File` for each file contained in ``path``.
|
"""Returns a list of :class:`File` for each file contained in ``path``.
|
||||||
|
|
||||||
:param Path path: path to scan
|
:param Path path: path to scan
|
||||||
@ -291,7 +273,7 @@ def get_files(path, fileclasses=[File]):
|
|||||||
try:
|
try:
|
||||||
result = []
|
result = []
|
||||||
for path in path.listdir():
|
for path in path.listdir():
|
||||||
file = get_file(path, fileclasses=fileclasses)
|
file = get_file(path, db, fileclasses=fileclasses)
|
||||||
if file is not None:
|
if file is not None:
|
||||||
result.append(file)
|
result.append(file)
|
||||||
return result
|
return result
|
||||||
|
Loading…
x
Reference in New Issue
Block a user