diff --git a/.gitignore b/.gitignore
index ff5fc581..2e5f5d6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ __pycache__
 .lock-waf*
 .tox
 /tags
+*.eggs
 
 build
 dist
diff --git a/core/app.py b/core/app.py
index f2066a75..d2627f6b 100644
--- a/core/app.py
+++ b/core/app.py
@@ -138,7 +138,8 @@ class DupeGuru(Broadcaster):
         self.app_mode = AppMode.STANDARD
         self.discarded_file_count = 0
         self.exclude_list = ExcludeList()
-        self.directories = directories.Directories(self.exclude_list)
+        hash_cache_file = op.join(self.appdata, "hash.cache")
+        self.directories = directories.Directories(self.exclude_list, hash_cache_file)
         self.results = results.Results(self)
         self.ignore_list = IgnoreList()
         # In addition to "app-level" options, this dictionary also holds options that will be
@@ -293,6 +294,7 @@ class DupeGuru(Broadcaster):
     def _job_completed(self, jobid):
         if jobid == JobType.SCAN:
             self._results_changed()
+            self.directories.save_hashes()
             if not self.results.groups:
                 self.view.show_message(tr("No duplicates found."))
             else:
diff --git a/core/directories.py b/core/directories.py
index 2a381c14..c50c5790 100644
--- a/core/directories.py
+++ b/core/directories.py
@@ -5,8 +5,11 @@
 # http://www.gnu.org/licenses/gpl-3.0.html
 
 import os
+import hashlib
+import sqlite3
 from xml.etree import ElementTree as ET
 import logging
+from threading import Lock
 
 from hscommon.jobprogress import job
 from hscommon.path import Path
@@ -44,6 +47,138 @@ class InvalidPathError(Exception):
     """The path being added is invalid"""
 
 
+def calc_md5(path):
+    # type: (Path, ) -> bytes
+    """Return the MD5 digest of the whole file at ``path``."""
+
+    with path.open("rb") as fp:
+        md5 = hashlib.md5()
+        # The goal here is to not run out of memory on really big files. However, the chunk
+        # size has to be large enough so that the python loop isn't too costly in terms of
+        # CPU.
+        CHUNK_SIZE = 1024 * 1024  # 1 mb
+        filedata = fp.read(CHUNK_SIZE)
+        while filedata:
+            md5.update(filedata)
+            filedata = fp.read(CHUNK_SIZE)
+        return md5.digest()
+
+
+def calc_md5partial(path):
+    # type: (Path, ) -> bytes
+    """Return the MD5 digest of the 0x4000 bytes found at offset 0x4000 of ``path``."""
+
+    # This offset is where we should start reading the file to get a partial md5
+    # For audio file, it should be where audio data starts
+    offset, size = (0x4000, 0x4000)
+
+    with path.open("rb") as fp:
+        fp.seek(offset)
+        partialdata = fp.read(size)
+        return hashlib.md5(partialdata).digest()
+
+
+class FilesDB:
+    """SQLite cache of file hashes; a row is reused only while path, size and mtime match."""
+
+    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
+    select_query = "SELECT md5, md5partial FROM files WHERE path=? AND size=? and mtime_ns=?"
+    insert_query = "REPLACE INTO files (path, size, mtime_ns, entry_dt, md5, md5partial) VALUES (?, ?, ?, datetime('now'), ?, ?)"
+
+    def __init__(self, path):
+        # type: (str, ) -> None
+
+        self.conn = sqlite3.connect(path, check_same_thread=False)
+        self.cur = self.conn.cursor()
+        # One connection and cursor are shared (check_same_thread=False), so every
+        # statement run on them must hold this lock.
+        self.lock = Lock()
+        self.setup()
+
+    def setup(self):
+        self.cur.execute(self.create_table_query)
+
+    def get_md5(self, path):
+        # type: (Path, ) -> bytes
+        """Return the full MD5 of ``path``, hashing the file only when no cached row matches."""
+
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            self.cur.execute(self.select_query, (str(path), size, mtime_ns))
+            result = self.cur.fetchone()
+
+        md5 = None
+        md5partial = None
+
+        if result:
+            md5, md5partial = result
+            if md5:
+                return md5
+
+        # Hash outside the lock so reading a big file does not hold up other cache users.
+        md5 = calc_md5(path)
+        with self.lock:
+            self.cur.execute(self.insert_query, (str(path), size, mtime_ns, md5, md5partial))
+        return md5
+
+    def get_md5partial(self, path):
+        # type: (Path, ) -> bytes
+        """Return the partial MD5 of ``path``, hashing the file only when no cached row matches."""
+
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            self.cur.execute(self.select_query, (str(path), size, mtime_ns))
+            result = self.cur.fetchone()
+
+        md5 = None
+        md5partial = None
+
+        if result:
+            md5, md5partial = result
+            if md5partial:
+                return md5partial
+
+        md5partial = calc_md5partial(path)
+        with self.lock:
+            self.cur.execute(self.insert_query, (str(path), size, mtime_ns, md5, md5partial))
+        return md5partial
+
+    def commit(self):
+        """Write pending hashes to disk; the connection stays open for further lookups."""
+        with self.lock:
+            self.conn.commit()
+
+    def close(self):
+        """Commit pending hashes and close the connection; the object is unusable afterwards."""
+        logging.debug("Closing FilesDB")
+
+        with self.lock:
+            self.conn.commit()
+            self.conn.close()
+
+
+class FilesDBDummy:
+    """Stand-in for FilesDB that computes every hash directly and stores nothing."""
+
+    def get_md5(self, path):
+        return calc_md5(path)
+
+    def get_md5partial(self, path):
+        return calc_md5partial(path)
+
+    def commit(self):
+        pass
+
+    def close(self):
+        pass
+
+
 class Directories:
     """Holds user folder selection.
 
@@ -55,11 +190,15 @@ class Directories:
     """
 
     # ---Override
-    def __init__(self, exclude_list=None):
+    def __init__(self, exclude_list=None, hash_cache_file=None):
         self._dirs = []
         # {path: state}
         self.states = {}
         self._exclude_list = exclude_list
+        if hash_cache_file:
+            self.filesdb = FilesDB(hash_cache_file)
+        else:
+            self.filesdb = FilesDBDummy()
 
     def __contains__(self, path):
         for p in self._dirs:
@@ -103,19 +242,19 @@ class Directories:
                 if state != DirectoryState.EXCLUDED:
                     # Old logic
                     if self._exclude_list is None or not self._exclude_list.mark_count:
-                        found_files = [fs.get_file(root_path + f, fileclasses=fileclasses) for f in files]
+                        found_files = [fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses) for f in files]
                     else:
                         found_files = []
                         # print(f"len of files: {len(files)} {files}")
                         for f in files:
                             if not self._exclude_list.is_excluded(root, f):
-                                found_files.append(fs.get_file(root_path + f, fileclasses=fileclasses))
+                                found_files.append(fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses))
                     found_files = [f for f in found_files if f is not None]
                     # In some cases, directories can be considered as files by dupeGuru, which is
                     # why we have this line below. In fact, there only one case: Bundle files under
                     # OS X... In other situations, this forloop will do nothing.
                     for d in dirs[:]:
-                        f = fs.get_file(root_path + d, fileclasses=fileclasses)
+                        f = fs.get_file(root_path + d, self.filesdb, fileclasses=fileclasses)
                         if f is not None:
                             found_files.append(f)
                             dirs.remove(d)
@@ -200,7 +339,7 @@ class Directories:
             folderclass = fs.Folder
         folder_count = 0
         for path in self._dirs:
-            from_folder = folderclass(path)
+            from_folder = folderclass(path, self.filesdb)
             for folder in self._get_folders(from_folder, j):
                 folder_count += 1
                 if type(j) != job.NullJob:
@@ -286,6 +425,10 @@ class Directories:
             tree = ET.ElementTree(root)
             tree.write(fp, encoding="utf-8")
 
+    def save_hashes(self):
+        """Persist hashes cached so far; the cache stays open so later scans keep working."""
+        self.filesdb.commit()
+
     def set_state(self, path, state):
         """Set the state of folder at ``path``.
 
diff --git a/core/fs.py b/core/fs.py
index a7978c0f..c6f5734d 100644
--- a/core/fs.py
+++ b/core/fs.py
@@ -85,10 +85,11 @@ class File:
     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
     # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
     # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
-    __slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())
+    __slots__ = ("path", "db", "is_ref", "words") + tuple(INITIAL_INFO.keys())
 
-    def __init__(self, path):
+    def __init__(self, path, db):
         self.path = path
+        self.db = db
         for attrname in self.INITIAL_INFO:
             setattr(self, attrname, NOT_SET)
 
@@ -107,11 +108,6 @@ class File:
             result = self.INITIAL_INFO[attrname]
         return result
 
-    # This offset is where we should start reading the file to get a partial md5
-    # For audio file, it should be where audio data starts
-    def _get_md5partial_offset_and_size(self):
-        return (0x4000, 0x4000)  # 16Kb
-
     def _read_info(self, field):
         # print(f"_read_info({field}) for {self}")
         if field in ("size", "mtime"):
@@ -120,28 +116,14 @@ class File:
             self.mtime = nonone(stats.st_mtime, 0)
         elif field == "md5partial":
             try:
-                with self.path.open("rb") as fp:
-                    offset, size = self._get_md5partial_offset_and_size()
-                    fp.seek(offset)
-                    partialdata = fp.read(size)
-                    md5 = hashlib.md5(partialdata)
-                    self.md5partial = md5.digest()
-            except Exception:
-                pass
+                self.md5partial = self.db.get_md5partial(self.path)
+            except Exception as e:
+                logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
         elif field == "md5":
             try:
-                with self.path.open("rb") as fp:
-                    md5 = hashlib.md5()
-                    filedata = fp.read(CHUNK_SIZE)
-                    while filedata:
-                        md5.update(filedata)
-                        filedata = fp.read(CHUNK_SIZE)
-                    # FIXME For python 3.8 and later
-                    # while filedata := fp.read(CHUNK_SIZE):
-                    #     md5.update(filedata)
-                    self.md5 = md5.digest()
-            except Exception:
-                pass
+                self.md5 = self.db.get_md5(self.path)
+            except Exception as e:
+                logging.warning("Couldn't get md5 for %s: %s", self.path, e)
         elif field == "md5samples":
             try:
                 with self.path.open("rb") as fp:
@@ -225,13 +207,13 @@ class Folder(File):
 
     __slots__ = File.__slots__ + ("_subfolders",)
 
-    def __init__(self, path):
-        File.__init__(self, path)
+    def __init__(self, path, db):
+        File.__init__(self, path, db)
         self._subfolders = None
 
     def _all_items(self):
         folders = self.subfolders
-        files = get_files(self.path)
+        files = get_files(self.path, self.db)
         return folders + files
 
     def _read_info(self, field):
@@ -260,7 +242,7 @@ class Folder(File):
     def subfolders(self):
         if self._subfolders is None:
             subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
-            self._subfolders = [self.__class__(p) for p in subfolders]
+            self._subfolders = [self.__class__(p, self.db) for p in subfolders]
         return self._subfolders
 
     @classmethod
@@ -268,7 +250,7 @@ class Folder(File):
         return not path.islink() and path.isdir()
 
 
-def get_file(path, fileclasses=[File]):
+def get_file(path, db, fileclasses=[File]):
     """Wraps ``path`` around its appropriate :class:`File` class.
 
     Whether a class is "appropriate" is decided by :meth:`File.can_handle`
@@ -278,10 +260,10 @@ def get_file(path, fileclasses=[File]):
     """
     for fileclass in fileclasses:
         if fileclass.can_handle(path):
-            return fileclass(path)
+            return fileclass(path, db)
 
 
-def get_files(path, fileclasses=[File]):
+def get_files(path, db, fileclasses=[File]):
     """Returns a list of :class:`File` for each file contained in ``path``.
 
     :param Path path: path to scan
@@ -291,7 +273,7 @@ def get_files(path, fileclasses=[File]):
     try:
         result = []
         for path in path.listdir():
-            file = get_file(path, fileclasses=fileclasses)
+            file = get_file(path, db, fileclasses=fileclasses)
             if file is not None:
                 result.append(file)
         return result