Mirror of https://github.com/arsenetar/dupeguru.git, synced 2025-09-11 17:58:17 +00:00
implement hash cache for md5 hash based on sqlite

This commit is contained in:
parent b80489fd66
commit 2f02a6010d

.gitignore (vendored): 1 line changed
@@ -7,6 +7,7 @@ __pycache__
 .lock-waf*
 .tox
 /tags
+*.eggs
 
 build
 dist
core/app.py

@@ -138,7 +138,8 @@ class DupeGuru(Broadcaster):
         self.app_mode = AppMode.STANDARD
         self.discarded_file_count = 0
         self.exclude_list = ExcludeList()
-        self.directories = directories.Directories(self.exclude_list)
+        hash_cache_file = op.join(self.appdata, "hash.cache")
+        self.directories = directories.Directories(self.exclude_list, hash_cache_file)
         self.results = results.Results(self)
         self.ignore_list = IgnoreList()
         # In addition to "app-level" options, this dictionary also holds options that will be
@@ -293,6 +294,7 @@ class DupeGuru(Broadcaster):
     def _job_completed(self, jobid):
         if jobid == JobType.SCAN:
             self._results_changed()
+            self.directories.save_hashes()
             if not self.results.groups:
                 self.view.show_message(tr("No duplicates found."))
             else:
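The app-level wiring above gives the cache a simple lifecycle: the sqlite file lives under the app data directory, and the cache is flushed once a scan job completes. A minimal sketch of that flow outside the app, assuming the new Directories signature shown further down; the appdata path here is a hypothetical stand-in:

import os.path as op
from core import directories

appdata = "/tmp/dupeguru-appdata"  # hypothetical stand-in for self.appdata
hash_cache_file = op.join(appdata, "hash.cache")

# Passing a cache path makes Directories open a FilesDB on it (see core/directories.py below).
dirs = directories.Directories(exclude_list=None, hash_cache_file=hash_cache_file)
# ... a scan would populate the cache through File md5/md5partial reads ...
dirs.save_hashes()  # commits and closes the sqlite connection, as _job_completed() does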
core/directories.py

@@ -5,8 +5,11 @@
 # http://www.gnu.org/licenses/gpl-3.0.html
 
 import os
+import hashlib
+import sqlite3
 from xml.etree import ElementTree as ET
 import logging
+from threading import Lock
 
 from hscommon.jobprogress import job
 from hscommon.path import Path
@@ -44,6 +47,117 @@ class InvalidPathError(Exception):
     """The path being added is invalid"""
 
 
+def calc_md5(path):
+    # type: (Path, ) -> bytes
+
+    with path.open("rb") as fp:
+        md5 = hashlib.md5()
+        # The goal here is to not run out of memory on really big files. However, the chunk
+        # size has to be large enough so that the python loop isn't too costly in terms of
+        # CPU.
+        CHUNK_SIZE = 1024 * 1024  # 1 mb
+        filedata = fp.read(CHUNK_SIZE)
+        while filedata:
+            md5.update(filedata)
+            filedata = fp.read(CHUNK_SIZE)
+        return md5.digest()
+
+
+def calc_md5partial(path):
+    # type: (Path, ) -> bytes
+
+    # This offset is where we should start reading the file to get a partial md5
+    # For audio file, it should be where audio data starts
+    offset, size = (0x4000, 0x4000)
+
+    with path.open("rb") as fp:
+        fp.seek(offset)
+        partialdata = fp.read(size)
+        return hashlib.md5(partialdata).digest()
+
+
+class FilesDB:
+
+    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
+    select_query = "SELECT md5, md5partial FROM files WHERE path=? AND size=? and mtime_ns=?"
+    insert_query = "REPLACE INTO files (path, size, mtime_ns, entry_dt, md5, md5partial) VALUES (?, ?, ?, datetime('now'), ?, ?)"
+
+    def __init__(self, path):
+        # type: (str, ) -> None
+
+        self.conn = sqlite3.connect(path, check_same_thread=False)
+        self.cur = self.conn.cursor()
+        self.setup()
+        self.lock = Lock()
+
+    def setup(self):
+        self.cur.execute(self.create_table_query)
+
+    def get_md5(self, path):
+        # type: (Path, ) -> bytes
+
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            self.cur.execute(self.select_query, (str(path), size, mtime_ns))
+            result = self.cur.fetchone()
+
+            md5 = None
+            md5partial = None
+
+            if result:
+                md5, md5partial = result
+                if md5:
+                    return md5
+
+            md5 = calc_md5(path)
+            self.cur.execute(self.insert_query, (str(path), size, mtime_ns, md5, md5partial))
+            return md5
+
+    def get_md5partial(self, path):
+        # type: (Path, ) -> bytes
+
+        stat = path.stat()
+        size = stat.st_size
+        mtime_ns = stat.st_mtime_ns
+
+        with self.lock:
+            self.cur.execute(self.select_query, (str(path), size, mtime_ns))
+            result = self.cur.fetchone()
+
+            md5 = None
+            md5partial = None
+
+            if result:
+                md5, md5partial = result
+                if md5partial:
+                    return md5partial
+
+            md5partial = calc_md5partial(path)
+            self.cur.execute(self.insert_query, (str(path), size, mtime_ns, md5, md5partial))
+            return md5partial
+
+    def close(self):
+        logging.debug("Closing FilesDB")
+
+        self.conn.commit()
+        self.conn.close()
+
+
+class FilesDBDummy:
+
+    def get_md5(self, path):
+        return calc_md5(path)
+
+    def get_md5partial(self, path):
+        return calc_md5partial(path)
+
+    def close(self):
+        pass
+
+
 class Directories:
     """Holds user folder selection.
 
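As a quick illustration of the cache semantics above (a sketch, not part of the commit): the first lookup for a given (path, size, mtime_ns) computes and stores the digest; later lookups are served straight from sqlite; and touching the file invalidates the row, since the SELECT no longer matches and the REPLACE overwrites the PRIMARY KEY entry. pathlib.Path offers the same open()/stat() interface the cache relies on; the file name below is hypothetical.

from pathlib import Path
from core.directories import FilesDB

db = FilesDB(":memory:")  # in-memory sqlite keeps the demo self-contained
f = Path("example.bin")   # hypothetical existing file

first = db.get_md5(f)     # cache miss: hashes the file and stores the digest
second = db.get_md5(f)    # cache hit: read back from the files table
assert first == second
db.close()                # commit + close, mirroring Directories.save_hashes()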
@@ -55,11 +169,15 @@ class Directories:
     """
 
     # ---Override
-    def __init__(self, exclude_list=None):
+    def __init__(self, exclude_list=None, hash_cache_file=None):
         self._dirs = []
         # {path: state}
         self.states = {}
         self._exclude_list = exclude_list
+        if hash_cache_file:
+            self.filesdb = FilesDB(hash_cache_file)
+        else:
+            self.filesdb = FilesDBDummy()
 
     def __contains__(self, path):
         for p in self._dirs:
@@ -103,19 +221,19 @@ class Directories:
                 if state != DirectoryState.EXCLUDED:
                     # Old logic
                     if self._exclude_list is None or not self._exclude_list.mark_count:
-                        found_files = [fs.get_file(root_path + f, fileclasses=fileclasses) for f in files]
+                        found_files = [fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses) for f in files]
                     else:
                         found_files = []
                         # print(f"len of files: {len(files)} {files}")
                         for f in files:
                             if not self._exclude_list.is_excluded(root, f):
-                                found_files.append(fs.get_file(root_path + f, fileclasses=fileclasses))
+                                found_files.append(fs.get_file(root_path + f, self.filesdb, fileclasses=fileclasses))
                     found_files = [f for f in found_files if f is not None]
                     # In some cases, directories can be considered as files by dupeGuru, which is
                     # why we have this line below. In fact, there only one case: Bundle files under
                     # OS X... In other situations, this forloop will do nothing.
                     for d in dirs[:]:
-                        f = fs.get_file(root_path + d, fileclasses=fileclasses)
+                        f = fs.get_file(root_path + d, self.filesdb, fileclasses=fileclasses)
                         if f is not None:
                             found_files.append(f)
                             dirs.remove(d)
@@ -200,7 +318,7 @@ class Directories:
             folderclass = fs.Folder
         folder_count = 0
         for path in self._dirs:
-            from_folder = folderclass(path)
+            from_folder = folderclass(path, self.filesdb)
             for folder in self._get_folders(from_folder, j):
                 folder_count += 1
                 if type(j) != job.NullJob:
@@ -286,6 +404,9 @@ class Directories:
             tree = ET.ElementTree(root)
             tree.write(fp, encoding="utf-8")
 
+    def save_hashes(self):
+        self.filesdb.close()
+
     def set_state(self, path, state):
         """Set the state of folder at ``path``.
 
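Since the cache is a plain sqlite database, it can also be inspected directly; a sketch assuming the default hash.cache location set up in core/app.py (the appdata path varies per installation):

import sqlite3

conn = sqlite3.connect("/path/to/appdata/hash.cache")  # installation-specific path
for path, size, mtime_ns, entry_dt in conn.execute(
    "SELECT path, size, mtime_ns, entry_dt FROM files LIMIT 5"
):
    print(path, size, mtime_ns, entry_dt)
conn.close()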
core/fs.py: 52 lines changed
@@ -85,10 +85,11 @@ class File:
     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
     # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
     # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
-    __slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())
+    __slots__ = ("path", "db", "is_ref", "words") + tuple(INITIAL_INFO.keys())
 
-    def __init__(self, path):
+    def __init__(self, path, db):
         self.path = path
+        self.db = db
         for attrname in self.INITIAL_INFO:
             setattr(self, attrname, NOT_SET)
 
@@ -107,11 +108,6 @@ class File:
                 result = self.INITIAL_INFO[attrname]
         return result
 
-    # This offset is where we should start reading the file to get a partial md5
-    # For audio file, it should be where audio data starts
-    def _get_md5partial_offset_and_size(self):
-        return (0x4000, 0x4000)  # 16Kb
-
     def _read_info(self, field):
         # print(f"_read_info({field}) for {self}")
         if field in ("size", "mtime"):
@@ -120,28 +116,14 @@ class File:
             self.mtime = nonone(stats.st_mtime, 0)
         elif field == "md5partial":
             try:
-                with self.path.open("rb") as fp:
-                    offset, size = self._get_md5partial_offset_and_size()
-                    fp.seek(offset)
-                    partialdata = fp.read(size)
-                    md5 = hashlib.md5(partialdata)
-                    self.md5partial = md5.digest()
-            except Exception:
-                pass
+                self.md5partial = self.db.get_md5partial(self.path)
+            except Exception as e:
+                logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
         elif field == "md5":
             try:
-                with self.path.open("rb") as fp:
-                    md5 = hashlib.md5()
-                    filedata = fp.read(CHUNK_SIZE)
-                    while filedata:
-                        md5.update(filedata)
-                        filedata = fp.read(CHUNK_SIZE)
-                    # FIXME For python 3.8 and later
-                    # while filedata := fp.read(CHUNK_SIZE):
-                    #     md5.update(filedata)
-                    self.md5 = md5.digest()
-            except Exception:
-                pass
+                self.md5 = self.db.get_md5(self.path)
+            except Exception as e:
+                logging.warning("Couldn't get md5 for %s: %s", self.path, e)
         elif field == "md5samples":
             try:
                 with self.path.open("rb") as fp:
@@ -225,13 +207,13 @@ class Folder(File):
 
     __slots__ = File.__slots__ + ("_subfolders",)
 
-    def __init__(self, path):
-        File.__init__(self, path)
+    def __init__(self, path, db):
+        File.__init__(self, path, db)
         self._subfolders = None
 
     def _all_items(self):
         folders = self.subfolders
-        files = get_files(self.path)
+        files = get_files(self.path, self.db)
         return folders + files
 
     def _read_info(self, field):
@@ -260,7 +242,7 @@ class Folder(File):
     def subfolders(self):
         if self._subfolders is None:
             subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
-            self._subfolders = [self.__class__(p) for p in subfolders]
+            self._subfolders = [self.__class__(p, self.db) for p in subfolders]
         return self._subfolders
 
     @classmethod
@@ -268,7 +250,7 @@ class Folder(File):
         return not path.islink() and path.isdir()
 
 
-def get_file(path, fileclasses=[File]):
+def get_file(path, db, fileclasses=[File]):
     """Wraps ``path`` around its appropriate :class:`File` class.
 
     Whether a class is "appropriate" is decided by :meth:`File.can_handle`
@@ -278,10 +260,10 @@ def get_file(path, fileclasses=[File]):
     """
     for fileclass in fileclasses:
        if fileclass.can_handle(path):
-            return fileclass(path)
+            return fileclass(path, db)
 
 
-def get_files(path, fileclasses=[File]):
+def get_files(path, db, fileclasses=[File]):
    """Returns a list of :class:`File` for each file contained in ``path``.
 
     :param Path path: path to scan
@@ -291,7 +273,7 @@ def get_files(path, fileclasses=[File]):
     try:
         result = []
         for path in path.listdir():
-            file = get_file(path, fileclasses=fileclasses)
+            file = get_file(path, db, fileclasses=fileclasses)
             if file is not None:
                 result.append(file)
         return result
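The FIXME removed above noted that on Python 3.8+ the chunked read loop can use the walrus operator; calc_md5 in core/directories.py keeps the pre-3.8 form, so a 3.8+ variant would look like this (a sketch, not what the commit ships):

import hashlib

CHUNK_SIZE = 1024 * 1024  # 1 mb, same chunk size as calc_md5

def calc_md5_walrus(path):
    # Chunked md5 using the := operator suggested by the removed FIXME.
    md5 = hashlib.md5()
    with open(path, "rb") as fp:
        while filedata := fp.read(CHUNK_SIZE):
            md5.update(filedata)
    return md5.digest()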