Mirror of https://github.com/arsenetar/dupeguru.git, synced 2025-09-11 17:58:17 +00:00
	Add partial hashes optimization for big files
* Big files above the user-selected threshold can be partially hashed in 3 places.
* If the user is willing to take the risk, we consider files with identical md5samples as being identical.
This commit is contained in:
parent 4641bd6ec9
commit e07dfd5955
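Before the diff, here is a standalone sketch of the sampling idea described in the commit message. The function name sample_hashes and the bare open() call are illustrative only; the real implementation is the md5samples field added to core/fs.py below, and the sketch assumes the file is at least one chunk long (otherwise the seek from the end would fail).

import hashlib
from math import floor

CHUNK_SIZE = 1024 * 1024  # 1 MiB, the same chunk size the commit uses

def sample_hashes(path, size):
    # Hash three 1 MiB samples instead of the whole file: one at 25% of the
    # file, one at 60%, and the last chunk. Assumes size >= CHUNK_SIZE.
    chunks = []
    with open(path, "rb") as fp:
        for offset in (floor(size * 25 / 100), floor(size * 60 / 100)):
            fp.seek(offset, 0)
            chunks.append(hashlib.md5(fp.read(CHUNK_SIZE)).hexdigest())
        fp.seek(-CHUNK_SIZE, 2)  # last CHUNK_SIZE bytes of the file
        chunks.append(hashlib.md5(fp.read(CHUNK_SIZE)).hexdigest())
    return tuple(chunks)

# Two big files are then considered identical when their sample tuples match,
# instead of comparing a full-content md5.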
@@ -283,9 +283,10 @@ def getmatches(
     return result


-def getmatches_by_contents(files, j=job.nulljob):
+def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
     """Returns a list of :class:`Match` within ``files`` if their contents is the same.

+    :param bigsize: The size in bytes over which we consider files too big for a full md5.
     :param j: A :ref:`job progress instance <jobs>`.
     """
     size2files = defaultdict(set)
@@ -302,8 +303,13 @@ def getmatches_by_contents(files, j=job.nulljob):
             if first.is_ref and second.is_ref:
                 continue  # Don't spend time comparing two ref pics together.
             if first.md5partial == second.md5partial:
-                if first.md5 == second.md5:
-                    result.append(Match(first, second, 100))
+                if bigsize > 0 and first.size > bigsize:
+                    print(f"first md5chunks {first} {first.md5samples}, second {second} {second.md5samples}")
+                    if first.md5samples == second.md5samples:
+                        result.append(Match(first, second, 100))
+                else:
+                    if first.md5 == second.md5:
+                        result.append(Match(first, second, 100))
         j.add_progress(desc=tr("%d matches found") % len(result))
     return result

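A minimal usage sketch of the new signature (the import path and the empty file list are placeholders for this example; in dupeGuru the Scanner supplies both, as shown further down):

from core import engine  # import path assumed for the sketch

files = []  # stand-in for the File objects gathered by a scan
# Files larger than 100 MiB are compared by md5samples; smaller ones by full md5.
# Leaving bigsize at its default of 0 keeps the old full-hash behaviour.
matches = engine.getmatches_by_contents(files, bigsize=100 * 1024 * 1024)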

core/fs.py (67 changed lines)

@@ -12,6 +12,8 @@
 # and I'm doing it now.

 import hashlib
+from math import floor
+import inspect
 import logging

 from hscommon.util import nonone, get_file_ext
@@ -30,6 +32,11 @@ __all__ = [

 NOT_SET = object()

+# The goal here is to not run out of memory on really big files. However, the chunk
+# size has to be large enough so that the python loop isn't too costly in terms of
+# CPU.
+CHUNK_SIZE = 1024 * 1024  # 1 MiB
+

 class FSError(Exception):
     cls_message = "An error has occured on '{name}' in '{parent}'"
@@ -78,6 +85,7 @@ class File:
         "mtime": 0,
         "md5": "",
         "md5partial": "",
+        "md5samples": []
     }
     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
     # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
@@ -96,6 +104,7 @@ class File:
         result = object.__getattribute__(self, attrname)
         if result is NOT_SET:
             try:
+                print(f"Try get attr for {self} {attrname}")
                 self._read_info(attrname)
             except Exception as e:
                 logging.warning(
@@ -117,32 +126,50 @@ class File:
             self.size = nonone(stats.st_size, 0)
             self.mtime = nonone(stats.st_mtime, 0)
         elif field == "md5partial":
+            print(f"_read_info md5partial {self}")
             try:
-                fp = self.path.open("rb")
-                offset, size = self._get_md5partial_offset_and_size()
-                fp.seek(offset)
-                partialdata = fp.read(size)
-                md5 = hashlib.md5(partialdata)
-                self.md5partial = md5.digest()
-                fp.close()
+                with self.path.open("rb") as fp:
+                    offset, size = self._get_md5partial_offset_and_size()
+                    fp.seek(offset)
+                    partialdata = fp.read(size)
+                    md5 = hashlib.md5(partialdata)
+                    self.md5partial = md5.digest()
             except Exception:
                 pass
         elif field == "md5":
             try:
-                fp = self.path.open("rb")
-                md5 = hashlib.md5()
-                # The goal here is to not run out of memory on really big files. However, the chunk
-                # size has to be large enough so that the python loop isn't too costly in terms of
-                # CPU.
-                CHUNK_SIZE = 1024 * 1024  # 1 mb
-                filedata = fp.read(CHUNK_SIZE)
-                while filedata:
-                    md5.update(filedata)
-                    filedata = fp.read(CHUNK_SIZE)
-                self.md5 = md5.digest()
-                fp.close()
+                with self.path.open("rb") as fp:
+                    md5 = hashlib.md5()
+                    while filedata := fp.read(CHUNK_SIZE):
+                        md5.update(filedata)
+                    self.md5 = md5.digest()
             except Exception:
                 pass
+        elif field == "md5samples":
+            print(f"computing md5chunks for {self}, caller: {inspect.stack()[1][3]}")
+            try:
+                with self.path.open("rb") as fp:
+                    md5chunks = []
+                    # Chunk at 25% of the file
+                    fp.seek(floor(self.size * 25 / 100), 0)
+                    filedata = fp.read(CHUNK_SIZE)
+                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+
+                    # Chunk at 60% of the file
+                    fp.seek(floor(self.size * 60 / 100), 0)
+                    filedata = fp.read(CHUNK_SIZE)
+                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+
+                    # Last chunk of the file
+                    fp.seek(-CHUNK_SIZE, 2)
+                    filedata = fp.read(CHUNK_SIZE)
+                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+
+                    # Use setattr to avoid circular (de)reference
+                    setattr(self, field, tuple(md5chunks))
+            except Exception as e:
+                logging.error(f"Error computing md5samples: {e}")
+                pass

     def _read_all_info(self, attrnames=None):
         """Cache all possible info.
@@ -221,6 +248,8 @@ class Folder(File):
             # What's sensitive here is that we must make sure that subfiles'
             # md5 are always added up in the same order, but we also want a
             # different md5 if a file gets moved in a different subdirectory.
+            print(f"Getting {field} of folder {self}...")
+
             def get_dir_md5_concat():
                 items = self._all_items()
                 items.sort(key=lambda f: f.path)

@@ -87,7 +87,11 @@ class Scanner:
             if self.size_threshold:
                 files = [f for f in files if f.size >= self.size_threshold]
         if self.scan_type in {ScanType.Contents, ScanType.Folders}:
-            return engine.getmatches_by_contents(files, j=j)
+            return engine.getmatches_by_contents(
+                files,
+                bigsize=self.big_file_size_threshold if self.big_file_partial_hashes else 0,
+                j=j
+            )
         else:
             j = j.start_subjob([2, 8])
             kw = {}
@@ -218,4 +222,6 @@ class Scanner:
     scan_type = ScanType.Filename
     scanned_tags = {"artist", "title"}
     size_threshold = 0
+    big_file_partial_hashes = True
+    big_file_size_threshold = 100 * 1024 * 1024
     word_weighting = False

@@ -187,7 +187,14 @@ class DupeGuru(QObject):
         )
         self.model.options["size_threshold"] = (
             threshold * 1024
-        )  # threshold is in KB. the scanner wants bytes
+        )  # threshold is in KB. The scanner wants bytes
+        big_file_size_threshold = (
+            self.prefs.big_file_size_threshold if self.prefs.big_file_partial_hashes else 0
+        )
+        self.model.options["big_file_size_threshold"] = (
+            big_file_size_threshold * 1024 * 1024
+            # threshold is in MiB. The scanner wants bytes
+        )
         scanned_tags = set()
         if self.prefs.scan_tag_track:
             scanned_tags.add("track")

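In short, the Qt layer converts the preference (entered in MB) to bytes and collapses a disabled checkbox into a 0 threshold, which turns the partial-hash path off in getmatches_by_contents(). A sketch of that rule, with prefs_mb and enabled as stand-in names for the preference values:

def big_file_threshold_bytes(prefs_mb: int, enabled: bool) -> int:
    # The dialog stores the threshold in MB; the scanner and engine want bytes.
    # Returning 0 keeps the full-md5 comparison for every file.
    return prefs_mb * 1024 * 1024 if enabled else 0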
@@ -73,6 +73,8 @@ class Preferences(PreferencesBase):
         self.match_similar = get("MatchSimilar", self.match_similar)
         self.ignore_small_files = get("IgnoreSmallFiles", self.ignore_small_files)
         self.small_file_threshold = get("SmallFileThreshold", self.small_file_threshold)
+        self.big_file_partial_hashes = get("BigFilePartialHashes", self.big_file_partial_hashes)
+        self.big_file_size_threshold = get("BigFileSizeThreshold", self.big_file_size_threshold)
         self.scan_tag_track = get("ScanTagTrack", self.scan_tag_track)
         self.scan_tag_artist = get("ScanTagArtist", self.scan_tag_artist)
         self.scan_tag_album = get("ScanTagAlbum", self.scan_tag_album)
@@ -117,6 +119,8 @@ class Preferences(PreferencesBase):
         self.match_similar = False
         self.ignore_small_files = True
         self.small_file_threshold = 10  # KB
+        self.big_file_partial_hashes = False
+        self.big_file_size_threshold = 100  # MB
         self.scan_tag_track = False
         self.scan_tag_artist = True
         self.scan_tag_album = True
@@ -161,6 +165,8 @@ class Preferences(PreferencesBase):
         set_("MatchSimilar", self.match_similar)
         set_("IgnoreSmallFiles", self.ignore_small_files)
         set_("SmallFileThreshold", self.small_file_threshold)
+        set_("BigFilePartialHashes", self.big_file_partial_hashes)
+        set_("BigFileSizeThreshold", self.big_file_size_threshold)
         set_("ScanTagTrack", self.scan_tag_track)
         set_("ScanTagArtist", self.scan_tag_artist)
         set_("ScanTagAlbum", self.scan_tag_album)

@@ -72,6 +72,21 @@ class PreferencesDialog(PreferencesDialogBase):
         spacerItem1 = QSpacerItem(40, 20, QSizePolicy.Expanding, QSizePolicy.Minimum)
         self.horizontalLayout_2.addItem(spacerItem1)
         self.verticalLayout_4.addLayout(self.horizontalLayout_2)
+        self.horizontalLayout_2b = QHBoxLayout()
+        self._setupAddCheckbox(
+            "bigFilePartialHashesBox", tr("Partially hash files bigger than"), self.widget
+        )
+        self.horizontalLayout_2b.addWidget(self.bigFilePartialHashesBox)
+        self.bigSizeThresholdEdit = QLineEdit(self.widget)
+        self.bigSizeThresholdEdit.setSizePolicy(sizePolicy)
+        self.bigSizeThresholdEdit.setMaximumSize(QSize(75, 16777215))
+        self.horizontalLayout_2b.addWidget(self.bigSizeThresholdEdit)
+        self.label_6b = QLabel(self.widget)
+        self.label_6b.setText(tr("MB"))
+        self.horizontalLayout_2b.addWidget(self.label_6b)
+        spacerItem2 = QSpacerItem(40, 20, QSizePolicy.Expanding, QSizePolicy.Minimum)
+        self.horizontalLayout_2b.addItem(spacerItem2)
+        self.verticalLayout_4.addLayout(self.horizontalLayout_2b)
         self._setupAddCheckbox(
             "ignoreHardlinkMatches",
             tr("Ignore duplicates hardlinking to the same file"),
@@ -90,6 +105,8 @@ class PreferencesDialog(PreferencesDialogBase):
         setchecked(self.wordWeightingBox, prefs.word_weighting)
         setchecked(self.ignoreSmallFilesBox, prefs.ignore_small_files)
         self.sizeThresholdEdit.setText(str(prefs.small_file_threshold))
+        setchecked(self.bigFilePartialHashesBox, prefs.big_file_partial_hashes)
+        self.bigSizeThresholdEdit.setText(str(prefs.big_file_size_threshold))

         # Update UI state based on selected scan type
         scan_type = prefs.get_scan_type(AppMode.Standard)
@@ -103,3 +120,5 @@ class PreferencesDialog(PreferencesDialogBase):
         prefs.word_weighting = ischecked(self.wordWeightingBox)
         prefs.ignore_small_files = ischecked(self.ignoreSmallFilesBox)
         prefs.small_file_threshold = tryint(self.sizeThresholdEdit.text())
+        prefs.big_file_partial_hashes = ischecked(self.bigFilePartialHashesBox)
+        prefs.big_file_size_threshold = tryint(self.bigSizeThresholdEdit.text())