Mirror of https://github.com/arsenetar/dupeguru.git (synced 2025-09-11 17:58:17 +00:00)
Add partial hashes optimization for big files

* Big files above the user-selected threshold can be partially hashed in 3 places.
* If the user is willing to take the risk, we consider files with identical md5samples as being identical.
		
Parent: 4641bd6ec9
Commit: e07dfd5955
@@ -283,9 +283,10 @@ def getmatches(
     return result
 
 
-def getmatches_by_contents(files, j=job.nulljob):
+def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
     """Returns a list of :class:`Match` within ``files`` if their contents is the same.
 
+    :param bigsize: The size in bytes over which we consider files too big for a full md5.
     :param j: A :ref:`job progress instance <jobs>`.
     """
     size2files = defaultdict(set)
@@ -302,6 +303,10 @@ def getmatches_by_contents(files, j=job.nulljob):
             if first.is_ref and second.is_ref:
                 continue  # Don't spend time comparing two ref pics together.
             if first.md5partial == second.md5partial:
-                if first.md5 == second.md5:
-                    result.append(Match(first, second, 100))
+                if bigsize > 0 and first.size > bigsize:
+                    if first.md5samples == second.md5samples:
+                        result.append(Match(first, second, 100))
+                else:
+                    if first.md5 == second.md5:
+                        result.append(Match(first, second, 100))
         j.add_progress(desc=tr("%d matches found") % len(result))
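Read as a whole, the new branch turns content matching into a three-stage cascade: candidates are first grouped by equal size, then screened by the cheap md5partial, and only then confirmed with either the sampled md5samples (files above bigsize) or the full-content md5. A minimal standalone sketch of that decision logic, assuming two objects exposing the same attributes as the diff's file class (not a drop-in replacement):

# Sketch only: mirrors the cascade in getmatches_by_contents above.
def is_content_match(first, second, bigsize=0):
    if first.size != second.size:
        return False  # the engine only ever compares files grouped by equal size
    if first.md5partial != second.md5partial:
        return False  # cheap screen: hash of a small fixed region of the file
    if bigsize > 0 and first.size > bigsize:
        # Big file: compare three sampled chunks instead of reading it all.
        return first.md5samples == second.md5samples
    return first.md5 == second.md5  # otherwise: full-content hash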
core/fs.py (51 lines changed)
@@ -12,6 +12,7 @@
 # and I'm doing it now.
 
 import hashlib
+from math import floor
 import logging
 
 from hscommon.util import nonone, get_file_ext
@@ -30,6 +31,11 @@ __all__ = [
 
 NOT_SET = object()
 
+# The goal here is to not run out of memory on really big files. However, the chunk
+# size has to be large enough so that the python loop isn't too costly in terms of
+# CPU.
+CHUNK_SIZE = 1024 * 1024  # 1 MiB
+
 
 class FSError(Exception):
     cls_message = "An error has occured on '{name}' in '{parent}'"
@@ -78,6 +84,7 @@ class File:
         "mtime": 0,
         "md5": "",
         "md5partial": "",
+        "md5samples": [],
     }
     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
     # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
@@ -117,32 +124,47 @@ class File:
             self.size = nonone(stats.st_size, 0)
             self.mtime = nonone(stats.st_mtime, 0)
         elif field == "md5partial":
             try:
-                fp = self.path.open("rb")
-                offset, size = self._get_md5partial_offset_and_size()
-                fp.seek(offset)
-                partialdata = fp.read(size)
-                md5 = hashlib.md5(partialdata)
-                self.md5partial = md5.digest()
-                fp.close()
+                with self.path.open("rb") as fp:
+                    offset, size = self._get_md5partial_offset_and_size()
+                    fp.seek(offset)
+                    partialdata = fp.read(size)
+                    md5 = hashlib.md5(partialdata)
+                    self.md5partial = md5.digest()
             except Exception:
                 pass
         elif field == "md5":
             try:
-                fp = self.path.open("rb")
-                md5 = hashlib.md5()
-                # The goal here is to not run out of memory on really big files. However, the chunk
-                # size has to be large enough so that the python loop isn't too costly in terms of
-                # CPU.
-                CHUNK_SIZE = 1024 * 1024  # 1 mb
-                filedata = fp.read(CHUNK_SIZE)
-                while filedata:
-                    md5.update(filedata)
-                    filedata = fp.read(CHUNK_SIZE)
-                self.md5 = md5.digest()
-                fp.close()
+                with self.path.open("rb") as fp:
+                    md5 = hashlib.md5()
+                    while filedata := fp.read(CHUNK_SIZE):
+                        md5.update(filedata)
+                    self.md5 = md5.digest()
             except Exception:
                 pass
+        elif field == "md5samples":
+            try:
+                with self.path.open("rb") as fp:
+                    md5chunks = []
+                    # Chunk at 25% of the file
+                    fp.seek(floor(self.size * 25 / 100), 0)
+                    filedata = fp.read(CHUNK_SIZE)
+                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+
+                    # Chunk at 60% of the file
+                    fp.seek(floor(self.size * 60 / 100), 0)
+                    filedata = fp.read(CHUNK_SIZE)
+                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+
+                    # Last chunk of the file
+                    fp.seek(-CHUNK_SIZE, 2)
+                    filedata = fp.read(CHUNK_SIZE)
+                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+
+                    # Use setattr to avoid circular (de)reference
+                    setattr(self, field, tuple(md5chunks))
+            except Exception as e:
+                logging.error(f"Error computing md5samples: {e}")
 
     def _read_all_info(self, attrnames=None):
         """Cache all possible info.
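The payoff of md5samples is that confirming a big-file candidate costs three 1 MiB reads (at 25%, at 60%, and the tail) instead of a full scan. Below is a self-contained sketch of the same sampling; the sample_md5 name and signature are illustrative, not part of the patch. Note that fp.seek(-CHUNK_SIZE, 2) assumes the file holds at least CHUNK_SIZE bytes, which is guaranteed in practice by the big-file threshold.

import hashlib
from math import floor

CHUNK_SIZE = 1024 * 1024  # 1 MiB, same constant as in the diff

def sample_md5(path, size):
    # Hash three probes: 25% in, 60% in, and the last CHUNK_SIZE bytes.
    probes = (
        (floor(size * 25 / 100), 0),  # whence=0: offset from file start
        (floor(size * 60 / 100), 0),
        (-CHUNK_SIZE, 2),             # whence=2: offset from file end
    )
    chunks = []
    with open(path, "rb") as fp:
        for offset, whence in probes:
            fp.seek(offset, whence)
            chunks.append(hashlib.md5(fp.read(CHUNK_SIZE)).hexdigest())
    return tuple(chunks)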
@@ -87,7 +87,11 @@ class Scanner:
             if self.size_threshold:
                 files = [f for f in files if f.size >= self.size_threshold]
         if self.scan_type in {ScanType.Contents, ScanType.Folders}:
-            return engine.getmatches_by_contents(files, j=j)
+            return engine.getmatches_by_contents(
+                files,
+                bigsize=self.big_file_size_threshold if self.big_file_partial_hashes else 0,
+                j=j,
+            )
         else:
             j = j.start_subjob([2, 8])
             kw = {}
@@ -218,4 +222,6 @@ class Scanner:
     scan_type = ScanType.Filename
     scanned_tags = {"artist", "title"}
     size_threshold = 0
+    big_file_partial_hashes = True
+    big_file_size_threshold = 100 * 1024 * 1024
     word_weighting = False
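Since bigsize=0 means "never use sampled hashes", disabling the feature and leaving the threshold at zero are equivalent from the engine's point of view. An illustrative combination of the two new class attributes, with the default values from the diff:

# Illustrative only: how the scanner folds its two attributes into the
# single bigsize argument seen by getmatches_by_contents.
big_file_partial_hashes = True
big_file_size_threshold = 100 * 1024 * 1024  # bytes

bigsize = big_file_size_threshold if big_file_partial_hashes else 0
assert bigsize == 104857600  # files larger than this use md5samples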
@@ -187,7 +187,13 @@ class DupeGuru(QObject):
         )
         self.model.options["size_threshold"] = (
             threshold * 1024
-        )  # threshold is in KB. the scanner wants bytes
+        )  # threshold is in KB. The scanner wants bytes
+        big_file_size_threshold = (
+            self.prefs.big_file_size_threshold if self.prefs.big_file_partial_hashes else 0
+        )
+        self.model.options["big_file_size_threshold"] = (
+            big_file_size_threshold * 1024 * 1024
+        )  # threshold is in MiB. The scanner wants bytes
         scanned_tags = set()
         if self.prefs.scan_tag_track:
             scanned_tags.add("track")
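The two preference values deliberately arrive in different units, so the conversions differ: the small-file threshold is stored in KB, the big-file threshold in MB, and the scanner wants bytes for both. A worked example using the default preference values:

# Worked example of the unit conversions above, with the defaults.
small_file_threshold = 10      # KB, from prefs
big_file_size_threshold = 100  # MB, from prefs
assert small_file_threshold * 1024 == 10_240                  # bytes
assert big_file_size_threshold * 1024 * 1024 == 104_857_600  # bytes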
@@ -73,6 +73,8 @@ class Preferences(PreferencesBase):
         self.match_similar = get("MatchSimilar", self.match_similar)
         self.ignore_small_files = get("IgnoreSmallFiles", self.ignore_small_files)
         self.small_file_threshold = get("SmallFileThreshold", self.small_file_threshold)
+        self.big_file_partial_hashes = get("BigFilePartialHashes", self.big_file_partial_hashes)
+        self.big_file_size_threshold = get("BigFileSizeThreshold", self.big_file_size_threshold)
         self.scan_tag_track = get("ScanTagTrack", self.scan_tag_track)
         self.scan_tag_artist = get("ScanTagArtist", self.scan_tag_artist)
         self.scan_tag_album = get("ScanTagAlbum", self.scan_tag_album)
@@ -117,6 +119,8 @@ class Preferences(PreferencesBase):
         self.match_similar = False
         self.ignore_small_files = True
         self.small_file_threshold = 10  # KB
+        self.big_file_partial_hashes = False
+        self.big_file_size_threshold = 100  # MB
         self.scan_tag_track = False
         self.scan_tag_artist = True
         self.scan_tag_album = True
@@ -161,6 +165,8 @@ class Preferences(PreferencesBase):
         set_("MatchSimilar", self.match_similar)
         set_("IgnoreSmallFiles", self.ignore_small_files)
         set_("SmallFileThreshold", self.small_file_threshold)
+        set_("BigFilePartialHashes", self.big_file_partial_hashes)
+        set_("BigFileSizeThreshold", self.big_file_size_threshold)
         set_("ScanTagTrack", self.scan_tag_track)
         set_("ScanTagArtist", self.scan_tag_artist)
         set_("ScanTagAlbum", self.scan_tag_album)
@@ -72,6 +72,21 @@ class PreferencesDialog(PreferencesDialogBase):
         spacerItem1 = QSpacerItem(40, 20, QSizePolicy.Expanding, QSizePolicy.Minimum)
         self.horizontalLayout_2.addItem(spacerItem1)
         self.verticalLayout_4.addLayout(self.horizontalLayout_2)
+        self.horizontalLayout_2b = QHBoxLayout()
+        self._setupAddCheckbox(
+            "bigFilePartialHashesBox", tr("Partially hash files bigger than"), self.widget
+        )
+        self.horizontalLayout_2b.addWidget(self.bigFilePartialHashesBox)
+        self.bigSizeThresholdEdit = QLineEdit(self.widget)
+        self.bigSizeThresholdEdit.setSizePolicy(sizePolicy)
+        self.bigSizeThresholdEdit.setMaximumSize(QSize(75, 16777215))
+        self.horizontalLayout_2b.addWidget(self.bigSizeThresholdEdit)
+        self.label_6b = QLabel(self.widget)
+        self.label_6b.setText(tr("MB"))
+        self.horizontalLayout_2b.addWidget(self.label_6b)
+        spacerItem2 = QSpacerItem(40, 20, QSizePolicy.Expanding, QSizePolicy.Minimum)
+        self.horizontalLayout_2b.addItem(spacerItem2)
+        self.verticalLayout_4.addLayout(self.horizontalLayout_2b)
         self._setupAddCheckbox(
             "ignoreHardlinkMatches",
             tr("Ignore duplicates hardlinking to the same file"),
@@ -90,6 +105,8 @@ class PreferencesDialog(PreferencesDialogBase):
         setchecked(self.wordWeightingBox, prefs.word_weighting)
         setchecked(self.ignoreSmallFilesBox, prefs.ignore_small_files)
         self.sizeThresholdEdit.setText(str(prefs.small_file_threshold))
+        setchecked(self.bigFilePartialHashesBox, prefs.big_file_partial_hashes)
+        self.bigSizeThresholdEdit.setText(str(prefs.big_file_size_threshold))
 
         # Update UI state based on selected scan type
         scan_type = prefs.get_scan_type(AppMode.Standard)
@@ -103,3 +120,5 @@ class PreferencesDialog(PreferencesDialogBase):
         prefs.word_weighting = ischecked(self.wordWeightingBox)
         prefs.ignore_small_files = ischecked(self.ignoreSmallFilesBox)
         prefs.small_file_threshold = tryint(self.sizeThresholdEdit.text())
+        prefs.big_file_partial_hashes = ischecked(self.bigFilePartialHashesBox)
+        prefs.big_file_size_threshold = tryint(self.bigSizeThresholdEdit.text())
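The layout code references self.bigFilePartialHashesBox without ever assigning it, which only works if _setupAddCheckbox binds the new widget onto self by name. That helper's body is not shown in this diff; the following is a hypothetical sketch of what such a helper plausibly does, not dupeGuru's actual implementation:

# Hypothetical sketch of a helper like _setupAddCheckbox; the real one
# lives in PreferencesDialogBase and is not part of this diff.
from PyQt5.QtWidgets import QCheckBox

class PreferencesDialogBase:
    def _setupAddCheckbox(self, name, label, parent=None):
        checkbox = QCheckBox(label, parent)
        checkbox.setObjectName(name)
        # Binding by name is what lets later code reference
        # self.bigFilePartialHashesBox without an explicit assignment.
        setattr(self, name, checkbox)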