mirror of
				https://github.com/arsenetar/dupeguru.git
				synced 2025-09-11 17:58:17 +00:00 
			
		
		
		
	Improved dev docs
This commit is contained in:
		
							parent
							
								
									e99e2b18e0
								
							
						
					
					
						commit
						78c3c8ec2d
					
				| @ -44,10 +44,10 @@ def unpack_fields(fields): | ||||
|     return result | ||||
| 
 | ||||
| def compare(first, second, flags=()): | ||||
|     """Returns the % of words that match between first and second | ||||
|     """Returns the % of words that match between ``first`` and ``second`` | ||||
|      | ||||
|     The result is a int in the range 0..100. | ||||
|     First and second can be either a string or a list. | ||||
|     The result is an ``int`` in the range 0..100. | ||||
|     ``first`` and ``second`` can be either a string or a list (of words). | ||||
|     """ | ||||
|     if not (first and second): | ||||
|         return 0 | ||||
| @ -76,9 +76,10 @@ def compare(first, second, flags=()): | ||||
|     return result | ||||
| 
 | ||||
| def compare_fields(first, second, flags=()): | ||||
|     """Returns the score for the lowest matching fields. | ||||
|     """Returns the score for the lowest matching :ref:`fields`. | ||||
|      | ||||
|     first and second must be lists of lists of string. | ||||
|     ``first`` and ``second`` must be lists of lists of string. Each sub-list is then compared with | ||||
|     :func:`compare`.  | ||||
|     """ | ||||
|     if len(first) != len(second): | ||||
|         return 0 | ||||
| @ -98,13 +99,14 @@ def compare_fields(first, second, flags=()): | ||||
|             if matched_field: | ||||
|                 second.remove(matched_field) | ||||
|     else: | ||||
|         results = [compare(word1, word2, flags) for word1, word2 in zip(first, second)] | ||||
|         results = [compare(field1, field2, flags) for field1, field2 in zip(first, second)] | ||||
|     return min(results) if results else 0 | ||||
| 
 | ||||
| def build_word_dict(objects, j=job.nulljob): | ||||
|     """Returns a dict of objects mapped by their words. | ||||
|      | ||||
|     objects must have a 'words' attribute being a list of strings or a list of lists of strings. | ||||
|     objects must have a ``words`` attribute being a list of strings or a list of lists of strings | ||||
|     (:ref:`fields`). | ||||
|      | ||||
|     The result will be a dict with words as keys, lists of objects as values. | ||||
|     """ | ||||
| @ -115,7 +117,11 @@ def build_word_dict(objects, j=job.nulljob): | ||||
|     return result | ||||
| 
 | ||||
| def merge_similar_words(word_dict): | ||||
|     """Take all keys in word_dict that are similar, and merge them together. | ||||
|     """Take all keys in ``word_dict`` that are similar, and merge them together. | ||||
|      | ||||
|     ``word_dict`` has been built with :func:`build_word_dict`. Similarity is computed with Python's | ||||
|     ``difflib.get_close_matches()``, which scores candidates by their ``difflib.SequenceMatcher`` | ||||
|     similarity ratio and keeps the ones that are close enough to the word. | ||||
|     """ | ||||
|     keys = list(word_dict.keys()) | ||||
|     keys.sort(key=len)# we want the shortest word to stay | ||||
| @ -131,7 +137,9 @@ def merge_similar_words(word_dict): | ||||
|             keys.remove(similar) | ||||
| 
 | ||||
| def reduce_common_words(word_dict, threshold): | ||||
|     """Remove all objects from word_dict values where the object count >= threshold | ||||
|     """Remove all objects from ``word_dict`` values where the object count >= ``threshold`` | ||||
|      | ||||
|     ``word_dict`` has been built with :func:`build_word_dict`. | ||||
|      | ||||
|     The exception to this removal are the objects where all the words of the object are common. | ||||
|     Because if we remove them, we will miss some duplicates! | ||||
| @ -150,13 +158,42 @@ def reduce_common_words(word_dict, threshold): | ||||
|             del word_dict[word] | ||||
| 
 | ||||
| Match = namedtuple('Match', 'first second percentage') | ||||
| Match.__doc__ = """Represents a match between two :class:`~core.fs.File`. | ||||
| 
 | ||||
| Regardless of the matching method, when two files are determined to match, a Match pair is created, | ||||
| which holds, of course, the two matched files, but also their match "level". | ||||
| 
 | ||||
| .. attribute:: first | ||||
| 
 | ||||
|     first file of the pair. | ||||
| 
 | ||||
| .. attribute:: second | ||||
| 
 | ||||
|     second file of the pair. | ||||
| 
 | ||||
| .. attribute:: percentage | ||||
| 
 | ||||
|     their match level according to the scan method which found the match. int from 1 to 100. For | ||||
|     exact scan methods, such as Contents scans, this will always be 100. | ||||
| """ | ||||
| 
 | ||||
| def get_match(first, second, flags=()): | ||||
|     #it is assumed here that first and second both have a "words" attribute | ||||
|     percentage = compare(first.words, second.words, flags) | ||||
|     return Match(first, second, percentage) | ||||
| 
 | ||||
| def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,  | ||||
|     no_field_order=False, j=job.nulljob): | ||||
| def getmatches( | ||||
|         objects, min_match_percentage=0, match_similar_words=False, weight_words=False,  | ||||
|         no_field_order=False, j=job.nulljob): | ||||
|     """Returns a list of :class:`Match` within ``objects`` after fuzzily matching their words. | ||||
|      | ||||
|     :param objects: List of :class:`~core.fs.File` to match. | ||||
|     :param int min_match_percentage: minimum % of words that have to match. | ||||
|     :param bool match_similar_words: make similar words (see :func:`merge_similar_words`) match. | ||||
|     :param bool weight_words: longer words are worth more in match % computations. | ||||
|     :param bool no_field_order: match :ref:`fields` regardless of their order. | ||||
|     :param j: A :ref:`job progress instance <jobs>`. | ||||
|     """ | ||||
|     COMMON_WORD_THRESHOLD = 50 | ||||
|     LIMIT = 5000000 | ||||
|     j = j.start_subjob(2) | ||||
| @ -203,6 +240,14 @@ def getmatches(objects, min_match_percentage=0, match_similar_words=False, weigh | ||||
|     return result | ||||
| 
 | ||||
| def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob): | ||||
|     """Returns a list of :class:`Match` within ``files`` if their contents are the same. | ||||
|      | ||||
|     :param str sizeattr: attribute name of the :class:`~core.fs.File` that returns the size of the | ||||
|                          file to use for comparison. | ||||
|     :param bool partial: if true, will use the "md5partial" attribute instead of "md5" to compute | ||||
|                          contents hash. | ||||
|     :param j: A :ref:`job progress instance <jobs>`. | ||||
|     """ | ||||
|     j = j.start_subjob([2, 8]) | ||||
|     size2files = defaultdict(set) | ||||
|     for file in j.iter_with_progress(files, tr("Read size of %d/%d files")): | ||||
| @ -240,6 +285,15 @@ class Group: | ||||
|     .. attribute:: unordered | ||||
|      | ||||
|         Set of duplicates in the group (including the :attr:`ref`). | ||||
|      | ||||
|     .. attribute:: dupes | ||||
|      | ||||
|         An ordered list of the group's duplicates, without :attr:`ref`. Equivalent to | ||||
|         ``ordered[1:]`` | ||||
|      | ||||
|     .. attribute:: percentage | ||||
|      | ||||
|         Average match percentage of match pairs containing :attr:`ref`. | ||||
|     """ | ||||
|     #---Override | ||||
|     def __init__(self): | ||||
| @ -362,6 +416,8 @@ class Group: | ||||
|             pass | ||||
|      | ||||
|     def switch_ref(self, with_dupe): | ||||
|         """Make the :attr:`ref` dupe of the group switch position with ``with_dupe``. | ||||
|         """ | ||||
|         if self.ref.is_ref: | ||||
|             return False | ||||
|         try: | ||||
| @ -392,6 +448,10 @@ class Group: | ||||
|      | ||||
| 
 | ||||
| def get_groups(matches, j=job.nulljob): | ||||
|     """Returns a list of :class:`Group` from ``matches``. | ||||
|      | ||||
|     Create groups out of match pairs in the smartest way possible. | ||||
|     """ | ||||
|     matches.sort(key=lambda match: -match.percentage) | ||||
|     dupe2group = {} | ||||
|     groups = [] | ||||
|  | ||||
| @ -195,6 +195,8 @@ class Results(Markable): | ||||
|         self.__dupes = None | ||||
|      | ||||
|     def get_group_of_duplicate(self, dupe): | ||||
|         """Returns :class:`~core.engine.Group` in which ``dupe`` belongs. | ||||
|         """ | ||||
|         try: | ||||
|             return self.__group_of_duplicate[dupe] | ||||
|         except (TypeError, KeyError): | ||||
| @ -203,6 +205,12 @@ class Results(Markable): | ||||
|     is_markable = _is_markable | ||||
|      | ||||
|     def load_from_xml(self, infile, get_file, j=nulljob): | ||||
|         """Load results from ``infile``. | ||||
|          | ||||
|         :param infile: a file or path pointing to an XML file created with :meth:`save_to_xml`. | ||||
|         :param get_file: a function f(path) returning a :class:`~core.fs.File` wrapping the path. | ||||
|         :param j: A :ref:`job progress instance <jobs>`. | ||||
|         """ | ||||
|         def do_match(ref_file, other_files, group): | ||||
|             if not other_files: | ||||
|                 return | ||||
| @ -255,6 +263,8 @@ class Results(Markable): | ||||
|         self.is_modified = False | ||||
|      | ||||
|     def make_ref(self, dupe): | ||||
|         """Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group. | ||||
|         """ | ||||
|         g = self.get_group_of_duplicate(dupe) | ||||
|         r = g.ref | ||||
|         if not g.switch_ref(dupe): | ||||
| @ -271,8 +281,14 @@ class Results(Markable): | ||||
|         return True | ||||
|      | ||||
|     def perform_on_marked(self, func, remove_from_results): | ||||
|         # Performs `func` on all marked dupes. If an EnvironmentError is raised during the call, | ||||
|         # the problematic dupe is added to self.problems. | ||||
|         """Performs ``func`` on all marked dupes. | ||||
|          | ||||
|         If an ``EnvironmentError`` is raised during the call, the problematic dupe is added to | ||||
|         self.problems. | ||||
|          | ||||
|         :param bool remove_from_results: If true, dupes which had ``func`` applied and didn't cause | ||||
|                                          any problem are removed from the results. | ||||
|         """ | ||||
|         self.problems = [] | ||||
|         to_remove = [] | ||||
|         marked = (dupe for dupe in self.dupes if self.is_marked(dupe)) | ||||
| @ -317,9 +333,12 @@ class Results(Markable): | ||||
|         self.is_modified = bool(self.__groups) | ||||
|      | ||||
|     def save_to_xml(self, outfile): | ||||
|         """Save results to ``outfile`` in XML. | ||||
|          | ||||
|         :param outfile: file object or path. | ||||
|         """ | ||||
|         self.apply_filter(None) | ||||
|         root = ET.Element('results') | ||||
|         # writer = XMLGenerator(outfile, 'utf-8') | ||||
|         for g in self.groups: | ||||
|             group_elem = ET.SubElement(root, 'group') | ||||
|             dupe2index = {} | ||||
| @ -364,13 +383,26 @@ class Results(Markable): | ||||
|         self.is_modified = False | ||||
|      | ||||
|     def sort_dupes(self, key, asc=True, delta=False): | ||||
|         """Sort :attr:`dupes` according to ``key``. | ||||
|          | ||||
|         :param str key: key attribute name to sort with. | ||||
|         :param bool asc: If false, sorting is reversed. | ||||
|         :param bool delta: If true, sorting occurs using :ref:`delta values <deltavalues>`. | ||||
|         """ | ||||
|         if not self.__dupes: | ||||
|             self.__get_dupe_list() | ||||
|         keyfunc = lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta) | ||||
|         self.__dupes.sort(key=keyfunc, reverse=not asc) | ||||
|         self.__dupes_sort_descriptor = (key,asc,delta) | ||||
|      | ||||
|     def sort_groups(self,key,asc=True): | ||||
|     def sort_groups(self, key, asc=True): | ||||
|         """Sort :attr:`groups` according to ``key``. | ||||
|          | ||||
|         The :attr:`~core.engine.Group.ref` of each group is used to extract values for sorting. | ||||
|          | ||||
|         :param str key: key attribute name to sort with. | ||||
|         :param bool asc: If false, sorting is reversed. | ||||
|         """ | ||||
|         keyfunc = lambda g: self.app._get_group_sort_key(g, key) | ||||
|         self.groups.sort(key=keyfunc, reverse=not asc) | ||||
|         self.__groups_sort_descriptor = (key,asc) | ||||
|  | ||||
| @ -3,5 +3,34 @@ core.engine | ||||
| 
 | ||||
| .. automodule:: core.engine | ||||
|      | ||||
| .. autoclass:: core.engine.Group | ||||
|     :members: | ||||
|     .. autoclass:: Match | ||||
|      | ||||
|     .. autoclass:: Group | ||||
|         :members: | ||||
|      | ||||
|     .. autofunction:: build_word_dict | ||||
|     .. autofunction:: compare | ||||
|     .. autofunction:: compare_fields | ||||
|     .. autofunction:: getmatches | ||||
|     .. autofunction:: getmatches_by_contents | ||||
|     .. autofunction:: get_groups | ||||
|     .. autofunction:: merge_similar_words | ||||
|     .. autofunction:: reduce_common_words | ||||
|      | ||||
| .. _fields: | ||||
| 
 | ||||
| Fields | ||||
| ------ | ||||
| 
 | ||||
| Fields are groups of words which each represent a significant part of the whole name. This concept | ||||
| is significant in music file names, where we often have names like "My Artist - a very long title | ||||
| with many many words". | ||||
| 
 | ||||
| This title has 10 words. If you run a scan with a bit of tolerance, let's say 90%, you'll be able | ||||
| to find a dupe that has only one "many" in the song title. However, you would also get false | ||||
| duplicates from a title like "My Giraffe - a very long title with many many words", which is of | ||||
| course a very different song and it doesn't make sense to match them. | ||||
| 
 | ||||
| When matching by fields, each field (separated by "-") is considered as a separate string to match | ||||
| independently. After all fields are matched, the lowest result is kept. In the "Giraffe" example we | ||||
| gave, the result would be 50% instead of 90% in normal mode. | ||||
|  | ||||
| @ -23,6 +23,8 @@ codebase. For example, when performing "Remove Selected From Results", | ||||
| ``base.app.DupeGuru.remove_duplicates()`` on the PyQt side, are respectively called to perform the | ||||
| thing. All of this is quite ugly, I know (see the "Refactoring" section below). | ||||
| 
 | ||||
| .. _jobs: | ||||
| 
 | ||||
| Jobs | ||||
| ---- | ||||
| 
 | ||||
|  | ||||
| @ -45,6 +45,8 @@ The dupeGuru results, when in normal mode, are sorted according to duplicate gro | ||||
| * Hold Shift and click on it. | ||||
| * Press Space to mark all selected duplicates. | ||||
| 
 | ||||
| .. _deltavalues: | ||||
| 
 | ||||
| Delta Values | ||||
| ------------ | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user