diff --git a/core/engine.py b/core/engine.py index aaf418c8..4336b022 100644 --- a/core/engine.py +++ b/core/engine.py @@ -44,10 +44,10 @@ def unpack_fields(fields): return result def compare(first, second, flags=()): - """Returns the % of words that match between first and second + """Returns the % of words that match between ``first`` and ``second`` - The result is a int in the range 0..100. - First and second can be either a string or a list. + The result is an ``int`` in the range 0..100. + ``first`` and ``second`` can be either a string or a list (of words). """ if not (first and second): return 0 @@ -76,9 +76,10 @@ def compare(first, second, flags=()): return result def compare_fields(first, second, flags=()): - """Returns the score for the lowest matching fields. + """Returns the score for the lowest matching :ref:`fields`. - first and second must be lists of lists of string. + ``first`` and ``second`` must be lists of lists of string. Each sub-list is then compared with + :func:`compare`. """ if len(first) != len(second): return 0 @@ -98,13 +99,14 @@ def compare_fields(first, second, flags=()): if matched_field: second.remove(matched_field) else: - results = [compare(word1, word2, flags) for word1, word2 in zip(first, second)] + results = [compare(field1, field2, flags) for field1, field2 in zip(first, second)] return min(results) if results else 0 def build_word_dict(objects, j=job.nulljob): """Returns a dict of objects mapped by their words. - objects must have a 'words' attribute being a list of strings or a list of lists of strings. + objects must have a ``words`` attribute being a list of strings or a list of lists of strings + (:ref:`fields`). The result will be a dict with words as keys, lists of objects as values. """ @@ -115,7 +117,11 @@ def build_word_dict(objects, j=job.nulljob): return result def merge_similar_words(word_dict): - """Take all keys in word_dict that are similar, and merge them together. 
+ """Take all keys in ``word_dict`` that are similar, and merge them together. + + ``word_dict`` has been built with :func:`build_word_dict`. Similarity is computed with Python's + ``difflib.get_close_matches()``, which computes the number of edits that are necessary to make + a word equal to the other. """ keys = list(word_dict.keys()) keys.sort(key=len)# we want the shortest word to stay @@ -131,7 +137,9 @@ def merge_similar_words(word_dict): keys.remove(similar) def reduce_common_words(word_dict, threshold): - """Remove all objects from word_dict values where the object count >= threshold + """Remove all objects from ``word_dict`` values where the object count >= ``threshold`` + + ``word_dict`` has been built with :func:`build_word_dict`. The exception to this removal are the objects where all the words of the object are common. Because if we remove them, we will miss some duplicates! @@ -150,13 +158,42 @@ def reduce_common_words(word_dict, threshold): del word_dict[word] Match = namedtuple('Match', 'first second percentage') +Match.__doc__ = """Represents a match between two :class:`~core.fs.File`. + +Regardless of the matching method, when two files are determined to match, a Match pair is created, +which holds, of course, the two matched files, but also their match "level". + +.. attribute:: first + + first file of the pair. + +.. attribute:: second + + second file of the pair. + +.. attribute:: percentage + + their match level according to the scan method which found the match. int from 1 to 100. For + exact scan methods, such as Contents scans, this will always be 100. 
+""" + def get_match(first, second, flags=()): #it is assumed here that first and second both have a "words" attribute percentage = compare(first.words, second.words, flags) return Match(first, second, percentage) -def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False, - no_field_order=False, j=job.nulljob): +def getmatches( + objects, min_match_percentage=0, match_similar_words=False, weight_words=False, + no_field_order=False, j=job.nulljob): + """Returns a list of :class:`Match` within ``objects`` after fuzzily matching their words. + + :param objects: List of :class:`~core.fs.File` to match. + :param int min_match_percentage: minimum % of words that have to match. + :param bool match_similar_words: make similar words (see :func:`merge_similar_words`) match. + :param bool weight_words: longer words are worth more in match % computations. + :param bool no_field_order: match :ref:`fields` regardless of their order. + :param j: A :ref:`job progress instance <jobs>`. + """ COMMON_WORD_THRESHOLD = 50 LIMIT = 5000000 j = j.start_subjob(2) @@ -203,6 +240,14 @@ def getmatches(objects, min_match_percentage=0, match_similar_words=False, weigh return result def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob): + """Returns a list of :class:`Match` within ``files`` if their contents is the same. + + :param str sizeattr: attribute name of the :class:`~core.fs.File` that returns the size of the + file to use for comparison. + :param bool partial: if true, will use the "md5partial" attribute instead of "md5" to compute + contents hash. + :param j: A :ref:`job progress instance <jobs>`. + """ j = j.start_subjob([2, 8]) size2files = defaultdict(set) for file in j.iter_with_progress(files, tr("Read size of %d/%d files")): @@ -240,6 +285,15 @@ class Group: .. attribute:: unordered Set duplicates in the group (including the :attr:`ref`). + + .. attribute:: dupes + + An ordered list of the group's duplicates, without :attr:`ref`. 
Equivalent to + ``ordered[1:]`` + + .. attribute:: percentage + + Average match percentage of match pairs containing :attr:`ref`. """ #---Override def __init__(self): @@ -362,6 +416,8 @@ class Group: pass def switch_ref(self, with_dupe): + """Make the :attr:`ref` dupe of the group switch position with ``with_dupe``. + """ if self.ref.is_ref: return False try: @@ -392,6 +448,10 @@ class Group: def get_groups(matches, j=job.nulljob): + """Returns a list of :class:`Group` from ``matches``. + + Create groups out of match pairs in the smartest way possible. + """ matches.sort(key=lambda match: -match.percentage) dupe2group = {} groups = [] diff --git a/core/results.py b/core/results.py index 5015b4e4..724a6d48 100644 --- a/core/results.py +++ b/core/results.py @@ -195,6 +195,8 @@ class Results(Markable): self.__dupes = None def get_group_of_duplicate(self, dupe): + """Returns :class:`~core.engine.Group` in which ``dupe`` belongs. + """ try: return self.__group_of_duplicate[dupe] except (TypeError, KeyError): @@ -203,6 +205,12 @@ class Results(Markable): is_markable = _is_markable def load_from_xml(self, infile, get_file, j=nulljob): + """Load results from ``infile``. + + :param infile: a file or path pointing to an XML file created with :meth:`save_to_xml`. + :param get_file: a function f(path) returning a :class:`~core.fs.File` wrapping the path. + :param j: A :ref:`job progress instance <jobs>`. + """ def do_match(ref_file, other_files, group): if not other_files: return @@ -255,6 +263,8 @@ class Results(Markable): self.is_modified = False def make_ref(self, dupe): + """Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group. + """ g = self.get_group_of_duplicate(dupe) r = g.ref if not g.switch_ref(dupe): @@ -271,8 +281,14 @@ class Results(Markable): return True def perform_on_marked(self, func, remove_from_results): - # Performs `func` on all marked dupes. 
If an EnvironmentError is raised during the call, - # the problematic dupe is added to self.problems. + """Performs ``func`` on all marked dupes. + + If an ``EnvironmentError`` is raised during the call, the problematic dupe is added to + self.problems. + + :param bool remove_from_results: If true, dupes which had ``func`` applied and didn't cause + any problem are removed from the results. + """ self.problems = [] to_remove = [] marked = (dupe for dupe in self.dupes if self.is_marked(dupe)) @@ -317,9 +333,12 @@ class Results(Markable): self.is_modified = bool(self.__groups) def save_to_xml(self, outfile): + """Save results to ``outfile`` in XML. + + :param outfile: file object or path. + """ self.apply_filter(None) root = ET.Element('results') - # writer = XMLGenerator(outfile, 'utf-8') for g in self.groups: group_elem = ET.SubElement(root, 'group') dupe2index = {} @@ -364,13 +383,26 @@ class Results(Markable): self.is_modified = False def sort_dupes(self, key, asc=True, delta=False): + """Sort :attr:`dupes` according to ``key``. + + :param str key: key attribute name to sort with. + :param bool asc: If false, sorting is reversed. + :param bool delta: If true, sorting occurs using :ref:`delta values <deltavalues>`. + """ if not self.__dupes: self.__get_dupe_list() keyfunc = lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta) self.__dupes.sort(key=keyfunc, reverse=not asc) self.__dupes_sort_descriptor = (key,asc,delta) - def sort_groups(self,key,asc=True): + def sort_groups(self, key, asc=True): + """Sort :attr:`groups` according to ``key``. + + The :attr:`~core.engine.Group.ref` of each group is used to extract values for sorting. + + :param str key: key attribute name to sort with. + :param bool asc: If false, sorting is reversed. 
+ """ keyfunc = lambda g: self.app._get_group_sort_key(g, key) self.groups.sort(key=keyfunc, reverse=not asc) self.__groups_sort_descriptor = (key,asc) diff --git a/help/en/developer/core/engine.rst b/help/en/developer/core/engine.rst index 20bbaed4..3258c776 100644 --- a/help/en/developer/core/engine.rst +++ b/help/en/developer/core/engine.rst @@ -2,6 +2,35 @@ core.engine =========== .. automodule:: core.engine + + .. autoclass:: Match + + .. autoclass:: Group + :members: + + .. autofunction:: build_word_dict + .. autofunction:: compare + .. autofunction:: compare_fields + .. autofunction:: getmatches + .. autofunction:: getmatches_by_contents + .. autofunction:: get_groups + .. autofunction:: merge_similar_words + .. autofunction:: reduce_common_words + +.. _fields: -.. autoclass:: core.engine.Group - :members: +Fields +------ + +Fields are groups of words which each represent a significant part of the whole name. This concept +is significant in music file names, where we often have names like "My Artist - a very long title +with many many words". + +This title has 10 words. If you run a scan with a bit of tolerance, let's say 90%, you'll be able +to find a dupe that has only one "many" in the song title. However, you would also get false +duplicates from a title like "My Giraffe - a very long title with many many words", which is of +course a very different song and it doesn't make sense to match them. + +When matching by fields, each field (separated by "-") is considered as a separate string to match +independently. After all fields are matched, the lowest result is kept. In the "Giraffe" example we +gave, the result would be 50% instead of 90% in normal mode. diff --git a/help/en/developer/index.rst b/help/en/developer/index.rst index 935fcaf8..f242b49c 100644 --- a/help/en/developer/index.rst +++ b/help/en/developer/index.rst @@ -23,6 +23,8 @@ codebase. 
For example, when performing "Remove Selected From Results", ``base.app.DupeGuru.remove_duplicates()`` on the PyQt side, are respectively called to perform the thing. All of this is quite ugly, I know (see the "Refactoring" section below). +.. _jobs: + Jobs ---- diff --git a/help/en/results.rst b/help/en/results.rst index c161cfed..515b8a31 100644 --- a/help/en/results.rst +++ b/help/en/results.rst @@ -45,6 +45,8 @@ The dupeGuru results, when in normal mode, are sorted according to duplicate gro * Hold Shift and click on it. * Press Space to mark all selected duplicates. +.. _deltavalues: + Delta Values ------------