Improved dev docs

2025-07-23 22:13:19 +00:00 · 2013-08-20 22:52:43 -04:00 · 2013-08-20 22:52:43 -04:00 · 78c3c8ec2d
commit 78c3c8ec2d
parent e99e2b18e0
5 changed files with 142 additions and 17 deletions
--- a/core/engine.py
+++ b/core/engine.py
@ -44,10 +44,10 @@ def unpack_fields(fields):
    return result

 def compare(first, second, flags=()):
-    """Returns the % of words that match between first and second
+    """Returns the % of words that match between ``first`` and ``second``
    
-    The result is a int in the range 0..100.
-    First and second can be either a string or a list.
+    The result is an ``int`` in the range 0..100.
+    ``first`` and ``second`` can be either a string or a list (of words).
    """
    if not (first and second):
        return 0
@ -76,9 +76,10 @@ def compare(first, second, flags=()):
    return result

 def compare_fields(first, second, flags=()):
-    """Returns the score for the lowest matching fields.
+    """Returns the score for the lowest matching :ref:`fields`.
    
-    first and second must be lists of lists of string.
+    ``first`` and ``second`` must be lists of lists of strings. Each sub-list is then compared with
+    :func:`compare`.
    """
    if len(first) != len(second):
        return 0
@ -98,13 +99,14 @@ def compare_fields(first, second, flags=()):
            if matched_field:
                second.remove(matched_field)
    else:
-        results = [compare(word1, word2, flags) for word1, word2 in zip(first, second)]
+        results = [compare(field1, field2, flags) for field1, field2 in zip(first, second)]
    return min(results) if results else 0

 def build_word_dict(objects, j=job.nulljob):
    """Returns a dict of objects mapped by their words.
    
-    objects must have a 'words' attribute being a list of strings or a list of lists of strings.
+    objects must have a ``words`` attribute being a list of strings or a list of lists of strings
+    (:ref:`fields`).
    
    The result will be a dict with words as keys, lists of objects as values.
    """
@ -115,7 +117,11 @@ def build_word_dict(objects, j=job.nulljob):
    return result

 def merge_similar_words(word_dict):
-    """Take all keys in word_dict that are similar, and merge them together.
+    """Take all keys in ``word_dict`` that are similar, and merge them together.
+    
+    ``word_dict`` has been built with :func:`build_word_dict`. Similarity is computed with Python's
+    ``difflib.get_close_matches()``, which ranks words by their ``SequenceMatcher`` similarity
+    ratio (not by a count of edits).
    """
    keys = list(word_dict.keys())
    keys.sort(key=len)# we want the shortest word to stay
@ -131,7 +137,9 @@ def merge_similar_words(word_dict):
            keys.remove(similar)

 def reduce_common_words(word_dict, threshold):
-    """Remove all objects from word_dict values where the object count >= threshold
+    """Remove all objects from ``word_dict`` values where the object count >= ``threshold``
+    
+    ``word_dict`` has been built with :func:`build_word_dict`.
    
    The exception to this removal are the objects where all the words of the object are common.
    Because if we remove them, we will miss some duplicates!
@ -150,13 +158,42 @@ def reduce_common_words(word_dict, threshold):
            del word_dict[word]

 Match = namedtuple('Match', 'first second percentage')
+Match.__doc__ = """Represents a match between two :class:`~core.fs.File`.
+
+Regardless of the matching method, when two files are determined to match, a Match pair is created,
+which holds, of course, the two matched files, but also their match "level".
+
+.. attribute:: first
+
+    first file of the pair.
+
+.. attribute:: second
+
+    second file of the pair.
+
+.. attribute:: percentage
+
+    their match level according to the scan method which found the match. int from 1 to 100. For
+    exact scan methods, such as Contents scans, this will always be 100.
+"""
+
 def get_match(first, second, flags=()):
    #it is assumed here that first and second both have a "words" attribute
    percentage = compare(first.words, second.words, flags)
    return Match(first, second, percentage)

-def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False, 
-    no_field_order=False, j=job.nulljob):
+def getmatches(
+        objects, min_match_percentage=0, match_similar_words=False, weight_words=False, 
+        no_field_order=False, j=job.nulljob):
+    """Returns a list of :class:`Match` within ``objects`` after fuzzily matching their words.
+    
+    :param objects: List of :class:`~core.fs.File` to match.
+    :param int min_match_percentage: minimum % of words that have to match.
+    :param bool match_similar_words: make similar words (see :func:`merge_similar_words`) match.
+    :param bool weight_words: longer words are worth more in match % computations.
+    :param bool no_field_order: match :ref:`fields` regardless of their order.
+    :param j: A :ref:`job progress instance <jobs>`.
+    """
    COMMON_WORD_THRESHOLD = 50
    LIMIT = 5000000
    j = j.start_subjob(2)
@ -203,6 +240,14 @@ def getmatches(objects, min_match_percentage=0, match_similar_words=False, weigh
    return result

 def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
+    """Returns a list of :class:`Match` within ``files`` if their contents is the same.
+    
+    :param str sizeattr: attribute name of the :class:`~core.fs.File` that returns the size of the
+                         file to use for comparison.
+    :param bool partial: if true, will use the "md5partial" attribute instead of "md5" to compute
+                         contents hash.
+    :param j: A :ref:`job progress instance <jobs>`.
+    """
    j = j.start_subjob([2, 8])
    size2files = defaultdict(set)
    for file in j.iter_with_progress(files, tr("Read size of %d/%d files")):
@ -240,6 +285,15 @@ class Group:
    .. attribute:: unordered
    
        Set duplicates in the group (including the :attr:`ref`).
+    
+    .. attribute:: dupes
+    
+        An ordered list of the group's duplicates, without :attr:`ref`. Equivalent to
+        ``ordered[1:]``.
+    
+    .. attribute:: percentage
+    
+        Average match percentage of match pairs containing :attr:`ref`.
    """
    #---Override
    def __init__(self):
@ -362,6 +416,8 @@ class Group:
            pass
    
    def switch_ref(self, with_dupe):
+        """Make the :attr:`ref` dupe of the group switch position with ``with_dupe``.
+        """
        if self.ref.is_ref:
            return False
        try:
@ -392,6 +448,10 @@ class Group:
    

 def get_groups(matches, j=job.nulljob):
+    """Returns a list of :class:`Group` from ``matches``.
+    
+    Create groups out of match pairs in the smartest way possible.
+    """
    matches.sort(key=lambda match: -match.percentage)
    dupe2group = {}
    groups = []
--- a/core/results.py
+++ b/core/results.py
@ -195,6 +195,8 @@ class Results(Markable):
        self.__dupes = None
    
    def get_group_of_duplicate(self, dupe):
+        """Returns :class:`~core.engine.Group` in which ``dupe`` belongs.
+        """
        try:
            return self.__group_of_duplicate[dupe]
        except (TypeError, KeyError):
@ -203,6 +205,12 @@ class Results(Markable):
    is_markable = _is_markable
    
    def load_from_xml(self, infile, get_file, j=nulljob):
+        """Load results from ``infile``.
+        
+        :param infile: a file or path pointing to an XML file created with :meth:`save_to_xml`.
+        :param get_file: a function f(path) returning a :class:`~core.fs.File` wrapping the path.
+        :param j: A :ref:`job progress instance <jobs>`.
+        """
        def do_match(ref_file, other_files, group):
            if not other_files:
                return
@ -255,6 +263,8 @@ class Results(Markable):
        self.is_modified = False
    
    def make_ref(self, dupe):
+        """Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group.
+        """
        g = self.get_group_of_duplicate(dupe)
        r = g.ref
        if not g.switch_ref(dupe):
@ -271,8 +281,14 @@ class Results(Markable):
        return True
    
    def perform_on_marked(self, func, remove_from_results):
-        # Performs `func` on all marked dupes. If an EnvironmentError is raised during the call,
-        # the problematic dupe is added to self.problems.
+        """Performs ``func`` on all marked dupes.
+        
+        If an ``EnvironmentError`` is raised during the call, the problematic dupe is added to
+        self.problems.
+        
+        :param bool remove_from_results: If true, dupes which had ``func`` applied and didn't cause
+                                         any problem are removed from the results.
+        """
        self.problems = []
        to_remove = []
        marked = (dupe for dupe in self.dupes if self.is_marked(dupe))
@ -317,9 +333,12 @@ class Results(Markable):
        self.is_modified = bool(self.__groups)
    
    def save_to_xml(self, outfile):
+        """Save results to ``outfile`` in XML.
+        
+        :param outfile: file object or path.
+        """
        self.apply_filter(None)
        root = ET.Element('results')
-        # writer = XMLGenerator(outfile, 'utf-8')
        for g in self.groups:
            group_elem = ET.SubElement(root, 'group')
            dupe2index = {}
@ -364,13 +383,26 @@ class Results(Markable):
        self.is_modified = False
    
    def sort_dupes(self, key, asc=True, delta=False):
+        """Sort :attr:`dupes` according to ``key``.
+        
+        :param str key: key attribute name to sort with.
+        :param bool asc: If false, sorting is reversed.
+        :param bool delta: If true, sorting occurs using :ref:`delta values <deltavalues>`.
+        """
        if not self.__dupes:
            self.__get_dupe_list()
        keyfunc = lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta)
        self.__dupes.sort(key=keyfunc, reverse=not asc)
        self.__dupes_sort_descriptor = (key,asc,delta)
    
-    def sort_groups(self,key,asc=True):
+    def sort_groups(self, key, asc=True):
+        """Sort :attr:`groups` according to ``key``.
+        
+        The :attr:`~core.engine.Group.ref` of each group is used to extract values for sorting.
+        
+        :param str key: key attribute name to sort with.
+        :param bool asc: If false, sorting is reversed.
+        """
        keyfunc = lambda g: self.app._get_group_sort_key(g, key)
        self.groups.sort(key=keyfunc, reverse=not asc)
        self.__groups_sort_descriptor = (key,asc)
--- a/help/en/developer/core/engine.rst
+++ b/help/en/developer/core/engine.rst
@ -2,6 +2,35 @@ core.engine
 ===========

 .. automodule:: core.engine
+    
+    .. autoclass:: Match
+    
+    .. autoclass:: Group
+        :members:
+    
+    .. autofunction:: build_word_dict
+    .. autofunction:: compare
+    .. autofunction:: compare_fields
+    .. autofunction:: getmatches
+    .. autofunction:: getmatches_by_contents
+    .. autofunction:: get_groups
+    .. autofunction:: merge_similar_words
+    .. autofunction:: reduce_common_words
+    
+.. _fields:

-.. autoclass:: core.engine.Group
-    :members:
+Fields
+------
+
+Fields are groups of words which each represent a significant part of the whole name. This concept
+is significant in music file names, where we often have names like "My Artist - a very long title
+with many many words".
+
+This title has 10 words. If you run a scan with a bit of tolerance, let's say 90%, you'll be able
+to find a dupe that has only one "many" in the song title. However, you would also get false
+duplicates from a title like "My Giraffe - a very long title with many many words", which is of
+course a very different song and it doesn't make sense to match them.
+
+When matching by fields, each field (separated by "-") is considered as a separate string to match
+independently. After all fields are matched, the lowest result is kept. In the "Giraffe" example we
+gave, the result would be 50% instead of 90% in normal mode.
--- a/help/en/developer/index.rst
+++ b/help/en/developer/index.rst
@ -23,6 +23,8 @@ codebase. For example, when performing "Remove Selected From Results",
 ``base.app.DupeGuru.remove_duplicates()`` on the PyQt side, are respectively called to perform the
 thing. All of this is quite ugly, I know (see the "Refactoring" section below).

+.. _jobs:
+
 Jobs
 ----

--- a/help/en/results.rst
+++ b/help/en/results.rst
@ -45,6 +45,8 @@ The dupeGuru results, when in normal mode, are sorted according to duplicate gro
 * Hold Shift and click on it.
 * Press Space to mark all selected duplicates.

+.. _deltavalues:
+
 Delta Values
 ------------