1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2025-09-11 17:58:17 +00:00

Improved dev docs

This commit is contained in:
Virgil Dupras 2013-08-20 22:52:43 -04:00
parent e99e2b18e0
commit 78c3c8ec2d
5 changed files with 142 additions and 17 deletions

View File

@ -44,10 +44,10 @@ def unpack_fields(fields):
return result return result
def compare(first, second, flags=()): def compare(first, second, flags=()):
"""Returns the % of words that match between first and second """Returns the % of words that match between ``first`` and ``second``
The result is a int in the range 0..100. The result is a ``int`` in the range 0..100.
First and second can be either a string or a list. ``first`` and ``second`` can be either a string or a list (of words).
""" """
if not (first and second): if not (first and second):
return 0 return 0
@ -76,9 +76,10 @@ def compare(first, second, flags=()):
return result return result
def compare_fields(first, second, flags=()): def compare_fields(first, second, flags=()):
"""Returns the score for the lowest matching fields. """Returns the score for the lowest matching :ref:`fields`.
first and second must be lists of lists of string. ``first`` and ``second`` must be lists of lists of string. Each sub-list is then compared with
:func:`compare`.
""" """
if len(first) != len(second): if len(first) != len(second):
return 0 return 0
@ -98,13 +99,14 @@ def compare_fields(first, second, flags=()):
if matched_field: if matched_field:
second.remove(matched_field) second.remove(matched_field)
else: else:
results = [compare(word1, word2, flags) for word1, word2 in zip(first, second)] results = [compare(field1, field2, flags) for field1, field2 in zip(first, second)]
return min(results) if results else 0 return min(results) if results else 0
def build_word_dict(objects, j=job.nulljob): def build_word_dict(objects, j=job.nulljob):
"""Returns a dict of objects mapped by their words. """Returns a dict of objects mapped by their words.
objects must have a 'words' attribute being a list of strings or a list of lists of strings. objects must have a ``words`` attribute being a list of strings or a list of lists of strings
(:ref:`fields`).
The result will be a dict with words as keys, lists of objects as values. The result will be a dict with words as keys, lists of objects as values.
""" """
@ -115,7 +117,11 @@ def build_word_dict(objects, j=job.nulljob):
return result return result
def merge_similar_words(word_dict): def merge_similar_words(word_dict):
"""Take all keys in word_dict that are similar, and merge them together. """Take all keys in ``word_dict`` that are similar, and merge them together.
``word_dict`` has been built with :func:`build_word_dict`. Similarity is computed with Python's
``difflib.get_close_matches()``, which computes the number of edits that are necessary to make
a word equal to the other.
""" """
keys = list(word_dict.keys()) keys = list(word_dict.keys())
keys.sort(key=len)# we want the shortest word to stay keys.sort(key=len)# we want the shortest word to stay
@ -131,7 +137,9 @@ def merge_similar_words(word_dict):
keys.remove(similar) keys.remove(similar)
def reduce_common_words(word_dict, threshold): def reduce_common_words(word_dict, threshold):
"""Remove all objects from word_dict values where the object count >= threshold """Remove all objects from ``word_dict`` values where the object count >= ``threshold``
``word_dict`` has been built with :func:`build_word_dict`.
The exception to this removal are the objects where all the words of the object are common. The exception to this removal are the objects where all the words of the object are common.
Because if we remove them, we will miss some duplicates! Because if we remove them, we will miss some duplicates!
@ -150,13 +158,42 @@ def reduce_common_words(word_dict, threshold):
del word_dict[word] del word_dict[word]
Match = namedtuple('Match', 'first second percentage') Match = namedtuple('Match', 'first second percentage')
Match.__doc__ = """Represents a match between two :class:`~core.fs.File`.
Regarless of the matching method, when two files are determined to match, a Match pair is created,
which holds, of course, the two matched files, but also their match "level".
.. attribute:: first
first file of the pair.
.. attribute:: second
second file of the pair.
.. attribute:: percentage
their match level according to the scan method which found the match. int from 1 to 100. For
exact scan methods, such as Contents scans, this will always be 100.
"""
def get_match(first, second, flags=()): def get_match(first, second, flags=()):
#it is assumed here that first and second both have a "words" attribute #it is assumed here that first and second both have a "words" attribute
percentage = compare(first.words, second.words, flags) percentage = compare(first.words, second.words, flags)
return Match(first, second, percentage) return Match(first, second, percentage)
def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False, def getmatches(
objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
no_field_order=False, j=job.nulljob): no_field_order=False, j=job.nulljob):
"""Returns a list of :class:`Match` within ``objects`` after fuzzily matching their words.
:param objects: List of :class:`~core.fs.File` to match.
:param int min_match_percentage: minimum % of words that have to match.
:param bool match_similar_words: make similar words (see :func:`merge_similar_words`) match.
:param bool weight_words: longer words are worth more in match % computations.
:param bool no_field_order: match :ref:`fields` regardless of their order.
:param j: A :ref:`job progress instance <jobs>`.
"""
COMMON_WORD_THRESHOLD = 50 COMMON_WORD_THRESHOLD = 50
LIMIT = 5000000 LIMIT = 5000000
j = j.start_subjob(2) j = j.start_subjob(2)
@ -203,6 +240,14 @@ def getmatches(objects, min_match_percentage=0, match_similar_words=False, weigh
return result return result
def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob): def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
"""Returns a list of :class:`Match` within ``files`` if their contents is the same.
:param str sizeattr: attibute name of the :class:`~core.fs.file` that returns the size of the
file to use for comparison.
:param bool partial: if true, will use the "md5partial" attribute instead of "md5" to compute
contents hash.
:param j: A :ref:`job progress instance <jobs>`.
"""
j = j.start_subjob([2, 8]) j = j.start_subjob([2, 8])
size2files = defaultdict(set) size2files = defaultdict(set)
for file in j.iter_with_progress(files, tr("Read size of %d/%d files")): for file in j.iter_with_progress(files, tr("Read size of %d/%d files")):
@ -240,6 +285,15 @@ class Group:
.. attribute:: unordered .. attribute:: unordered
Set duplicates in the group (including the :attr:`ref`). Set duplicates in the group (including the :attr:`ref`).
.. attribute:: dupes
An ordered list of the group's duplicate, without :attr:`ref`. Equivalent to
``ordered[1:]``
.. attribute:: percentage
Average match percentage of match pairs containing :attr:`ref`.
""" """
#---Override #---Override
def __init__(self): def __init__(self):
@ -362,6 +416,8 @@ class Group:
pass pass
def switch_ref(self, with_dupe): def switch_ref(self, with_dupe):
"""Make the :attr:`ref` dupe of the group switch position with ``with_dupe``.
"""
if self.ref.is_ref: if self.ref.is_ref:
return False return False
try: try:
@ -392,6 +448,10 @@ class Group:
def get_groups(matches, j=job.nulljob): def get_groups(matches, j=job.nulljob):
"""Returns a list of :class:`Group` from ``matches``.
Create groups out of match pairs in the smartest way possible.
"""
matches.sort(key=lambda match: -match.percentage) matches.sort(key=lambda match: -match.percentage)
dupe2group = {} dupe2group = {}
groups = [] groups = []

View File

@ -195,6 +195,8 @@ class Results(Markable):
self.__dupes = None self.__dupes = None
def get_group_of_duplicate(self, dupe): def get_group_of_duplicate(self, dupe):
"""Returns :class:`~core.engine.Group` in which ``dupe`` belongs.
"""
try: try:
return self.__group_of_duplicate[dupe] return self.__group_of_duplicate[dupe]
except (TypeError, KeyError): except (TypeError, KeyError):
@ -203,6 +205,12 @@ class Results(Markable):
is_markable = _is_markable is_markable = _is_markable
def load_from_xml(self, infile, get_file, j=nulljob): def load_from_xml(self, infile, get_file, j=nulljob):
"""Load results from ``infile``.
:param infile: a file or path pointing to an XML file created with :meth:`save_to_xml`.
:param get_file: a function f(path) returning a :class:`~core.fs.File` wrapping the path.
:param j: A :ref:`job progress instance <jobs>`.
"""
def do_match(ref_file, other_files, group): def do_match(ref_file, other_files, group):
if not other_files: if not other_files:
return return
@ -255,6 +263,8 @@ class Results(Markable):
self.is_modified = False self.is_modified = False
def make_ref(self, dupe): def make_ref(self, dupe):
"""Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group.
"""
g = self.get_group_of_duplicate(dupe) g = self.get_group_of_duplicate(dupe)
r = g.ref r = g.ref
if not g.switch_ref(dupe): if not g.switch_ref(dupe):
@ -271,8 +281,14 @@ class Results(Markable):
return True return True
def perform_on_marked(self, func, remove_from_results): def perform_on_marked(self, func, remove_from_results):
# Performs `func` on all marked dupes. If an EnvironmentError is raised during the call, """Performs ``func`` on all marked dupes.
# the problematic dupe is added to self.problems.
If an ``EnvironmentError`` is raised during the call, the problematic dupe is added to
self.problems.
:param bool remove_from_results: If true, dupes which had ``func`` applied and didn't cause
any problem.
"""
self.problems = [] self.problems = []
to_remove = [] to_remove = []
marked = (dupe for dupe in self.dupes if self.is_marked(dupe)) marked = (dupe for dupe in self.dupes if self.is_marked(dupe))
@ -317,9 +333,12 @@ class Results(Markable):
self.is_modified = bool(self.__groups) self.is_modified = bool(self.__groups)
def save_to_xml(self, outfile): def save_to_xml(self, outfile):
"""Save results to ``outfile`` in XML.
:param outfile: file object or path.
"""
self.apply_filter(None) self.apply_filter(None)
root = ET.Element('results') root = ET.Element('results')
# writer = XMLGenerator(outfile, 'utf-8')
for g in self.groups: for g in self.groups:
group_elem = ET.SubElement(root, 'group') group_elem = ET.SubElement(root, 'group')
dupe2index = {} dupe2index = {}
@ -364,6 +383,12 @@ class Results(Markable):
self.is_modified = False self.is_modified = False
def sort_dupes(self, key, asc=True, delta=False): def sort_dupes(self, key, asc=True, delta=False):
"""Sort :attr:`dupes` according to ``key``.
:param str key: key attribute name to sort with.
:param bool asc: If false, sorting is reversed.
:param bool delta: If true, sorting occurs using :ref:`delta values <deltavalues>`.
"""
if not self.__dupes: if not self.__dupes:
self.__get_dupe_list() self.__get_dupe_list()
keyfunc = lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta) keyfunc = lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta)
@ -371,6 +396,13 @@ class Results(Markable):
self.__dupes_sort_descriptor = (key,asc,delta) self.__dupes_sort_descriptor = (key,asc,delta)
def sort_groups(self, key, asc=True): def sort_groups(self, key, asc=True):
"""Sort :attr:`groups` according to ``key``.
The :attr:`~core.engine.Group.ref` of each group is used to extract values for sorting.
:param str key: key attribute name to sort with.
:param bool asc: If false, sorting is reversed.
"""
keyfunc = lambda g: self.app._get_group_sort_key(g, key) keyfunc = lambda g: self.app._get_group_sort_key(g, key)
self.groups.sort(key=keyfunc, reverse=not asc) self.groups.sort(key=keyfunc, reverse=not asc)
self.__groups_sort_descriptor = (key,asc) self.__groups_sort_descriptor = (key,asc)

View File

@ -3,5 +3,34 @@ core.engine
.. automodule:: core.engine .. automodule:: core.engine
.. autoclass:: core.engine.Group .. autoclass:: Match
.. autoclass:: Group
:members: :members:
.. autofunction:: build_word_dict
.. autofunction:: compare
.. autofunction:: compare_fields
.. autofunction:: getmatches
.. autofunction:: getmatches_by_contents
.. autofunction:: get_groups
.. autofunction:: merge_similar_words
.. autofunction:: reduce_common_words
.. _fields:
Fields
------
Fields are groups of words which each represent a significant part of the whole name. This concept
is sifnificant in music file names, where we often have names like "My Artist - a very long title
with many many words".
This title has 10 words. If you run as scan with a bit of tolerance, let's say 90%, you'll be able
to find a dupe that has only one "many" in the song title. However, you would also get false
duplicates from a title like "My Giraffe - a very long title with many many words", which is of
course a very different song and it doesn't make sense to match them.
When matching by fields, each field (separated by "-") is considered as a separate string to match
independently. After all fields are matched, the lowest result is kept. In the "Giraffe" example we
gave, the result would be 50% instead of 90% in normal mode.

View File

@ -23,6 +23,8 @@ codebase. For example, when performing "Remove Selected From Results",
``base.app.DupeGuru.remove_duplicates()`` on the PyQt side, are respectively called to perform the ``base.app.DupeGuru.remove_duplicates()`` on the PyQt side, are respectively called to perform the
thing. All of this is quite ugly, I know (see the "Refactoring" section below). thing. All of this is quite ugly, I know (see the "Refactoring" section below).
.. _jobs:
Jobs Jobs
---- ----

View File

@ -45,6 +45,8 @@ The dupeGuru results, when in normal mode, are sorted according to duplicate gro
* Hold Shift and click on it. * Hold Shift and click on it.
* Press Space to mark all selected duplicates. * Press Space to mark all selected duplicates.
.. _deltavalues:
Delta Values Delta Values
------------ ------------