mirror of
https://github.com/arsenetar/dupeguru.git
synced 2025-03-10 05:34:36 +00:00
Improved dev docs
This commit is contained in:
parent
e99e2b18e0
commit
78c3c8ec2d
@ -44,10 +44,10 @@ def unpack_fields(fields):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def compare(first, second, flags=()):
|
def compare(first, second, flags=()):
|
||||||
"""Returns the % of words that match between first and second
|
"""Returns the % of words that match between ``first`` and ``second``
|
||||||
|
|
||||||
The result is a int in the range 0..100.
|
The result is an ``int`` in the range 0..100.
|
||||||
First and second can be either a string or a list.
|
``first`` and ``second`` can be either a string or a list (of words).
|
||||||
"""
|
"""
|
||||||
if not (first and second):
|
if not (first and second):
|
||||||
return 0
|
return 0
|
||||||
@ -76,9 +76,10 @@ def compare(first, second, flags=()):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def compare_fields(first, second, flags=()):
|
def compare_fields(first, second, flags=()):
|
||||||
"""Returns the score for the lowest matching fields.
|
"""Returns the score for the lowest matching :ref:`fields`.
|
||||||
|
|
||||||
first and second must be lists of lists of string.
|
``first`` and ``second`` must be lists of lists of string. Each sub-list is then compared with
|
||||||
|
:func:`compare`.
|
||||||
"""
|
"""
|
||||||
if len(first) != len(second):
|
if len(first) != len(second):
|
||||||
return 0
|
return 0
|
||||||
@ -98,13 +99,14 @@ def compare_fields(first, second, flags=()):
|
|||||||
if matched_field:
|
if matched_field:
|
||||||
second.remove(matched_field)
|
second.remove(matched_field)
|
||||||
else:
|
else:
|
||||||
results = [compare(word1, word2, flags) for word1, word2 in zip(first, second)]
|
results = [compare(field1, field2, flags) for field1, field2 in zip(first, second)]
|
||||||
return min(results) if results else 0
|
return min(results) if results else 0
|
||||||
|
|
||||||
def build_word_dict(objects, j=job.nulljob):
|
def build_word_dict(objects, j=job.nulljob):
|
||||||
"""Returns a dict of objects mapped by their words.
|
"""Returns a dict of objects mapped by their words.
|
||||||
|
|
||||||
objects must have a 'words' attribute being a list of strings or a list of lists of strings.
|
objects must have a ``words`` attribute being a list of strings or a list of lists of strings
|
||||||
|
(:ref:`fields`).
|
||||||
|
|
||||||
The result will be a dict with words as keys, lists of objects as values.
|
The result will be a dict with words as keys, lists of objects as values.
|
||||||
"""
|
"""
|
||||||
@ -115,7 +117,11 @@ def build_word_dict(objects, j=job.nulljob):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def merge_similar_words(word_dict):
|
def merge_similar_words(word_dict):
|
||||||
"""Take all keys in word_dict that are similar, and merge them together.
|
"""Take all keys in ``word_dict`` that are similar, and merge them together.
|
||||||
|
|
||||||
|
``word_dict`` has been built with :func:`build_word_dict`. Similarity is computed with Python's
|
||||||
|
``difflib.get_close_matches()``, which computes the number of edits that are necessary to make
|
||||||
|
a word equal to the other.
|
||||||
"""
|
"""
|
||||||
keys = list(word_dict.keys())
|
keys = list(word_dict.keys())
|
||||||
keys.sort(key=len)# we want the shortest word to stay
|
keys.sort(key=len)# we want the shortest word to stay
|
||||||
@ -131,7 +137,9 @@ def merge_similar_words(word_dict):
|
|||||||
keys.remove(similar)
|
keys.remove(similar)
|
||||||
|
|
||||||
def reduce_common_words(word_dict, threshold):
|
def reduce_common_words(word_dict, threshold):
|
||||||
"""Remove all objects from word_dict values where the object count >= threshold
|
"""Remove all objects from ``word_dict`` values where the object count >= ``threshold``
|
||||||
|
|
||||||
|
``word_dict`` has been built with :func:`build_word_dict`.
|
||||||
|
|
||||||
The exception to this removal are the objects where all the words of the object are common.
|
The exception to this removal are the objects where all the words of the object are common.
|
||||||
Because if we remove them, we will miss some duplicates!
|
Because if we remove them, we will miss some duplicates!
|
||||||
@ -150,13 +158,42 @@ def reduce_common_words(word_dict, threshold):
|
|||||||
del word_dict[word]
|
del word_dict[word]
|
||||||
|
|
||||||
Match = namedtuple('Match', 'first second percentage')
|
Match = namedtuple('Match', 'first second percentage')
|
||||||
|
Match.__doc__ = """Represents a match between two :class:`~core.fs.File`.
|
||||||
|
|
||||||
|
Regardless of the matching method, when two files are determined to match, a Match pair is created,
|
||||||
|
which holds, of course, the two matched files, but also their match "level".
|
||||||
|
|
||||||
|
.. attribute:: first
|
||||||
|
|
||||||
|
first file of the pair.
|
||||||
|
|
||||||
|
.. attribute:: second
|
||||||
|
|
||||||
|
second file of the pair.
|
||||||
|
|
||||||
|
.. attribute:: percentage
|
||||||
|
|
||||||
|
their match level according to the scan method which found the match. int from 1 to 100. For
|
||||||
|
exact scan methods, such as Contents scans, this will always be 100.
|
||||||
|
"""
|
||||||
|
|
||||||
def get_match(first, second, flags=()):
|
def get_match(first, second, flags=()):
|
||||||
#it is assumed here that first and second both have a "words" attribute
|
#it is assumed here that first and second both have a "words" attribute
|
||||||
percentage = compare(first.words, second.words, flags)
|
percentage = compare(first.words, second.words, flags)
|
||||||
return Match(first, second, percentage)
|
return Match(first, second, percentage)
|
||||||
|
|
||||||
def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
|
def getmatches(
|
||||||
|
objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
|
||||||
no_field_order=False, j=job.nulljob):
|
no_field_order=False, j=job.nulljob):
|
||||||
|
"""Returns a list of :class:`Match` within ``objects`` after fuzzily matching their words.
|
||||||
|
|
||||||
|
:param objects: List of :class:`~core.fs.File` to match.
|
||||||
|
:param int min_match_percentage: minimum % of words that have to match.
|
||||||
|
:param bool match_similar_words: make similar words (see :func:`merge_similar_words`) match.
|
||||||
|
:param bool weight_words: longer words are worth more in match % computations.
|
||||||
|
:param bool no_field_order: match :ref:`fields` regardless of their order.
|
||||||
|
:param j: A :ref:`job progress instance <jobs>`.
|
||||||
|
"""
|
||||||
COMMON_WORD_THRESHOLD = 50
|
COMMON_WORD_THRESHOLD = 50
|
||||||
LIMIT = 5000000
|
LIMIT = 5000000
|
||||||
j = j.start_subjob(2)
|
j = j.start_subjob(2)
|
||||||
@ -203,6 +240,14 @@ def getmatches(objects, min_match_percentage=0, match_similar_words=False, weigh
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
|
def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
|
||||||
|
"""Returns a list of :class:`Match` within ``files`` if their contents are the same.
|
||||||
|
|
||||||
|
:param str sizeattr: attribute name of the :class:`~core.fs.File` that returns the size of the
|
||||||
|
file to use for comparison.
|
||||||
|
:param bool partial: if true, will use the "md5partial" attribute instead of "md5" to compute
|
||||||
|
contents hash.
|
||||||
|
:param j: A :ref:`job progress instance <jobs>`.
|
||||||
|
"""
|
||||||
j = j.start_subjob([2, 8])
|
j = j.start_subjob([2, 8])
|
||||||
size2files = defaultdict(set)
|
size2files = defaultdict(set)
|
||||||
for file in j.iter_with_progress(files, tr("Read size of %d/%d files")):
|
for file in j.iter_with_progress(files, tr("Read size of %d/%d files")):
|
||||||
@ -240,6 +285,15 @@ class Group:
|
|||||||
.. attribute:: unordered
|
.. attribute:: unordered
|
||||||
|
|
||||||
Set duplicates in the group (including the :attr:`ref`).
|
Set duplicates in the group (including the :attr:`ref`).
|
||||||
|
|
||||||
|
.. attribute:: dupes
|
||||||
|
|
||||||
|
An ordered list of the group's duplicates, without :attr:`ref`. Equivalent to
|
||||||
|
``ordered[1:]``
|
||||||
|
|
||||||
|
.. attribute:: percentage
|
||||||
|
|
||||||
|
Average match percentage of match pairs containing :attr:`ref`.
|
||||||
"""
|
"""
|
||||||
#---Override
|
#---Override
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -362,6 +416,8 @@ class Group:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def switch_ref(self, with_dupe):
|
def switch_ref(self, with_dupe):
|
||||||
|
"""Make the :attr:`ref` dupe of the group switch position with ``with_dupe``.
|
||||||
|
"""
|
||||||
if self.ref.is_ref:
|
if self.ref.is_ref:
|
||||||
return False
|
return False
|
||||||
try:
|
try:
|
||||||
@ -392,6 +448,10 @@ class Group:
|
|||||||
|
|
||||||
|
|
||||||
def get_groups(matches, j=job.nulljob):
|
def get_groups(matches, j=job.nulljob):
|
||||||
|
"""Returns a list of :class:`Group` from ``matches``.
|
||||||
|
|
||||||
|
Create groups out of match pairs in the smartest way possible.
|
||||||
|
"""
|
||||||
matches.sort(key=lambda match: -match.percentage)
|
matches.sort(key=lambda match: -match.percentage)
|
||||||
dupe2group = {}
|
dupe2group = {}
|
||||||
groups = []
|
groups = []
|
||||||
|
@ -195,6 +195,8 @@ class Results(Markable):
|
|||||||
self.__dupes = None
|
self.__dupes = None
|
||||||
|
|
||||||
def get_group_of_duplicate(self, dupe):
|
def get_group_of_duplicate(self, dupe):
|
||||||
|
"""Returns :class:`~core.engine.Group` in which ``dupe`` belongs.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
return self.__group_of_duplicate[dupe]
|
return self.__group_of_duplicate[dupe]
|
||||||
except (TypeError, KeyError):
|
except (TypeError, KeyError):
|
||||||
@ -203,6 +205,12 @@ class Results(Markable):
|
|||||||
is_markable = _is_markable
|
is_markable = _is_markable
|
||||||
|
|
||||||
def load_from_xml(self, infile, get_file, j=nulljob):
|
def load_from_xml(self, infile, get_file, j=nulljob):
|
||||||
|
"""Load results from ``infile``.
|
||||||
|
|
||||||
|
:param infile: a file or path pointing to an XML file created with :meth:`save_to_xml`.
|
||||||
|
:param get_file: a function f(path) returning a :class:`~core.fs.File` wrapping the path.
|
||||||
|
:param j: A :ref:`job progress instance <jobs>`.
|
||||||
|
"""
|
||||||
def do_match(ref_file, other_files, group):
|
def do_match(ref_file, other_files, group):
|
||||||
if not other_files:
|
if not other_files:
|
||||||
return
|
return
|
||||||
@ -255,6 +263,8 @@ class Results(Markable):
|
|||||||
self.is_modified = False
|
self.is_modified = False
|
||||||
|
|
||||||
def make_ref(self, dupe):
|
def make_ref(self, dupe):
|
||||||
|
"""Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group.
|
||||||
|
"""
|
||||||
g = self.get_group_of_duplicate(dupe)
|
g = self.get_group_of_duplicate(dupe)
|
||||||
r = g.ref
|
r = g.ref
|
||||||
if not g.switch_ref(dupe):
|
if not g.switch_ref(dupe):
|
||||||
@ -271,8 +281,14 @@ class Results(Markable):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
def perform_on_marked(self, func, remove_from_results):
|
def perform_on_marked(self, func, remove_from_results):
|
||||||
# Performs `func` on all marked dupes. If an EnvironmentError is raised during the call,
|
"""Performs ``func`` on all marked dupes.
|
||||||
# the problematic dupe is added to self.problems.
|
|
||||||
|
If an ``EnvironmentError`` is raised during the call, the problematic dupe is added to
|
||||||
|
self.problems.
|
||||||
|
|
||||||
|
:param bool remove_from_results: If true, dupes which had ``func`` applied and didn't cause
|
||||||
|
any problem are removed from the results.
|
||||||
|
"""
|
||||||
self.problems = []
|
self.problems = []
|
||||||
to_remove = []
|
to_remove = []
|
||||||
marked = (dupe for dupe in self.dupes if self.is_marked(dupe))
|
marked = (dupe for dupe in self.dupes if self.is_marked(dupe))
|
||||||
@ -317,9 +333,12 @@ class Results(Markable):
|
|||||||
self.is_modified = bool(self.__groups)
|
self.is_modified = bool(self.__groups)
|
||||||
|
|
||||||
def save_to_xml(self, outfile):
|
def save_to_xml(self, outfile):
|
||||||
|
"""Save results to ``outfile`` in XML.
|
||||||
|
|
||||||
|
:param outfile: file object or path.
|
||||||
|
"""
|
||||||
self.apply_filter(None)
|
self.apply_filter(None)
|
||||||
root = ET.Element('results')
|
root = ET.Element('results')
|
||||||
# writer = XMLGenerator(outfile, 'utf-8')
|
|
||||||
for g in self.groups:
|
for g in self.groups:
|
||||||
group_elem = ET.SubElement(root, 'group')
|
group_elem = ET.SubElement(root, 'group')
|
||||||
dupe2index = {}
|
dupe2index = {}
|
||||||
@ -364,13 +383,26 @@ class Results(Markable):
|
|||||||
self.is_modified = False
|
self.is_modified = False
|
||||||
|
|
||||||
def sort_dupes(self, key, asc=True, delta=False):
|
def sort_dupes(self, key, asc=True, delta=False):
|
||||||
|
"""Sort :attr:`dupes` according to ``key``.
|
||||||
|
|
||||||
|
:param str key: key attribute name to sort with.
|
||||||
|
:param bool asc: If false, sorting is reversed.
|
||||||
|
:param bool delta: If true, sorting occurs using :ref:`delta values <deltavalues>`.
|
||||||
|
"""
|
||||||
if not self.__dupes:
|
if not self.__dupes:
|
||||||
self.__get_dupe_list()
|
self.__get_dupe_list()
|
||||||
keyfunc = lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta)
|
keyfunc = lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta)
|
||||||
self.__dupes.sort(key=keyfunc, reverse=not asc)
|
self.__dupes.sort(key=keyfunc, reverse=not asc)
|
||||||
self.__dupes_sort_descriptor = (key,asc,delta)
|
self.__dupes_sort_descriptor = (key,asc,delta)
|
||||||
|
|
||||||
def sort_groups(self,key,asc=True):
|
def sort_groups(self, key, asc=True):
|
||||||
|
"""Sort :attr:`groups` according to ``key``.
|
||||||
|
|
||||||
|
The :attr:`~core.engine.Group.ref` of each group is used to extract values for sorting.
|
||||||
|
|
||||||
|
:param str key: key attribute name to sort with.
|
||||||
|
:param bool asc: If false, sorting is reversed.
|
||||||
|
"""
|
||||||
keyfunc = lambda g: self.app._get_group_sort_key(g, key)
|
keyfunc = lambda g: self.app._get_group_sort_key(g, key)
|
||||||
self.groups.sort(key=keyfunc, reverse=not asc)
|
self.groups.sort(key=keyfunc, reverse=not asc)
|
||||||
self.__groups_sort_descriptor = (key,asc)
|
self.__groups_sort_descriptor = (key,asc)
|
||||||
|
@ -3,5 +3,34 @@ core.engine
|
|||||||
|
|
||||||
.. automodule:: core.engine
|
.. automodule:: core.engine
|
||||||
|
|
||||||
.. autoclass:: core.engine.Group
|
.. autoclass:: Match
|
||||||
|
|
||||||
|
.. autoclass:: Group
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
.. autofunction:: build_word_dict
|
||||||
|
.. autofunction:: compare
|
||||||
|
.. autofunction:: compare_fields
|
||||||
|
.. autofunction:: getmatches
|
||||||
|
.. autofunction:: getmatches_by_contents
|
||||||
|
.. autofunction:: get_groups
|
||||||
|
.. autofunction:: merge_similar_words
|
||||||
|
.. autofunction:: reduce_common_words
|
||||||
|
|
||||||
|
.. _fields:
|
||||||
|
|
||||||
|
Fields
|
||||||
|
------
|
||||||
|
|
||||||
|
Fields are groups of words which each represent a significant part of the whole name. This concept
|
||||||
|
is significant in music file names, where we often have names like "My Artist - a very long title
|
||||||
|
with many many words".
|
||||||
|
|
||||||
|
This title has 10 words. If you run a scan with a bit of tolerance, let's say 90%, you'll be able
|
||||||
|
to find a dupe that has only one "many" in the song title. However, you would also get false
|
||||||
|
duplicates from a title like "My Giraffe - a very long title with many many words", which is of
|
||||||
|
course a very different song and it doesn't make sense to match them.
|
||||||
|
|
||||||
|
When matching by fields, each field (separated by "-") is considered as a separate string to match
|
||||||
|
independently. After all fields are matched, the lowest result is kept. In the "Giraffe" example we
|
||||||
|
gave, the result would be 50% instead of 90% in normal mode.
|
||||||
|
@ -23,6 +23,8 @@ codebase. For example, when performing "Remove Selected From Results",
|
|||||||
``base.app.DupeGuru.remove_duplicates()`` on the PyQt side, are respectively called to perform the
|
``base.app.DupeGuru.remove_duplicates()`` on the PyQt side, are respectively called to perform the
|
||||||
thing. All of this is quite ugly, I know (see the "Refactoring" section below).
|
thing. All of this is quite ugly, I know (see the "Refactoring" section below).
|
||||||
|
|
||||||
|
.. _jobs:
|
||||||
|
|
||||||
Jobs
|
Jobs
|
||||||
----
|
----
|
||||||
|
|
||||||
|
@ -45,6 +45,8 @@ The dupeGuru results, when in normal mode, are sorted according to duplicate gro
|
|||||||
* Hold Shift and click on it.
|
* Hold Shift and click on it.
|
||||||
* Press Space to mark all selected duplicates.
|
* Press Space to mark all selected duplicates.
|
||||||
|
|
||||||
|
.. _deltavalues:
|
||||||
|
|
||||||
Delta Values
|
Delta Values
|
||||||
------------
|
------------
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user