# Created By: Virgil Dupras
# Created On: 2006/01/29
# Copyright 2013 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "BSD" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/bsd_license

import difflib
import itertools
import logging
import string
from collections import defaultdict, namedtuple
from unicodedata import normalize

from hscommon.util import flatten, multi_replace
from hscommon.trans import tr
from jobprogress import job

(WEIGHT_WORDS,
MATCH_SIMILAR_WORDS,
NO_FIELD_ORDER) = range(3)

JOB_REFRESH_RATE = 100

def getwords(s):
    # We decompose the string so that ascii letters with accents can be part of the word.
    s = normalize('NFD', s)
    s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", ' ').lower()
    s = ''.join(c for c in s if c in string.ascii_letters + string.digits + string.whitespace)
    return [_f for _f in s.split(' ') if _f] # remove empty elements

def getfields(s):
    fields = [getwords(field) for field in s.split(' - ')]
    return [_f for _f in fields if _f]

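# Illustrative examples (doctest-style; not part of the original module): getwords() strips
# accents and punctuation before splitting, and getfields() splits on ' - ' first.
#     >>> getwords('Café Niño-2')
#     ['cafe', 'nino', '2']
#     >>> getfields('Artist Name - Song Title')
#     [['artist', 'name'], ['song', 'title']]
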
def unpack_fields(fields):
    result = []
    for field in fields:
        if isinstance(field, list):
            result += field
        else:
            result.append(field)
    return result

def compare(first, second, flags=()):
    """Returns the % of words that match between first and second.

    The result is an int in the range 0..100.
    First and second can be either a string or a list.
    """
    if not (first and second):
        return 0
    if any(isinstance(element, list) for element in first):
        return compare_fields(first, second, flags)
    second = second[:] # We must use a copy of second because we remove items from it
    match_similar = MATCH_SIMILAR_WORDS in flags
    weight_words = WEIGHT_WORDS in flags
    joined = first + second
    total_count = (sum(len(word) for word in joined) if weight_words else len(joined))
    match_count = 0
    in_order = True
    for word in first:
        if match_similar and (word not in second):
            similar = difflib.get_close_matches(word, second, 1, 0.8)
            if similar:
                word = similar[0]
        if word in second:
            if second[0] != word:
                in_order = False
            second.remove(word)
            match_count += (len(word) if weight_words else 1)
    result = round(((match_count * 2) / total_count) * 100)
    if (result == 100) and (not in_order):
        result = 99 # We cannot consider a match exact unless the ordering is the same
    return result

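# Illustrative examples (doctest-style; not part of the original module). WEIGHT_WORDS weighs
# matches by word length, MATCH_SIMILAR_WORDS accepts close difflib matches, and a perfect
# match in the wrong order is capped at 99.
#     >>> compare(['foo', 'bar'], ['bar', 'bleh'])
#     50
#     >>> compare(['foo', 'bar'], ['bar', 'bleh'], (WEIGHT_WORDS,))
#     46
#     >>> compare(['foobar'], ['foobars'], (MATCH_SIMILAR_WORDS,))
#     100
#     >>> compare(['foo', 'bar'], ['bar', 'foo'])
#     99
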
def compare_fields(first, second, flags=()):
    """Returns the score of the lowest-matching field pair.

    first and second must be lists of lists of strings.
    """
    if len(first) != len(second):
        return 0
    if NO_FIELD_ORDER in flags:
        results = []
        # We don't want to remove fields directly from the list. We must work on a copy.
        second = second[:]
        for field1 in first:
            max_score = 0
            matched_field = None
            for field2 in second:
                r = compare(field1, field2, flags)
                if r > max_score:
                    max_score = r
                    matched_field = field2
            results.append(max_score)
            if matched_field:
                second.remove(matched_field)
    else:
        results = [compare(word1, word2, flags) for word1, word2 in zip(first, second)]
    return min(results) if results else 0

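# Illustrative examples (doctest-style; not part of the original module): the score is the
# weakest field, and NO_FIELD_ORDER lets fields match regardless of their position.
#     >>> compare_fields(getfields('Artist - Title'), getfields('Artist - Other'))
#     0
#     >>> compare_fields(getfields('Title - Artist'), getfields('Artist - Title'))
#     0
#     >>> compare_fields(getfields('Title - Artist'), getfields('Artist - Title'), (NO_FIELD_ORDER,))
#     100
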
def build_word_dict(objects, j=job.nulljob):
    """Returns a dict of objects mapped by their words.

    objects must have a 'words' attribute being a list of strings or a list of lists of strings.

    The result will be a dict with words as keys and sets of objects as values.
    """
    result = defaultdict(set)
    for o in j.iter_with_progress(objects, 'Prepared %d/%d files', JOB_REFRESH_RATE):
        for word in unpack_fields(o.words):
            result[word].add(o)
    return result

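# Illustrative sketch (the 'File' class below is a hypothetical stand-in, not dupeGuru's own):
#     >>> class File:
#     ...     def __init__(self, words): self.words = words
#     >>> a, b = File(['foo', 'bar']), File(['bar', 'baz'])
#     >>> d = build_word_dict([a, b])
#     >>> sorted(d.keys())
#     ['bar', 'baz', 'foo']
#     >>> d['bar'] == {a, b}
#     True
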
def merge_similar_words(word_dict):
    """Take all keys in word_dict that are similar, and merge them together.
    """
    keys = list(word_dict.keys())
    keys.sort(key=len) # we want the shortest word to stay
    while keys:
        key = keys.pop(0)
        similars = difflib.get_close_matches(key, keys, 100, 0.8)
        if not similars:
            continue
        objects = word_dict[key]
        for similar in similars:
            objects |= word_dict[similar]
            del word_dict[similar]
            keys.remove(similar)

def reduce_common_words(word_dict, threshold):
    """Remove all objects from word_dict values where the object count >= threshold.

    The exception to this removal is objects whose words are all common, because
    removing them would make us miss some duplicates.
    """
    uncommon_words = set(word for word, objects in word_dict.items() if len(objects) < threshold)
    for word, objects in list(word_dict.items()):
        if len(objects) < threshold:
            continue
        reduced = set()
        for o in objects:
            if not any(w in uncommon_words for w in unpack_fields(o.words)):
                reduced.add(o)
        if reduced:
            word_dict[word] = reduced
        else:
            del word_dict[word]

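# Illustrative note (reusing the hypothetical File class from the build_word_dict sketch above):
# with a threshold of 2, 'copy' is a common word, but b stays under it because all of b's words
# are common -- dropping b entirely would hide its duplicates.
#     >>> a, b, c = File(['annual', 'copy']), File(['copy']), File(['copy', 'photo'])
#     >>> d = build_word_dict([a, b, c])
#     >>> reduce_common_words(d, 2)
#     >>> d['copy'] == {b}
#     True
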
Match = namedtuple('Match', 'first second percentage')
def get_match(first, second, flags=()):
    # it is assumed here that first and second both have a "words" attribute
    percentage = compare(first.words, second.words, flags)
    return Match(first, second, percentage)

def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
        no_field_order=False, j=job.nulljob):
    COMMON_WORD_THRESHOLD = 50
    LIMIT = 5000000
    j = j.start_subjob(2)
    sj = j.start_subjob(2)
    for o in objects:
        if not hasattr(o, 'words'):
            o.words = getwords(o.name)
    word_dict = build_word_dict(objects, sj)
    reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
    if match_similar_words:
        merge_similar_words(word_dict)
    match_flags = []
    if weight_words:
        match_flags.append(WEIGHT_WORDS)
    if match_similar_words:
        match_flags.append(MATCH_SIMILAR_WORDS)
    if no_field_order:
        match_flags.append(NO_FIELD_ORDER)
    j.start_job(len(word_dict), tr("0 matches found"))
    compared = defaultdict(set)
    result = []
    try:
        # This whole 'popping' thing is there to avoid taking too much memory at the same time.
        while word_dict:
            items = word_dict.popitem()[1]
            while items:
                ref = items.pop()
                compared_already = compared[ref]
                to_compare = items - compared_already
                compared_already |= to_compare
                for other in to_compare:
                    m = get_match(ref, other, match_flags)
                    if m.percentage >= min_match_percentage:
                        result.append(m)
                        if len(result) >= LIMIT:
                            return result
            j.add_progress(desc=tr("%d matches found") % len(result))
    except MemoryError:
        # This is the place where the memory usage is at its peak during the scan.
        # Just continue the process with an incomplete list of matches.
        del compared # This should give us enough room to call logging.
        logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
        return result
    return result

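# Illustrative usage sketch (the 'NamedObject' class is hypothetical, not part of dupeGuru):
# anything exposing a 'name' string (or a precomputed 'words' list) can be matched.
#     >>> class NamedObject:
#     ...     def __init__(self, name): self.name = name
#     >>> objects = [NamedObject('foo bar'), NamedObject('bar bleh'), NamedObject('foo bar 1')]
#     >>> matches = getmatches(objects, min_match_percentage=50)
#     >>> len(matches)
#     2
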
def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
    j = j.start_subjob([2, 8])
    size2files = defaultdict(set)
    for file in j.iter_with_progress(files, tr("Read size of %d/%d files")):
        filesize = getattr(file, sizeattr)
        if filesize:
            size2files[filesize].add(file)
    possible_matches = [files for files in size2files.values() if len(files) > 1]
    del size2files
    result = []
    j.start_job(len(possible_matches), tr("0 matches found"))
    for group in possible_matches:
        for first, second in itertools.combinations(group, 2):
            if first.is_ref and second.is_ref:
                continue # Don't spend time comparing two ref pics together.
            if first.md5partial == second.md5partial:
                if partial or first.md5 == second.md5:
                    result.append(Match(first, second, 100))
        j.add_progress(desc=tr("%d matches found") % len(result))
    return result

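# Illustrative sketch (the 'FakeFile' class is hypothetical): files are expected to expose
# 'size' (or the attribute named by sizeattr), 'is_ref', 'md5partial' and 'md5'. Only files of
# equal size are ever hash-compared, and every match produced is an exact 100% match.
#     >>> class FakeFile:
#     ...     is_ref = False
#     ...     def __init__(self, size, md5):
#     ...         self.size, self.md5partial, self.md5 = size, md5, md5
#     >>> files = [FakeFile(10, b'x'), FakeFile(10, b'x'), FakeFile(10, b'y'), FakeFile(3, b'x')]
#     >>> [m.percentage for m in getmatches_by_contents(files)]
#     [100]
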
class Group:
    #---Override
    def __init__(self):
        self._clear()

    def __contains__(self, item):
        return item in self.unordered

    def __getitem__(self, key):
        return self.ordered.__getitem__(key)

    def __iter__(self):
        return iter(self.ordered)

    def __len__(self):
        return len(self.ordered)

    #---Private
    def _clear(self):
        self._percentage = None
        self._matches_for_ref = None
        self.matches = set()
        self.candidates = defaultdict(set)
        self.ordered = []
        self.unordered = set()

    def _get_matches_for_ref(self):
        if self._matches_for_ref is None:
            ref = self.ref
            self._matches_for_ref = [match for match in self.matches if ref in match]
        return self._matches_for_ref

    #---Public
    def add_match(self, match):
        def add_candidate(item, match):
            matches = self.candidates[item]
            matches.add(match)
            if self.unordered <= matches:
                self.ordered.append(item)
                self.unordered.add(item)

        if match in self.matches:
            return
        self.matches.add(match)
        first, second, _ = match
        if first not in self.unordered:
            add_candidate(first, second)
        if second not in self.unordered:
            add_candidate(second, first)
        self._percentage = None
        self._matches_for_ref = None

    def discard_matches(self):
        discarded = set(m for m in self.matches if not all(obj in self.unordered for obj in [m.first, m.second]))
        self.matches -= discarded
        self.candidates = defaultdict(set)
        return discarded

    def get_match_of(self, item):
        if item is self.ref:
            return
        for m in self._get_matches_for_ref():
            if item in m:
                return m

    def prioritize(self, key_func, tie_breaker=None):
        # tie_breaker(ref, dupe) --> True if dupe should be ref
        # Returns True if anything changed during prioritization.
        master_key_func = lambda x: (-x.is_ref, key_func(x))
        new_order = sorted(self.ordered, key=master_key_func)
        changed = new_order != self.ordered
        self.ordered = new_order
        if tie_breaker is None:
            return changed
        ref = self.ref
        key_value = key_func(ref)
        for dupe in self.dupes:
            if key_func(dupe) != key_value:
                break
            if tie_breaker(ref, dupe):
                ref = dupe
        if ref is not self.ref:
            self.switch_ref(ref)
            return True
        return changed

    def remove_dupe(self, item, discard_matches=True):
        try:
            self.ordered.remove(item)
            self.unordered.remove(item)
            self._percentage = None
            self._matches_for_ref = None
            if (len(self) > 1) and any(not getattr(item, 'is_ref', False) for item in self):
                if discard_matches:
                    self.matches = set(m for m in self.matches if item not in m)
            else:
                self._clear()
        except ValueError:
            pass

    def switch_ref(self, with_dupe):
        if self.ref.is_ref:
            return False
        try:
            self.ordered.remove(with_dupe)
            self.ordered.insert(0, with_dupe)
            self._percentage = None
            self._matches_for_ref = None
            return True
        except ValueError:
            return False

    dupes = property(lambda self: self[1:])

    @property
    def percentage(self):
        if self._percentage is None:
            if self.dupes:
                matches = self._get_matches_for_ref()
                self._percentage = sum(match.percentage for match in matches) // len(matches)
            else:
                self._percentage = 0
        return self._percentage

    @property
    def ref(self):
        if self:
            return self[0]

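# Illustrative sketch of how a Group accumulates members (plain strings stand in for files
# here; this is not part of the original module). A candidate only becomes a member once it
# has a match against every existing member, and the group percentage averages the matches
# involving the reference (the first member).
#     >>> g = Group()
#     >>> g.add_match(Match('a', 'b', 100))
#     >>> g.add_match(Match('a', 'c', 80))
#     >>> g.add_match(Match('b', 'c', 80))
#     >>> g.ref, list(g.dupes), g.percentage
#     ('a', ['b', 'c'], 90)
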
def get_groups(matches, j=job.nulljob):
    matches.sort(key=lambda match: -match.percentage)
    dupe2group = {}
    groups = []
    try:
        for match in j.iter_with_progress(matches, tr("Grouped %d/%d matches"), JOB_REFRESH_RATE):
            first, second, _ = match
            first_group = dupe2group.get(first)
            second_group = dupe2group.get(second)
            if first_group:
                if second_group:
                    if first_group is second_group:
                        target_group = first_group
                    else:
                        continue
                else:
                    target_group = first_group
                    dupe2group[second] = target_group
            else:
                if second_group:
                    target_group = second_group
                    dupe2group[first] = target_group
                else:
                    target_group = Group()
                    groups.append(target_group)
                    dupe2group[first] = target_group
                    dupe2group[second] = target_group
            target_group.add_match(match)
    except MemoryError:
        del dupe2group
        del matches
        # should free enough memory to continue
        logging.warning('Memory Overflow. Groups: {0}'.format(len(groups)))
    # Now that we have groups, we have to discard each group's matches and see if there are any
    # "orphan" matches, that is, matches that were candidates in a group but for which neither
    # of the two files was accepted in the group. With these orphan matches, it's safe to build
    # additional groups.
    matched_files = set(flatten(groups))
    orphan_matches = []
    for group in groups:
        orphan_matches += set(m for m in group.discard_matches() if not any(obj in matched_files for obj in [m.first, m.second]))
    if groups and orphan_matches:
        groups += get_groups(orphan_matches) # no job, as it isn't supposed to take a long time
    return groups

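# End-to-end illustrative sketch (reusing the hypothetical NamedObject class from the
# getmatches sketch above): matches that share files end up merged into a single group.
#     >>> objects = [NamedObject('foo bar'), NamedObject('foo bar'), NamedObject('foo bleh')]
#     >>> groups = get_groups(getmatches(objects))
#     >>> len(groups), len(groups[0])
#     (1, 3)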