Mirror of https://github.com/arsenetar/dupeguru.git (synced 2026-01-22 06:37:17 +00:00)
Remove flake8 E731 Errors
Note: black formatting is now applying correctly as well.
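For context, flake8's E731 check flags statements that assign a lambda expression to a name ("do not assign a lambda expression, use a def"). The fix used throughout this commit is to inline each lambda at its single call site; promoting it to a def is the other standard option. A minimal sketch of the pattern with made-up data (illustrative only, not code from dupeGuru):

    # e731_example.py - illustrative only
    groups = [["d", "e", "f"], ["a"], ["b", "c"]]

    # Before: flake8 reports E731 on the assignment
    #     keyfunc = lambda g: len(g)
    #     groups.sort(key=keyfunc)

    # After (what this commit does): inline the lambda at its only call site
    groups.sort(key=lambda g: len(g))

    # Alternative when the callable is reused elsewhere: promote it to a def
    def group_size(g):
        return len(g)

    assert groups == [["a"], ["b", "c"], ["d", "e", "f"]]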
@@ -17,7 +17,11 @@ from hscommon.util import flatten, multi_replace
 from hscommon.trans import tr
 from hscommon.jobprogress import job
 
-(WEIGHT_WORDS, MATCH_SIMILAR_WORDS, NO_FIELD_ORDER,) = range(3)
+(
+    WEIGHT_WORDS,
+    MATCH_SIMILAR_WORDS,
+    NO_FIELD_ORDER,
+) = range(3)
 
 JOB_REFRESH_RATE = 100
 
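The hunk above is black's "magic trailing comma" at work: because the original one-line tuple assignment ends with a trailing comma, black explodes it onto one element per line once its formatting applies. A quick way to reproduce the behavior locally (a sketch; it assumes black is installed, and the line_length value is an assumption based on the long single lines kept elsewhere in this diff):

    # check_trailing_comma.py - illustrative only, not part of the commit
    import black

    src = '(WEIGHT_WORDS, MATCH_SIMILAR_WORDS, NO_FIELD_ORDER,) = range(3)\n'
    # Prints the exploded form shown in the hunk above.
    print(black.format_str(src, mode=black.Mode(line_length=120)))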
@@ -34,11 +38,9 @@ def getwords(s):
     # of their accents, etc.) are preserved as is. The arbitrary limit is
     # obtained from this one: ord("\u037e") GREEK QUESTION MARK
     s = "".join(
-        c for c in s
-        if (ord(c) <= 894
-            and c in string.ascii_letters + string.digits + string.whitespace
-            )
-        or ord(c) > 894
+        c
+        for c in s
+        if (ord(c) <= 894 and c in string.ascii_letters + string.digits + string.whitespace) or ord(c) > 894
     )
     return [_f for _f in s.split(" ") if _f] # remove empty elements
 
@@ -115,9 +117,7 @@ def compare_fields(first, second, flags=()):
             if matched_field:
                 second.remove(matched_field)
     else:
-        results = [
-            compare(field1, field2, flags) for field1, field2 in zip(first, second)
-        ]
+        results = [compare(field1, field2, flags) for field1, field2 in zip(first, second)]
     return min(results) if results else 0
 
 
@@ -130,9 +130,7 @@ def build_word_dict(objects, j=job.nulljob):
     The result will be a dict with words as keys, lists of objects as values.
     """
     result = defaultdict(set)
-    for object in j.iter_with_progress(
-        objects, "Prepared %d/%d files", JOB_REFRESH_RATE
-    ):
+    for object in j.iter_with_progress(objects, "Prepared %d/%d files", JOB_REFRESH_RATE):
         for word in unpack_fields(object.words):
             result[word].add(object)
     return result
@@ -167,9 +165,7 @@ def reduce_common_words(word_dict, threshold):
     The exception to this removal are the objects where all the words of the object are common.
     Because if we remove them, we will miss some duplicates!
     """
-    uncommon_words = set(
-        word for word, objects in word_dict.items() if len(objects) < threshold
-    )
+    uncommon_words = set(word for word, objects in word_dict.items() if len(objects) < threshold)
     for word, objects in list(word_dict.items()):
         if len(objects) < threshold:
             continue
@@ -275,10 +271,7 @@ def getmatches(
         # This is the place where the memory usage is at its peak during the scan.
         # Just continue the process with an incomplete list of matches.
         del compared # This should give us enough room to call logging.
-        logging.warning(
-            "Memory Overflow. Matches: %d. Word dict: %d"
-            % (len(result), len(word_dict))
-        )
+        logging.warning("Memory Overflow. Matches: %d. Word dict: %d" % (len(result), len(word_dict)))
         return result
     return result
 
@@ -408,18 +401,13 @@ class Group:
 
         You can call this after the duplicate scanning process to free a bit of memory.
         """
-        discarded = set(
-            m
-            for m in self.matches
-            if not all(obj in self.unordered for obj in [m.first, m.second])
-        )
+        discarded = set(m for m in self.matches if not all(obj in self.unordered for obj in [m.first, m.second]))
         self.matches -= discarded
         self.candidates = defaultdict(set)
         return discarded
 
     def get_match_of(self, item):
-        """Returns the match pair between ``item`` and :attr:`ref`.
-        """
+        """Returns the match pair between ``item`` and :attr:`ref`."""
         if item is self.ref:
             return
         for m in self._get_matches_for_ref():
@@ -435,8 +423,7 @@ class Group:
         """
         # tie_breaker(ref, dupe) --> True if dupe should be ref
         # Returns True if anything changed during prioritization.
-        master_key_func = lambda x: (-x.is_ref, key_func(x))
-        new_order = sorted(self.ordered, key=master_key_func)
+        new_order = sorted(self.ordered, key=lambda x: (-x.is_ref, key_func(x)))
         changed = new_order != self.ordered
         self.ordered = new_order
         if tie_breaker is None:
@@ -459,9 +446,7 @@ class Group:
             self.unordered.remove(item)
             self._percentage = None
             self._matches_for_ref = None
-            if (len(self) > 1) and any(
-                not getattr(item, "is_ref", False) for item in self
-            ):
+            if (len(self) > 1) and any(not getattr(item, "is_ref", False) for item in self):
                 if discard_matches:
                     self.matches = set(m for m in self.matches if item not in m)
             else:
@@ -470,8 +455,7 @@ class Group:
             pass
 
     def switch_ref(self, with_dupe):
-        """Make the :attr:`ref` dupe of the group switch position with ``with_dupe``.
-        """
+        """Make the :attr:`ref` dupe of the group switch position with ``with_dupe``."""
         if self.ref.is_ref:
             return False
         try:
@@ -490,9 +474,7 @@ class Group:
         if self._percentage is None:
             if self.dupes:
                 matches = self._get_matches_for_ref()
-                self._percentage = sum(match.percentage for match in matches) // len(
-                    matches
-                )
+                self._percentage = sum(match.percentage for match in matches) // len(matches)
             else:
                 self._percentage = 0
         return self._percentage
@@ -547,12 +529,8 @@ def get_groups(matches):
     orphan_matches = []
     for group in groups:
         orphan_matches += {
-            m
-            for m in group.discard_matches()
-            if not any(obj in matched_files for obj in [m.first, m.second])
+            m for m in group.discard_matches() if not any(obj in matched_files for obj in [m.first, m.second])
         }
     if groups and orphan_matches:
-        groups += get_groups(
-            orphan_matches
-        ) # no job, as it isn't supposed to take a long time
+        groups += get_groups(orphan_matches) # no job, as it isn't supposed to take a long time
     return groups
@@ -106,9 +106,7 @@ class Results(Markable):
                     self.groups,
                 )
             if self.__filtered_dupes:
-                self.__dupes = [
-                    dupe for dupe in self.__dupes if dupe in self.__filtered_dupes
-                ]
+                self.__dupes = [dupe for dupe in self.__dupes if dupe in self.__filtered_dupes]
             sd = self.__dupes_sort_descriptor
             if sd:
                 self.sort_dupes(sd[0], sd[1], sd[2])
@@ -127,18 +125,10 @@ class Results(Markable):
             total_count = self.__total_count
             total_size = self.__total_size
         else:
-            mark_count = len(
-                [dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)]
-            )
-            marked_size = sum(
-                dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe)
-            )
-            total_count = len(
-                [dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)]
-            )
-            total_size = sum(
-                dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe)
-            )
+            mark_count = len([dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)])
+            marked_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe))
+            total_count = len([dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)])
+            total_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe))
         if self.mark_inverted:
             marked_size = self.__total_size - marked_size
         result = tr("%d / %d (%s / %s) duplicates marked.") % (
@@ -201,11 +191,7 @@ class Results(Markable):
             self.__filters.append(filter_str)
             if self.__filtered_dupes is None:
                 self.__filtered_dupes = flatten(g[:] for g in self.groups)
-            self.__filtered_dupes = set(
-                dupe
-                for dupe in self.__filtered_dupes
-                if filter_re.search(str(dupe.path))
-            )
+            self.__filtered_dupes = set(dupe for dupe in self.__filtered_dupes if filter_re.search(str(dupe.path)))
             filtered_groups = set()
             for dupe in self.__filtered_dupes:
                 filtered_groups.add(self.get_group_of_duplicate(dupe))
@@ -217,8 +203,7 @@ class Results(Markable):
         self.__dupes = None
 
     def get_group_of_duplicate(self, dupe):
-        """Returns :class:`~core.engine.Group` in which ``dupe`` belongs.
-        """
+        """Returns :class:`~core.engine.Group` in which ``dupe`` belongs."""
         try:
             return self.__group_of_duplicate[dupe]
         except (TypeError, KeyError):
@@ -284,8 +269,7 @@ class Results(Markable):
         self.is_modified = False
 
     def make_ref(self, dupe):
-        """Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group.
-        """
+        """Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group."""
         g = self.get_group_of_duplicate(dupe)
         r = g.ref
         if not g.switch_ref(dupe):
@@ -412,10 +396,10 @@ class Results(Markable):
         """
         if not self.__dupes:
             self.__get_dupe_list()
-        keyfunc = lambda d: self.app._get_dupe_sort_key(
-            d, lambda: self.get_group_of_duplicate(d), key, delta
+        self.__dupes.sort(
+            key=lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta),
+            reverse=not asc,
         )
-        self.__dupes.sort(key=keyfunc, reverse=not asc)
         self.__dupes_sort_descriptor = (key, asc, delta)
 
     def sort_groups(self, key, asc=True):
@@ -426,8 +410,7 @@ class Results(Markable):
         :param str key: key attribute name to sort with.
         :param bool asc: If false, sorting is reversed.
         """
-        keyfunc = lambda g: self.app._get_group_sort_key(g, key)
-        self.groups.sort(key=keyfunc, reverse=not asc)
+        self.groups.sort(key=lambda g: self.app._get_group_sort_key(g, key), reverse=not asc)
         self.__groups_sort_descriptor = (key, asc)
 
     # ---Properties
@@ -177,9 +177,7 @@ class TestCaseWordCompareWithFields:
     def test_simple(self):
         eq_(
             67,
-            compare_fields(
-                [["a", "b"], ["c", "d", "e"]], [["a", "b"], ["c", "d", "f"]]
-            ),
+            compare_fields([["a", "b"], ["c", "d", "e"]], [["a", "b"], ["c", "d", "f"]]),
         )
 
     def test_empty(self):
@@ -265,9 +263,7 @@ class TestCasebuild_word_dict:
         j = job.Job(1, do_progress)
         self.log = []
         s = "foo bar"
-        build_word_dict(
-            [NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j
-        )
+        build_word_dict([NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j)
         # We don't have intermediate log because iter_with_progress is called with every > 1
         eq_(0, self.log[0])
         eq_(100, self.log[1])
@@ -297,10 +293,7 @@ class TestCasereduce_common_words:
 
     def test_dont_remove_objects_with_only_common_words(self):
         d = {
-            "common": set(
-                [NamedObject("common uncommon", True) for i in range(50)]
-                + [NamedObject("common", True)]
-            ),
+            "common": set([NamedObject("common uncommon", True) for i in range(50)] + [NamedObject("common", True)]),
             "uncommon": set([NamedObject("common uncommon", True)]),
         }
         reduce_common_words(d, 50)
@@ -309,10 +302,7 @@ class TestCasereduce_common_words:
 
     def test_values_still_are_set_instances(self):
         d = {
-            "common": set(
-                [NamedObject("common uncommon", True) for i in range(50)]
-                + [NamedObject("common", True)]
-            ),
+            "common": set([NamedObject("common uncommon", True) for i in range(50)] + [NamedObject("common", True)]),
             "uncommon": set([NamedObject("common uncommon", True)]),
         }
         reduce_common_words(d, 50)
@@ -352,12 +342,8 @@ class TestCasereduce_common_words:
         # would not stay in 'bar' because 'foo' is not a common word anymore.
         only_common = NamedObject("foo bar", True)
         d = {
-            "foo": set(
-                [NamedObject("foo bar baz", True) for i in range(49)] + [only_common]
-            ),
-            "bar": set(
-                [NamedObject("foo bar baz", True) for i in range(49)] + [only_common]
-            ),
+            "foo": set([NamedObject("foo bar baz", True) for i in range(49)] + [only_common]),
+            "bar": set([NamedObject("foo bar baz", True) for i in range(49)] + [only_common]),
             "baz": set([NamedObject("foo bar baz", True) for i in range(49)]),
         }
         reduce_common_words(d, 50)
@@ -386,9 +372,7 @@ class TestCaseget_match:
         assert object() not in m
 
     def test_word_weight(self):
-        m = get_match(
-            NamedObject("foo bar", True), NamedObject("bar bleh", True), (WEIGHT_WORDS,)
-        )
+        m = get_match(NamedObject("foo bar", True), NamedObject("bar bleh", True), (WEIGHT_WORDS,))
         eq_(m.percentage, int((6.0 / 13.0) * 100))
 
 
@@ -554,8 +538,12 @@ class TestCaseGetMatchesByContents:
     def test_big_file_partial_hashes(self):
         smallsize = 1
         bigsize = 100 * 1024 * 1024 # 100MB
-        f = [no("bigfoo", size=bigsize), no("bigbar", size=bigsize),
-             no("smallfoo", size=smallsize), no("smallbar", size=smallsize)]
+        f = [
+            no("bigfoo", size=bigsize),
+            no("bigbar", size=bigsize),
+            no("smallfoo", size=smallsize),
+            no("smallbar", size=smallsize),
+        ]
         f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
         f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
         f[2].md5 = f[2].md5partial = "bleh"
@@ -749,8 +737,7 @@ class TestCaseGroup:
         # if the ref has the same key as one or more of the dupe, run the tie_breaker func among them
         g = get_test_group()
         o1, o2, o3 = g.ordered
-        tie_breaker = lambda ref, dupe: dupe is o3
-        g.prioritize(lambda x: 0, tie_breaker)
+        g.prioritize(lambda x: 0, lambda ref, dupe: dupe is o3)
         assert g.ref is o3
 
     def test_prioritize_with_tie_breaker_runs_on_all_dupes(self):
@@ -761,8 +748,7 @@ class TestCaseGroup:
         o1.foo = 1
         o2.foo = 2
         o3.foo = 3
-        tie_breaker = lambda ref, dupe: dupe.foo > ref.foo
-        g.prioritize(lambda x: 0, tie_breaker)
+        g.prioritize(lambda x: 0, lambda ref, dupe: dupe.foo > ref.foo)
         assert g.ref is o3
 
     def test_prioritize_with_tie_breaker_runs_only_on_tie_dupes(self):
@@ -775,9 +761,7 @@ class TestCaseGroup:
         o1.bar = 1
         o2.bar = 2
         o3.bar = 3
-        key_func = lambda x: -x.foo
-        tie_breaker = lambda ref, dupe: dupe.bar > ref.bar
-        g.prioritize(key_func, tie_breaker)
+        g.prioritize(lambda x: -x.foo, lambda ref, dupe: dupe.bar > ref.bar)
         assert g.ref is o2
 
     def test_prioritize_with_ref_dupe(self):
@@ -909,9 +893,7 @@ class TestCaseget_groups:
         m1 = Match(A, B, 90) # This is the strongest "A" match
         m2 = Match(A, C, 80) # Because C doesn't match with B, it won't be in the group
         m3 = Match(A, D, 80) # Same thing for D
-        m4 = Match(
-            C, D, 70
-        ) # However, because C and D match, they should have their own group.
+        m4 = Match(C, D, 70) # However, because C and D match, they should have their own group.
         groups = get_groups([m1, m2, m3, m4])
         eq_(len(groups), 2)
         g1, g2 = groups