mirror of https://github.com/arsenetar/dupeguru.git synced 2026-01-22 06:37:17 +00:00

Remove flake8 E731 Errors

Note: black formatting is now applying correctly as well.
2021-08-15 03:51:27 -05:00
parent af19660c18
commit 9446f37fad
6 changed files with 100 additions and 229 deletions
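For reference, flake8's E731 ("do not assign a lambda expression, use a def") fires only when a lambda is bound to a name, so each fix below either inlines the lambda at its call site or reshapes the call so black can wrap it. A minimal illustration of both remedies (the names here are invented, not taken from this diff):

    # E731 flags the assignment, not the lambda itself:
    #     keyfunc = lambda item: item[1]  # E731
    files = [("b.txt", 300), ("a.txt", 100)]

    # Remedy used throughout this commit: pass the lambda inline.
    files.sort(key=lambda item: item[1])

    # The other remedy flake8 suggests: a def, which also gives the
    # function a real name in tracebacks.
    def by_size(item):
        return item[1]

    files.sort(key=by_size)
    print(files)  # [('a.txt', 100), ('b.txt', 300)]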

View File

@@ -17,7 +17,11 @@ from hscommon.util import flatten, multi_replace
 from hscommon.trans import tr
 from hscommon.jobprogress import job

-(WEIGHT_WORDS, MATCH_SIMILAR_WORDS, NO_FIELD_ORDER,) = range(3)
+(
+    WEIGHT_WORDS,
+    MATCH_SIMILAR_WORDS,
+    NO_FIELD_ORDER,
+) = range(3)

 JOB_REFRESH_RATE = 100
@@ -34,11 +38,9 @@ def getwords(s):
     # of their accents, etc.) are preserved as is. The arbitrary limit is
     # obtained from this one: ord("\u037e") GREEK QUESTION MARK
     s = "".join(
-        c for c in s
-        if (ord(c) <= 894
-            and c in string.ascii_letters + string.digits + string.whitespace
-        )
-        or ord(c) > 894
+        c
+        for c in s
+        if (ord(c) <= 894 and c in string.ascii_letters + string.digits + string.whitespace) or ord(c) > 894
     )
     return [_f for _f in s.split(" ") if _f]  # remove empty elements
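For a concrete sense of what that filter keeps, here is a minimal sketch (assuming the NFD normalization that precedes this filter in getwords(); the sample string is invented): below ord 894 only ASCII letters, digits and whitespace survive, while anything above 894 passes through untouched.

    import string
    from unicodedata import normalize

    # getwords() NFD-normalizes first, so "é" arrives as "e" plus a
    # combining accent (ord 769), which the filter below strips.
    s = normalize("NFD", "déjà vu 猫")
    kept = "".join(
        c
        for c in s
        if (ord(c) <= 894 and c in string.ascii_letters + string.digits + string.whitespace) or ord(c) > 894
    )
    print(kept)  # "deja vu 猫" -- ASCII recovered, CJK preserved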
@@ -115,9 +117,7 @@ def compare_fields(first, second, flags=()):
             if matched_field:
                 second.remove(matched_field)
     else:
-        results = [
-            compare(field1, field2, flags) for field1, field2 in zip(first, second)
-        ]
+        results = [compare(field1, field2, flags) for field1, field2 in zip(first, second)]
     return min(results) if results else 0
@@ -130,9 +130,7 @@ def build_word_dict(objects, j=job.nulljob):
     The result will be a dict with words as keys, lists of objects as values.
     """
     result = defaultdict(set)
-    for object in j.iter_with_progress(
-        objects, "Prepared %d/%d files", JOB_REFRESH_RATE
-    ):
+    for object in j.iter_with_progress(objects, "Prepared %d/%d files", JOB_REFRESH_RATE):
         for word in unpack_fields(object.words):
             result[word].add(object)
     return result
@@ -167,9 +165,7 @@ def reduce_common_words(word_dict, threshold):
     The exception to this removal are the objects where all the words of the object are common.
     Because if we remove them, we will miss some duplicates!
     """
-    uncommon_words = set(
-        word for word, objects in word_dict.items() if len(objects) < threshold
-    )
+    uncommon_words = set(word for word, objects in word_dict.items() if len(objects) < threshold)
     for word, objects in list(word_dict.items()):
         if len(objects) < threshold:
             continue
@@ -275,10 +271,7 @@ def getmatches(
         # This is the place where the memory usage is at its peak during the scan.
         # Just continue the process with an incomplete list of matches.
         del compared  # This should give us enough room to call logging.
-        logging.warning(
-            "Memory Overflow. Matches: %d. Word dict: %d"
-            % (len(result), len(word_dict))
-        )
+        logging.warning("Memory Overflow. Matches: %d. Word dict: %d" % (len(result), len(word_dict)))
         return result
     return result
@@ -408,18 +401,13 @@ class Group:
         You can call this after the duplicate scanning process to free a bit of memory.
         """
-        discarded = set(
-            m
-            for m in self.matches
-            if not all(obj in self.unordered for obj in [m.first, m.second])
-        )
+        discarded = set(m for m in self.matches if not all(obj in self.unordered for obj in [m.first, m.second]))
         self.matches -= discarded
         self.candidates = defaultdict(set)
         return discarded

     def get_match_of(self, item):
-        """Returns the match pair between ``item`` and :attr:`ref`.
-        """
+        """Returns the match pair between ``item`` and :attr:`ref`."""
         if item is self.ref:
             return
         for m in self._get_matches_for_ref():
@@ -435,8 +423,7 @@ class Group:
         """
         # tie_breaker(ref, dupe) --> True if dupe should be ref
         # Returns True if anything changed during prioritization.
-        master_key_func = lambda x: (-x.is_ref, key_func(x))
-        new_order = sorted(self.ordered, key=master_key_func)
+        new_order = sorted(self.ordered, key=lambda x: (-x.is_ref, key_func(x)))
         changed = new_order != self.ordered
         self.ordered = new_order
         if tie_breaker is None:
@@ -459,9 +446,7 @@ class Group:
             self.unordered.remove(item)
             self._percentage = None
             self._matches_for_ref = None
-            if (len(self) > 1) and any(
-                not getattr(item, "is_ref", False) for item in self
-            ):
+            if (len(self) > 1) and any(not getattr(item, "is_ref", False) for item in self):
                 if discard_matches:
                     self.matches = set(m for m in self.matches if item not in m)
             else:
@@ -470,8 +455,7 @@ class Group:
             pass

     def switch_ref(self, with_dupe):
-        """Make the :attr:`ref` dupe of the group switch position with ``with_dupe``.
-        """
+        """Make the :attr:`ref` dupe of the group switch position with ``with_dupe``."""
         if self.ref.is_ref:
             return False
         try:
@@ -490,9 +474,7 @@ class Group:
         if self._percentage is None:
             if self.dupes:
                 matches = self._get_matches_for_ref()
-                self._percentage = sum(match.percentage for match in matches) // len(
-                    matches
-                )
+                self._percentage = sum(match.percentage for match in matches) // len(matches)
             else:
                 self._percentage = 0
         return self._percentage
@@ -547,12 +529,8 @@ def get_groups(matches):
     orphan_matches = []
     for group in groups:
         orphan_matches += {
-            m
-            for m in group.discard_matches()
-            if not any(obj in matched_files for obj in [m.first, m.second])
+            m for m in group.discard_matches() if not any(obj in matched_files for obj in [m.first, m.second])
         }
     if groups and orphan_matches:
-        groups += get_groups(
-            orphan_matches
-        )  # no job, as it isn't supposed to take a long time
+        groups += get_groups(orphan_matches)  # no job, as it isn't supposed to take a long time
     return groups

View File

@@ -106,9 +106,7 @@ class Results(Markable):
                     self.groups,
                 )
             if self.__filtered_dupes:
-                self.__dupes = [
-                    dupe for dupe in self.__dupes if dupe in self.__filtered_dupes
-                ]
+                self.__dupes = [dupe for dupe in self.__dupes if dupe in self.__filtered_dupes]
             sd = self.__dupes_sort_descriptor
             if sd:
                 self.sort_dupes(sd[0], sd[1], sd[2])
@@ -127,18 +125,10 @@ class Results(Markable):
             total_count = self.__total_count
             total_size = self.__total_size
         else:
-            mark_count = len(
-                [dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)]
-            )
-            marked_size = sum(
-                dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe)
-            )
-            total_count = len(
-                [dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)]
-            )
-            total_size = sum(
-                dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe)
-            )
+            mark_count = len([dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)])
+            marked_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe))
+            total_count = len([dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)])
+            total_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe))
         if self.mark_inverted:
             marked_size = self.__total_size - marked_size
         result = tr("%d / %d (%s / %s) duplicates marked.") % (
@@ -201,11 +191,7 @@ class Results(Markable):
             self.__filters.append(filter_str)
             if self.__filtered_dupes is None:
                 self.__filtered_dupes = flatten(g[:] for g in self.groups)
-            self.__filtered_dupes = set(
-                dupe
-                for dupe in self.__filtered_dupes
-                if filter_re.search(str(dupe.path))
-            )
+            self.__filtered_dupes = set(dupe for dupe in self.__filtered_dupes if filter_re.search(str(dupe.path)))
             filtered_groups = set()
             for dupe in self.__filtered_dupes:
                 filtered_groups.add(self.get_group_of_duplicate(dupe))
@@ -217,8 +203,7 @@ class Results(Markable):
         self.__dupes = None

     def get_group_of_duplicate(self, dupe):
-        """Returns :class:`~core.engine.Group` in which ``dupe`` belongs.
-        """
+        """Returns :class:`~core.engine.Group` in which ``dupe`` belongs."""
         try:
             return self.__group_of_duplicate[dupe]
         except (TypeError, KeyError):
@@ -284,8 +269,7 @@ class Results(Markable):
         self.is_modified = False

     def make_ref(self, dupe):
-        """Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group.
-        """
+        """Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group."""
         g = self.get_group_of_duplicate(dupe)
         r = g.ref
         if not g.switch_ref(dupe):
@@ -412,10 +396,10 @@ class Results(Markable):
         """
         if not self.__dupes:
             self.__get_dupe_list()
-        keyfunc = lambda d: self.app._get_dupe_sort_key(
-            d, lambda: self.get_group_of_duplicate(d), key, delta
+        self.__dupes.sort(
+            key=lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta),
+            reverse=not asc,
         )
-        self.__dupes.sort(key=keyfunc, reverse=not asc)
         self.__dupes_sort_descriptor = (key, asc, delta)

     def sort_groups(self, key, asc=True):
@@ -426,8 +410,7 @@ class Results(Markable):
         :param str key: key attribute name to sort with.
         :param bool asc: If false, sorting is reversed.
         """
-        keyfunc = lambda g: self.app._get_group_sort_key(g, key)
-        self.groups.sort(key=keyfunc, reverse=not asc)
+        self.groups.sort(key=lambda g: self.app._get_group_sort_key(g, key), reverse=not asc)
         self.__groups_sort_descriptor = (key, asc)

     # ---Properties
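The two sorting methods above follow the same recipe: the old keyfunc assignment becomes an inline key= argument, and when the resulting line runs long, black wraps the call rather than the lambda. A hedged sketch of the pattern (records and the asc flag mimic sort_dupes(); the data is invented):

    records = [{"name": "b", "size": 2}, {"name": "a", "size": 1}]
    asc = False
    records.sort(
        key=lambda r: (r["size"], r["name"]),
        reverse=not asc,  # same ascending-flag convention as sort_dupes()
    )
    print([r["name"] for r in records])  # ['b', 'a'] when asc is False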

View File

@@ -177,9 +177,7 @@ class TestCaseWordCompareWithFields:
     def test_simple(self):
         eq_(
             67,
-            compare_fields(
-                [["a", "b"], ["c", "d", "e"]], [["a", "b"], ["c", "d", "f"]]
-            ),
+            compare_fields([["a", "b"], ["c", "d", "e"]], [["a", "b"], ["c", "d", "f"]]),
         )

     def test_empty(self):
@@ -265,9 +263,7 @@ class TestCasebuild_word_dict:
         j = job.Job(1, do_progress)
         self.log = []
         s = "foo bar"
-        build_word_dict(
-            [NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j
-        )
+        build_word_dict([NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j)
         # We don't have intermediate log because iter_with_progress is called with every > 1
         eq_(0, self.log[0])
         eq_(100, self.log[1])
@@ -297,10 +293,7 @@ class TestCasereduce_common_words:
     def test_dont_remove_objects_with_only_common_words(self):
         d = {
-            "common": set(
-                [NamedObject("common uncommon", True) for i in range(50)]
-                + [NamedObject("common", True)]
-            ),
+            "common": set([NamedObject("common uncommon", True) for i in range(50)] + [NamedObject("common", True)]),
             "uncommon": set([NamedObject("common uncommon", True)]),
         }
         reduce_common_words(d, 50)
@@ -309,10 +302,7 @@ class TestCasereduce_common_words:
     def test_values_still_are_set_instances(self):
         d = {
-            "common": set(
-                [NamedObject("common uncommon", True) for i in range(50)]
-                + [NamedObject("common", True)]
-            ),
+            "common": set([NamedObject("common uncommon", True) for i in range(50)] + [NamedObject("common", True)]),
            "uncommon": set([NamedObject("common uncommon", True)]),
         }
         reduce_common_words(d, 50)
@@ -352,12 +342,8 @@ class TestCasereduce_common_words:
         # would not stay in 'bar' because 'foo' is not a common word anymore.
         only_common = NamedObject("foo bar", True)
         d = {
-            "foo": set(
-                [NamedObject("foo bar baz", True) for i in range(49)] + [only_common]
-            ),
-            "bar": set(
-                [NamedObject("foo bar baz", True) for i in range(49)] + [only_common]
-            ),
+            "foo": set([NamedObject("foo bar baz", True) for i in range(49)] + [only_common]),
+            "bar": set([NamedObject("foo bar baz", True) for i in range(49)] + [only_common]),
             "baz": set([NamedObject("foo bar baz", True) for i in range(49)]),
         }
         reduce_common_words(d, 50)
@@ -386,9 +372,7 @@ class TestCaseget_match:
         assert object() not in m

     def test_word_weight(self):
-        m = get_match(
-            NamedObject("foo bar", True), NamedObject("bar bleh", True), (WEIGHT_WORDS,)
-        )
+        m = get_match(NamedObject("foo bar", True), NamedObject("bar bleh", True), (WEIGHT_WORDS,))
         eq_(m.percentage, int((6.0 / 13.0) * 100))
@@ -554,8 +538,12 @@ class TestCaseGetMatchesByContents:
     def test_big_file_partial_hashes(self):
         smallsize = 1
         bigsize = 100 * 1024 * 1024  # 100MB
-        f = [no("bigfoo", size=bigsize), no("bigbar", size=bigsize),
-            no("smallfoo", size=smallsize), no("smallbar", size=smallsize)]
+        f = [
+            no("bigfoo", size=bigsize),
+            no("bigbar", size=bigsize),
+            no("smallfoo", size=smallsize),
+            no("smallbar", size=smallsize),
+        ]
         f[0].md5 = f[0].md5partial = f[0].md5samples = "foobar"
         f[1].md5 = f[1].md5partial = f[1].md5samples = "foobar"
         f[2].md5 = f[2].md5partial = "bleh"
@@ -749,8 +737,7 @@ class TestCaseGroup:
         # if the ref has the same key as one or more of the dupe, run the tie_breaker func among them
         g = get_test_group()
         o1, o2, o3 = g.ordered
-        tie_breaker = lambda ref, dupe: dupe is o3
-        g.prioritize(lambda x: 0, tie_breaker)
+        g.prioritize(lambda x: 0, lambda ref, dupe: dupe is o3)
         assert g.ref is o3

     def test_prioritize_with_tie_breaker_runs_on_all_dupes(self):
@@ -761,8 +748,7 @@ class TestCaseGroup:
         o1.foo = 1
         o2.foo = 2
         o3.foo = 3
-        tie_breaker = lambda ref, dupe: dupe.foo > ref.foo
-        g.prioritize(lambda x: 0, tie_breaker)
+        g.prioritize(lambda x: 0, lambda ref, dupe: dupe.foo > ref.foo)
         assert g.ref is o3

     def test_prioritize_with_tie_breaker_runs_only_on_tie_dupes(self):
@@ -775,9 +761,7 @@ class TestCaseGroup:
         o1.bar = 1
         o2.bar = 2
         o3.bar = 3
-        key_func = lambda x: -x.foo
-        tie_breaker = lambda ref, dupe: dupe.bar > ref.bar
-        g.prioritize(key_func, tie_breaker)
+        g.prioritize(lambda x: -x.foo, lambda ref, dupe: dupe.bar > ref.bar)
         assert g.ref is o2

     def test_prioritize_with_ref_dupe(self):
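These three tests pin down the tie_breaker contract noted in engine.py: key_func orders the group, and tie_breaker(ref, dupe) returns True when dupe should displace ref among equal keys. Below is a rough, self-contained approximation of that contract, not the real Group code; pick_ref and the sample values are invented to force a tie:

    def pick_ref(items, key_func, tie_breaker):
        best = items[0]
        for candidate in items[1:]:
            if key_func(candidate) < key_func(best):
                best = candidate  # strictly better key wins outright
            elif key_func(candidate) == key_func(best) and tie_breaker(best, candidate):
                best = candidate  # equal keys: tie_breaker may promote it
        return best

    class O:
        def __init__(self, foo, bar):
            self.foo, self.bar = foo, bar

    o1, o2, o3 = O(foo=2, bar=1), O(foo=2, bar=2), O(foo=1, bar=3)
    chosen = pick_ref([o1, o2, o3], lambda x: -x.foo, lambda ref, dupe: dupe.bar > ref.bar)
    assert chosen is o2  # o3 has the best bar but never ties on foo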
@@ -909,9 +893,7 @@ class TestCaseget_groups:
         m1 = Match(A, B, 90)  # This is the strongest "A" match
         m2 = Match(A, C, 80)  # Because C doesn't match with B, it won't be in the group
         m3 = Match(A, D, 80)  # Same thing for D
-        m4 = Match(
-            C, D, 70
-        )  # However, because C and D match, they should have their own group.
+        m4 = Match(C, D, 70)  # However, because C and D match, they should have their own group.
         groups = get_groups([m1, m2, m3, m4])
         eq_(len(groups), 2)
         g1, g2 = groups
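The inline comments on m1 through m4 describe get_groups()' greedy behavior: matches are consumed strongest-first, a file already claimed by a group blocks weaker matches that touch it, and leftover matches between unclaimed files form their own groups. A deliberately simplified sketch that reproduces this scenario's outcome (the real get_groups() grows groups beyond pairs and recycles orphan matches recursively):

    def simple_groups(matches):
        taken, groups = set(), []
        for a, b, pct in sorted(matches, key=lambda m: -m[2]):
            if a in taken or b in taken:
                continue  # like m2/m3 above: A is already grouped with B
            groups.append({a, b})
            taken |= {a, b}
        return groups

    print(simple_groups([("A", "B", 90), ("A", "C", 80), ("A", "D", 80), ("C", "D", 70)]))
    # [{'A', 'B'}, {'C', 'D'}] -- two groups, matching the test's assertion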