From c4dcfd3d4ba41b897a56a54de88c083cd8fa2096 Mon Sep 17 00:00:00 2001 From: glubsy Date: Thu, 29 Apr 2021 05:08:43 +0200 Subject: [PATCH 1/2] Fix stripping (japanese) unicode characters * Accents are getting removed from Unicode characters to generate similar "words". * Non-Latin characters which cannot be processed that way (e.g. Japanese, Greek, Russian, etc.) should not be filtered out at all, otherwise files are erroneously skipped or detected as dupes if only some characters make it past the filter. * Starting from an arbitrary unicode codepoint (converted to decimal), above which we know it is pointless to try any sort of processing, we leave the characters as is. * Fix #878. --- core/engine.py | 13 ++++++++++++- core/tests/engine_test.py | 6 +++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/core/engine.py b/core/engine.py index 8a5f054a..867b8e09 100644 --- a/core/engine.py +++ b/core/engine.py @@ -26,8 +26,19 @@ def getwords(s): # We decompose the string so that ascii letters with accents can be part of the word. s = normalize("NFD", s) s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", " ").lower() + # logging.debug(f"DEBUG chars for: {s}\n" + # f"{[c for c in s if ord(c) != 32]}\n" + # f"{[ord(c) for c in s if ord(c) != 32]}") + # HACK We shouldn't ignore non-ascii characters altogether. Any Unicode char + # above common european characters that cannot be "sanitized" (ie. stripped + # of their accents, etc.) are preserved as is. 
The arbitrary limit is + # obtained from this one: ord("\u037e") GREEK QUESTION MARK s = "".join( - c for c in s if c in string.ascii_letters + string.digits + string.whitespace + c for c in s + if (ord(c) < 894 + and c in string.ascii_letters + string.digits + string.whitespace + ) + or ord(c) > 894 ) return [_f for _f in s.split(" ") if _f] # remove empty elements diff --git a/core/tests/engine_test.py b/core/tests/engine_test.py index b36378ce..0c36b42f 100644 --- a/core/tests/engine_test.py +++ b/core/tests/engine_test.py @@ -69,6 +69,10 @@ class TestCasegetwords: eq_(["a", "b", "c", "d"], getwords("a b c d")) eq_(["a", "b", "c", "d"], getwords(" a b c d ")) + def test_unicode(self): + eq_(["e", "c", "0", "a", "o", "u", "e", "u"], getwords("é ç 0 à ö û è ¤ ù")) + eq_(["02", "君のこころは輝いてるかい?", "国木田花丸", "solo", "ver"], getwords("02 君のこころは輝いてるかい? 国木田花丸 Solo Ver")) + def test_splitter_chars(self): eq_( [chr(i) for i in range(ord("a"), ord("z") + 1)], @@ -85,7 +89,7 @@ class TestCasegetwords: eq_(["foo", "bar"], getwords("FOO BAR")) def test_decompose_unicode(self): - eq_(getwords("foo\xe9bar"), ["fooebar"]) + eq_(["fooebar"], getwords("foo\xe9bar")) class TestCasegetfields: From f1ae47843338683e9ae4f306d2bd336031df919e Mon Sep 17 00:00:00 2001 From: glubsy Date: Thu, 29 Apr 2021 05:29:35 +0200 Subject: [PATCH 2/2] Fix including character at the border --- core/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/engine.py b/core/engine.py index 867b8e09..ba306a98 100644 --- a/core/engine.py +++ b/core/engine.py @@ -35,7 +35,7 @@ def getwords(s): # obtained from this one: ord("\u037e") GREEK QUESTION MARK s = "".join( c for c in s - if (ord(c) < 894 + if (ord(c) <= 894 and c in string.ascii_letters + string.digits + string.whitespace ) or ord(c) > 894