Merge pull request #879 from glubsy/fix_unicode

Fix stripping of (Japanese) Unicode characters
2025-07-23 14:03:20 +00:00 · 2021-05-25 19:11:19 -05:00 · 2021-05-25 19:11:19 -05:00 · 0b46ca2222
commit 0b46ca2222
parent 72e0f76242 f1ae478433
2 changed files with 17 additions and 2 deletions
--- a/core/engine.py
+++ b/core/engine.py
@ -26,8 +26,19 @@ def getwords(s):
    # We decompose the string so that ascii letters with accents can be part of the word.
    s = normalize("NFD", s)
    s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", " ").lower()
+    # logging.debug(f"DEBUG chars for: {s}\n"
+    #               f"{[c for c in s if ord(c) != 32]}\n"
+    #               f"{[ord(c) for c in s if ord(c) != 32]}")
+    # HACK: We shouldn't ignore non-ASCII characters altogether. Any Unicode char
+    # above the common European characters that cannot be "sanitized" (i.e. stripped
+    # of its accents, etc.) is preserved as is. The arbitrary limit is the code
+    # point of GREEK QUESTION MARK: ord("\u037e") == 894
    s = "".join(
-        c for c in s if c in string.ascii_letters + string.digits + string.whitespace
+        c for c in s
+        if (ord(c) <= 894
+            and c in string.ascii_letters + string.digits + string.whitespace
+            )
+        or ord(c) > 894
    )
    return [_f for _f in s.split(" ") if _f]  # remove empty elements

--- a/core/tests/engine_test.py
+++ b/core/tests/engine_test.py
@ -69,6 +69,10 @@ class TestCasegetwords:
        eq_(["a", "b", "c", "d"], getwords("a b c d"))
        eq_(["a", "b", "c", "d"], getwords(" a  b  c d "))

+    def test_unicode(self):
+        eq_(["e", "c", "0", "a", "o", "u", "e", "u"], getwords("é ç 0 à ö û è ¤ ù"))
+        eq_(["02", "君のこころは輝いてるかい？", "国木田花丸", "solo", "ver"], getwords("02 君のこころは輝いてるかい？ 国木田花丸 Solo Ver"))
+
    def test_splitter_chars(self):
        eq_(
            [chr(i) for i in range(ord("a"), ord("z") + 1)],
@ -85,7 +89,7 @@ class TestCasegetwords:
        eq_(["foo", "bar"], getwords("FOO BAR"))

    def test_decompose_unicode(self):
-        eq_(getwords("foo\xe9bar"), ["fooebar"])
+        eq_(["fooebar"], getwords("foo\xe9bar"))


 class TestCasegetfields: