Merge pull request #879 from glubsy/fix_unicode

Fix stripping (japanese) unicode characters
2026-07-02 11:07:52 +00:00 · 2021-05-25 19:11:19 -05:00
parent 72e0f76242 f1ae478433
commit 0b46ca2222
2 changed files with 17 additions and 2 deletions
--- a/core/engine.py
+++ b/core/engine.py
@@ -26,8 +26,19 @@ def getwords(s):
    # We decompose the string so that ascii letters with accents can be part of the word.
    s = normalize("NFD", s)
    s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", " ").lower()
    # logging.debug(f"DEBUG chars for: {s}\n"
    #               f"{[c for c in s if ord(c) != 32]}\n"
    #               f"{[ord(c) for c in s if ord(c) != 32]}")
    # HACK We shouldn't ignore non-ASCII characters altogether. Any Unicode char
    # above the common European characters, which cannot be "sanitized" (i.e.
    # stripped of its accents, etc.), is preserved as is. The arbitrary limit
    # is the code point of GREEK QUESTION MARK: ord("\u037e") == 894.
    s = "".join(
-        c for c in s if c in string.ascii_letters + string.digits + string.whitespace
+        c for c in s
        if (ord(c) <= 894
            and c in string.ascii_letters + string.digits + string.whitespace
            )
        or ord(c) > 894
    )
    return [_f for _f in s.split(" ") if _f]  # remove empty elements
--- a/core/tests/engine_test.py
+++ b/core/tests/engine_test.py
@@ -69,6 +69,10 @@ class TestCasegetwords:
        eq_(["a", "b", "c", "d"], getwords("a b c d"))
        eq_(["a", "b", "c", "d"], getwords(" a  b  c d "))
    def test_unicode(self):
        eq_(["e", "c", "0", "a", "o", "u", "e", "u"], getwords("é ç 0 à ö û è ¤ ù"))
        eq_(["02", "君のこころは輝いてるかい？", "国木田花丸", "solo", "ver"], getwords("02 君のこころは輝いてるかい？ 国木田花丸 Solo Ver"))
    def test_splitter_chars(self):
        eq_(
            [chr(i) for i in range(ord("a"), ord("z") + 1)],
@@ -85,7 +89,7 @@ class TestCasegetwords:
        eq_(["foo", "bar"], getwords("FOO BAR"))
    def test_decompose_unicode(self):
-        eq_(getwords("foo\xe9bar"), ["fooebar"])
+        eq_(["fooebar"], getwords("foo\xe9bar"))
 class TestCasegetfields: