Merge pull request #879 from glubsy/fix_unicode

Fix stripping (japanese) unicode characters
This commit is contained in:
Andrew Senetar 2021-05-25 19:11:19 -05:00 committed by GitHub
commit 0b46ca2222
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 17 additions and 2 deletions

View File

@ -26,8 +26,19 @@ def getwords(s):
# We decompose the string so that ascii letters with accents can be part of the word.
s = normalize("NFD", s)
s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", " ").lower()
# logging.debug(f"DEBUG chars for: {s}\n"
# f"{[c for c in s if ord(c) != 32]}\n"
# f"{[ord(c) for c in s if ord(c) != 32]}")
# HACK We shouldn't ignore non-ascii characters altogether. Any Unicode char
# above common european characters that cannot be "sanitized" (ie. stripped
# of their accents, etc.) are preserved as is. The arbitrary limit is
# obtained from this one: ord("\u037e") GREEK QUESTION MARK
s = "".join(
c for c in s if c in string.ascii_letters + string.digits + string.whitespace
c for c in s
if (ord(c) <= 894
and c in string.ascii_letters + string.digits + string.whitespace
)
or ord(c) > 894
)
return [_f for _f in s.split(" ") if _f] # remove empty elements

View File

@ -69,6 +69,10 @@ class TestCasegetwords:
eq_(["a", "b", "c", "d"], getwords("a b c d"))
eq_(["a", "b", "c", "d"], getwords(" a b c d "))
def test_unicode(self):
eq_(["e", "c", "0", "a", "o", "u", "e", "u"], getwords("é ç 0 à ö û è ¤ ù"))
eq_(["02", "君のこころは輝いてるかい?", "国木田花丸", "solo", "ver"], getwords("02 君のこころは輝いてるかい? 国木田花丸 Solo Ver"))
def test_splitter_chars(self):
eq_(
[chr(i) for i in range(ord("a"), ord("z") + 1)],
@ -85,7 +89,7 @@ class TestCasegetwords:
eq_(["foo", "bar"], getwords("FOO BAR"))
def test_decompose_unicode(self):
eq_(getwords("foo\xe9bar"), ["fooebar"])
eq_(["fooebar"], getwords("foo\xe9bar"))
class TestCasegetfields: