mirror of
https://github.com/arsenetar/dupeguru.git
synced 2025-03-09 21:24:36 +00:00
Merge pull request #879 from glubsy/fix_unicode
Fix stripping of (Japanese) Unicode characters
This commit is contained in:
commit
0b46ca2222
@ -26,8 +26,19 @@ def getwords(s):
|
|||||||
# We decompose the string so that ascii letters with accents can be part of the word.
|
# We decompose the string so that ascii letters with accents can be part of the word.
|
||||||
s = normalize("NFD", s)
|
s = normalize("NFD", s)
|
||||||
s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", " ").lower()
|
s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", " ").lower()
|
||||||
|
# logging.debug(f"DEBUG chars for: {s}\n"
|
||||||
|
# f"{[c for c in s if ord(c) != 32]}\n"
|
||||||
|
# f"{[ord(c) for c in s if ord(c) != 32]}")
|
||||||
|
# HACK: We shouldn't ignore non-ASCII characters altogether. Any Unicode char
|
||||||
|
# above common European characters that cannot be "sanitized" (i.e. stripped
|
||||||
|
# of their accents, etc.) is preserved as is. The arbitrary limit is
|
||||||
|
# obtained from this code point: ord("\u037e") == 894 (GREEK QUESTION MARK)
|
||||||
s = "".join(
|
s = "".join(
|
||||||
c for c in s if c in string.ascii_letters + string.digits + string.whitespace
|
c for c in s
|
||||||
|
if (ord(c) <= 894
|
||||||
|
and c in string.ascii_letters + string.digits + string.whitespace
|
||||||
|
)
|
||||||
|
or ord(c) > 894
|
||||||
)
|
)
|
||||||
return [_f for _f in s.split(" ") if _f] # remove empty elements
|
return [_f for _f in s.split(" ") if _f] # remove empty elements
|
||||||
|
|
||||||
|
@ -69,6 +69,10 @@ class TestCasegetwords:
|
|||||||
eq_(["a", "b", "c", "d"], getwords("a b c d"))
|
eq_(["a", "b", "c", "d"], getwords("a b c d"))
|
||||||
eq_(["a", "b", "c", "d"], getwords(" a b c d "))
|
eq_(["a", "b", "c", "d"], getwords(" a b c d "))
|
||||||
|
|
||||||
|
def test_unicode(self):
|
||||||
|
eq_(["e", "c", "0", "a", "o", "u", "e", "u"], getwords("é ç 0 à ö û è ¤ ù"))
|
||||||
|
eq_(["02", "君のこころは輝いてるかい？", "国木田花丸", "solo", "ver"], getwords("02 君のこころは輝いてるかい？ 国木田花丸 Solo Ver"))
|
||||||
|
|
||||||
def test_splitter_chars(self):
|
def test_splitter_chars(self):
|
||||||
eq_(
|
eq_(
|
||||||
[chr(i) for i in range(ord("a"), ord("z") + 1)],
|
[chr(i) for i in range(ord("a"), ord("z") + 1)],
|
||||||
@ -85,7 +89,7 @@ class TestCasegetwords:
|
|||||||
eq_(["foo", "bar"], getwords("FOO BAR"))
|
eq_(["foo", "bar"], getwords("FOO BAR"))
|
||||||
|
|
||||||
def test_decompose_unicode(self):
|
def test_decompose_unicode(self):
|
||||||
eq_(getwords("foo\xe9bar"), ["fooebar"])
|
eq_(["fooebar"], getwords("foo\xe9bar"))
|
||||||
|
|
||||||
|
|
||||||
class TestCasegetfields:
|
class TestCasegetfields:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user