Merge pull request #879 from glubsy/fix_unicode

Fix stripping (japanese) unicode characters
Merge pull request #898 from AlttiRi/master
2026-04-29 15:11:38 +00:00 · 2021-05-25 19:11:19 -05:00 · 2021-05-25 19:10:31 -05:00 · 2021-05-22 02:52:41 +03:00 · 2021-04-29 05:29:35 +02:00 · 2021-04-29 05:15:34 +02:00
3 changed files with 18 additions and 3 deletions
--- a/core/engine.py
+++ b/core/engine.py
@@ -26,8 +26,19 @@ def getwords(s):
    # We decompose the string so that ascii letters with accents can be part of the word.
    s = normalize("NFD", s)
    s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", " ").lower()
+    # logging.debug(f"DEBUG chars for: {s}\n"
+    #               f"{[c for c in s if ord(c) != 32]}\n"
+    #               f"{[ord(c) for c in s if ord(c) != 32]}")
+    # HACK We shouldn't ignore non-ascii characters altogether. Any Unicode char
+    # above common european characters that cannot be "sanitized" (ie. stripped
+    # of their accents, etc.) are preserved as is. The arbitrary limit is
+    # obtained from this one: ord("\u037e") GREEK QUESTION MARK
    s = "".join(
-        c for c in s if c in string.ascii_letters + string.digits + string.whitespace
+        c for c in s
+        if (ord(c) <= 894
+            and c in string.ascii_letters + string.digits + string.whitespace
+            )
+        or ord(c) > 894
    )
    return [_f for _f in s.split(" ") if _f]  # remove empty elements

--- a/core/tests/engine_test.py
+++ b/core/tests/engine_test.py
@@ -69,6 +69,10 @@ class TestCasegetwords:
        eq_(["a", "b", "c", "d"], getwords("a b c d"))
        eq_(["a", "b", "c", "d"], getwords(" a  b  c d "))

+    def test_unicode(self):
+        eq_(["e", "c", "0", "a", "o", "u", "e", "u"], getwords("é ç 0 à ö û è ¤ ù"))
+        eq_(["02", "君のこころは輝いてるかい？", "国木田花丸", "solo", "ver"], getwords("02 君のこころは輝いてるかい？ 国木田花丸 Solo Ver"))
+
    def test_splitter_chars(self):
        eq_(
            [chr(i) for i in range(ord("a"), ord("z") + 1)],
@@ -85,7 +89,7 @@ class TestCasegetwords:
        eq_(["foo", "bar"], getwords("FOO BAR"))

    def test_decompose_unicode(self):
-        eq_(getwords("foo\xe9bar"), ["fooebar"])
+        eq_(["fooebar"], getwords("foo\xe9bar"))


 class TestCasegetfields:
--- a/qt/preferences.py
+++ b/qt/preferences.py
@@ -102,7 +102,7 @@ class Preferences(PreferencesBase):
        self.details_dialog_override_theme_icons = False if not ISLINUX else True
        self.details_dialog_viewers_show_scrollbars = True
        self.result_table_ref_foreground_color = QColor(Qt.blue)
-        self.result_table_ref_background_color = QColor(Qt.darkGray)
+        self.result_table_ref_background_color = QColor(Qt.lightGray)
        self.result_table_delta_foreground_color = QColor(255, 142, 40)  # orange
        self.resultWindowIsMaximized = False
        self.resultWindowRect = None
Author	SHA1	Message	Date
Andrew Senetar	0b46ca2222	Merge pull request #879 from glubsy/fix_unicode Fix stripping (japanese) unicode characters	2021-05-25 19:11:19 -05:00
Andrew Senetar	72e0f76242	Merge pull request #898 from AlttiRi/master Change reference background color #894	2021-05-25 19:10:31 -05:00
[Alt'tiRi]	65c1d463f8	Change reference background color #894	2021-05-22 02:52:41 +03:00
glubsy	f1ae478433	Fix including character at the border	2021-04-29 05:29:35 +02:00
glubsy	c4dcfd3d4b	Fix stripping (japanese) unicode characters * Accents are getting removed from Unicode characters to generate similar "words". * Non-latin characters which cannot be processed that way (eg. japanese, greek, russian, etc.) should not be filtered out at all otherwise files are erroneously skipped or detected as dupes if only some characters make it past the filter. * Starting from an arbitrary unicode codepoint (converted to decimal), above which we know it is pointless to try any sort of processing, we leave the characters as is. * Fix #878.	2021-04-29 05:15:34 +02:00