mirror of
https://github.com/arsenetar/dupeguru.git
synced 2026-01-22 14:41:39 +00:00
Fix stripping (japanese) unicode characters
* Accents are removed from Unicode characters to generate similar "words".
* Non-latin characters which cannot be processed that way (e.g. Japanese, Greek, Russian, etc.) should not be filtered out at all, otherwise files are erroneously skipped or detected as dupes if only some characters make it past the filter.
* Starting from an arbitrary Unicode codepoint (converted to decimal), above which we know it is pointless to try any sort of processing, we leave the characters as is.
* Fix #878.
This commit is contained in:
@@ -69,6 +69,10 @@ class TestCasegetwords:
         eq_(["a", "b", "c", "d"], getwords("a b c d"))
         eq_(["a", "b", "c", "d"], getwords(" a b c d "))
 
+    def test_unicode(self):
+        eq_(["e", "c", "0", "a", "o", "u", "e", "u"], getwords("é ç 0 à ö û è ¤ ù"))
+        eq_(["02", "君のこころは輝いてるかい?", "国木田花丸", "solo", "ver"], getwords("02 君のこころは輝いてるかい? 国木田花丸 Solo Ver"))
+
     def test_splitter_chars(self):
         eq_(
             [chr(i) for i in range(ord("a"), ord("z") + 1)],
@@ -85,7 +89,7 @@ class TestCasegetwords:
         eq_(["foo", "bar"], getwords("FOO BAR"))
 
     def test_decompose_unicode(self):
-        eq_(getwords("foo\xe9bar"), ["fooebar"])
+        eq_(["fooebar"], getwords("foo\xe9bar"))
 
 
 class TestCasegetfields:
Reference in New Issue
Block a user