1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2026-01-22 14:41:39 +00:00

Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt.

This commit is contained in:
Virgil Dupras
2010-08-11 16:39:06 +02:00
parent fb79daad6a
commit 854d194f88
57 changed files with 394 additions and 338 deletions

View File

@@ -6,7 +6,7 @@
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from __future__ import division
import difflib
import itertools
import logging
@@ -25,15 +25,15 @@ NO_FIELD_ORDER) = range(3)
JOB_REFRESH_RATE = 100
def getwords(s):
if isinstance(s, unicode):
if isinstance(s, str):
s = normalize('NFD', s)
s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", ' ').lower()
s = ''.join(c for c in s if c in string.ascii_letters + string.digits + string.whitespace)
return filter(None, s.split(' ')) # filter() is to remove empty elements
return [_f for _f in s.split(' ') if _f] # remove empty elements
def getfields(s):
fields = [getwords(field) for field in s.split(' - ')]
return filter(None, fields)
return [_f for _f in fields if _f]
def unpack_fields(fields):
result = []
@@ -118,7 +118,7 @@ def build_word_dict(objects, j=job.nulljob):
def merge_similar_words(word_dict):
"""Take all keys in word_dict that are similar, and merge them together.
"""
keys = word_dict.keys()
keys = list(word_dict.keys())
keys.sort(key=len)# we want the shortest word to stay
while keys:
key = keys.pop(0)
@@ -138,7 +138,7 @@ def reduce_common_words(word_dict, threshold):
Because if we remove them, we will miss some duplicates!
"""
uncommon_words = set(word for word, objects in word_dict.items() if len(objects) < threshold)
for word, objects in word_dict.items():
for word, objects in list(word_dict.items()):
if len(objects) < threshold:
continue
reduced = set()