1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2026-01-22 14:41:39 +00:00

Use lxml everywhere for xml save/load (instead of ElementTree and minidom).

This commit is contained in:
Virgil Dupras
2010-03-01 12:21:43 +01:00
parent b17ca66f73
commit 47a6ceffbc
7 changed files with 199 additions and 273 deletions

View File

@@ -8,16 +8,14 @@
import logging
import re
from xml.sax import handler, make_parser, SAXException
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl
from lxml import etree
from . import engine
from hsutil.job import nulljob
from hsutil.markable import Markable
from hsutil.misc import flatten, cond, nonone
from hsutil.misc import flatten, nonone
from hsutil.str import format_size
from hsutil.files import open_if_filename
from hsutil.files import FileOrPath
class Results(Markable):
#---Override
@@ -168,42 +166,54 @@ class Results(Markable):
is_markable = _is_markable
def load_from_xml(self, infile, get_file, j=nulljob):
def do_match(ref_file, other_files, group):
if not other_files:
return
for other_file in other_files:
group.add_match(engine.get_match(ref_file, other_file))
do_match(other_files[0], other_files[1:], group)
self.apply_filter(None)
handler = _ResultsHandler(get_file)
try:
parser = make_parser()
except Exception as e:
# This special handling is to try to figure out the cause of #47
# We don't silently return, because we want the user to send error report.
logging.exception(e)
try:
import xml.parsers.expat
logging.warning('importing xml.parsers.expat went ok, WTF?')
except Exception as e:
# This log should give a little more details about the cause of this all
logging.exception(e)
raise
raise
parser.setContentHandler(handler)
try:
infile, must_close = open_if_filename(infile)
except IOError:
root = etree.parse(infile).getroot()
except Exception:
return
BUFSIZE = 1024 * 1024 # 1mb buffer
infile.seek(0, 2)
j.start_job(infile.tell() // BUFSIZE)
infile.seek(0, 0)
try:
while True:
data = infile.read(BUFSIZE)
if not data:
break
parser.feed(data)
j.add_progress()
except SAXException:
return
self.groups = handler.groups
for dupe_file in handler.marked:
group_elems = list(root.iterchildren('group'))
groups = []
marked = set()
for group_elem in j.iter_with_progress(group_elems, every=100):
group = engine.Group()
dupes = []
for file_elem in group_elem.iterchildren('file'):
path = file_elem.get('path')
words = file_elem.get('words', '')
if not path:
continue
file = get_file(path)
if file is None:
continue
file.words = words.split(',')
file.is_ref = file_elem.get('is_ref') == 'y'
dupes.append(file)
if file_elem.get('marked') == 'y':
marked.add(file)
for match_elem in group_elem.iterchildren('match'):
try:
attrs = match_elem.attrib
first_file = dupes[int(attrs['first'])]
second_file = dupes[int(attrs['second'])]
percentage = int(attrs['percentage'])
group.add_match(engine.Match(first_file, second_file, percentage))
except (IndexError, KeyError, ValueError): # Covers missing attr, non-int values and indexes out of bounds
pass
if (not group.matches) and (len(dupes) >= 2):
do_match(dupes[0], dupes[1:], group)
group.prioritize(lambda x: dupes.index(x))
if len(group):
groups.append(group)
j.add_progress()
self.groups = groups
for dupe_file in marked:
self.mark(dupe_file)
def make_ref(self, dupe):
@@ -256,13 +266,10 @@ class Results(Markable):
def save_to_xml(self, outfile):
self.apply_filter(None)
outfile, must_close = open_if_filename(outfile, 'wb')
writer = XMLGenerator(outfile, 'utf-8')
writer.startDocument()
empty_attrs = AttributesImpl({})
writer.startElement('results', empty_attrs)
root = etree.Element('results')
# writer = XMLGenerator(outfile, 'utf-8')
for g in self.groups:
writer.startElement('group', empty_attrs)
group_elem = etree.SubElement(root, 'group')
dupe2index = {}
for index, d in enumerate(g):
dupe2index[d] = index
@@ -270,27 +277,19 @@ class Results(Markable):
words = engine.unpack_fields(d.words)
except AttributeError:
words = ()
attrs = AttributesImpl({
'path': unicode(d.path),
'is_ref': cond(d.is_ref, 'y', 'n'),
'words': ','.join(words),
'marked': cond(self.is_marked(d), 'y', 'n')
})
writer.startElement('file', attrs)
writer.endElement('file')
file_elem = etree.SubElement(group_elem, 'file')
file_elem.set('path', unicode(d.path))
file_elem.set('is_ref', ('y' if d.is_ref else 'n'))
file_elem.set('words', ','.join(words))
file_elem.set('marked', ('y' if self.is_marked(d) else 'n'))
for match in g.matches:
attrs = AttributesImpl({
'first': str(dupe2index[match.first]),
'second': str(dupe2index[match.second]),
'percentage': str(int(match.percentage)),
})
writer.startElement('match', attrs)
writer.endElement('match')
writer.endElement('group')
writer.endElement('results')
writer.endDocument()
if must_close:
outfile.close()
match_elem = etree.SubElement(group_elem, 'match')
match_elem.set('first', unicode(dupe2index[match.first]))
match_elem.set('second', unicode(dupe2index[match.second]))
match_elem.set('percentage', unicode(int(match.percentage)))
tree = etree.ElementTree(root)
with FileOrPath(outfile, 'wb') as fp:
tree.write(fp, encoding='utf-8')
def sort_dupes(self, key, asc=True, delta=False):
if not self.__dupes:
@@ -310,60 +309,3 @@ class Results(Markable):
dupes = property(__get_dupe_list)
groups = property(__get_groups, __set_groups)
stat_line = property(__get_stat_line)
class _ResultsHandler(handler.ContentHandler):
def __init__(self, get_file):
self.group = None
self.dupes = None
self.marked = set()
self.groups = []
self.get_file = get_file
def startElement(self, name, attrs):
if name == 'group':
self.group = engine.Group()
self.dupes = []
return
if (name == 'file') and (self.group is not None):
if not (('path' in attrs) and ('words' in attrs)):
return
path = attrs['path']
file = self.get_file(path)
if file is None:
return
file.words = attrs['words'].split(',')
file.is_ref = attrs.get('is_ref') == 'y'
self.dupes.append(file)
if attrs.get('marked') == 'y':
self.marked.add(file)
if (name == 'match') and (self.group is not None):
try:
first_file = self.dupes[int(attrs['first'])]
second_file = self.dupes[int(attrs['second'])]
percentage = int(attrs['percentage'])
self.group.add_match(engine.Match(first_file, second_file, percentage))
except (IndexError, KeyError, ValueError): # Covers missing attr, non-int values and indexes out of bounds
pass
def endElement(self, name):
def do_match(ref_file, other_files, group):
if not other_files:
return
for other_file in other_files:
group.add_match(engine.get_match(ref_file, other_file))
do_match(other_files[0], other_files[1:], group)
if name == 'group':
group = self.group
self.group = None
dupes = self.dupes
self.dupes = []
if group is None:
return
if len(dupes) < 2:
return
if not group.matches: # <match> elements not present, do it manually, without %
do_match(dupes[0], dupes[1:], group)
group.prioritize(lambda x: dupes.index(x))
self.groups.append(group)