Use lxml everywhere for XML save/load (instead of ElementTree and minidom).

2025-07-14 18:33:19 +00:00 · 2010-03-01 12:21:43 +01:00 · 2010-03-01 12:21:43 +01:00 · 47a6ceffbc
commit 47a6ceffbc
parent b17ca66f73
7 changed files with 199 additions and 273 deletions
--- a/core/directories.py
+++ b/core/directories.py
@ -6,7 +6,7 @@
 # which should be included with this package. The terms are also available at 
 # http://www.hardcoded.net/licenses/hs_license
-import xml.dom.minidom
+from lxml import etree
 from hsutil import io
 from hsutil.files import FileOrPath
@ -126,38 +126,38 @@ class Directories(object):
    def load_from_file(self, infile):
        try:
-            doc = xml.dom.minidom.parse(infile)
+            root = etree.parse(infile).getroot()
        except:
            return
-        root_path_nodes = doc.getElementsByTagName('root_directory')
+        for rdn in root.iterchildren('root_directory'):
-        for rdn in root_path_nodes:
+            attrib = rdn.attrib
-            if not rdn.getAttributeNode('path'):
+            if 'path' not in attrib:
                continue
-            path = rdn.getAttributeNode('path').nodeValue
+            path = attrib['path']
            try:
                self.add_path(Path(path))
            except (AlreadyThereError, InvalidPathError):
                pass
-        state_nodes = doc.getElementsByTagName('state')
+        for sn in root.iterchildren('state'):
-        for sn in state_nodes:
+            attrib = sn.attrib
-            if not (sn.getAttributeNode('path') and sn.getAttributeNode('value')):
+            if not ('path' in attrib and 'value' in attrib):
                continue
-            path = sn.getAttributeNode('path').nodeValue
+            path = attrib['path']
-            state = sn.getAttributeNode('value').nodeValue
+            state = attrib['value']
            self.set_state(Path(path), int(state))
-    def save_to_file(self,outfile):
+    def save_to_file(self, outfile):
        with FileOrPath(outfile, 'wb') as fp:
-            doc = xml.dom.minidom.Document()
+            root = etree.Element('directories')
            root = doc.appendChild(doc.createElement('directories'))
            for root_path in self:
-                root_path_node = root.appendChild(doc.createElement('root_directory'))
+                root_path_node = etree.SubElement(root, 'root_directory')
-                root_path_node.setAttribute('path', unicode(root_path).encode('utf-8'))
+                root_path_node.set('path', unicode(root_path))
            for path, state in self.states.iteritems():
-                state_node = root.appendChild(doc.createElement('state'))
+                state_node = etree.SubElement(root, 'state')
-                state_node.setAttribute('path', unicode(path).encode('utf-8'))
+                state_node.set('path', unicode(path))
-                state_node.setAttribute('value', str(state))
+                state_node.set('value', unicode(state))
-            doc.writexml(fp, '\t', '\t', '\n', encoding='utf-8')
+            tree = etree.ElementTree(root)
            tree.write(fp, encoding='utf-8')
    def set_state(self, path, state):
        if self.get_state(path) == state:
--- a/core/ignore.py
+++ b/core/ignore.py
@ -6,9 +6,9 @@
 # which should be included with this package. The terms are also available at 
 # http://www.hardcoded.net/licenses/hs_license
-from hsutil.files import FileOrPath
+from lxml import etree
-import xml.dom.minidom
+from hsutil.files import FileOrPath
 class IgnoreList(object):
    """An ignore list implementation that is iterable, filterable and exportable to XML.
@ -71,45 +71,38 @@ class IgnoreList(object):
                self._ignored[first] = matches
        self._count += 1
-    def load_from_xml(self,infile):
+    def load_from_xml(self, infile):
        """Loads the ignore list from a XML created with save_to_xml.
        infile can be a file object or a filename.
        """
        try:
-            doc = xml.dom.minidom.parse(infile)
+            root = etree.parse(infile).getroot()
        except Exception:
            return
-        file_nodes = doc.getElementsByTagName('file')
+        for fn in root.iterchildren('file'):
-        for fn in file_nodes:
+            file_path = fn.get('path')
-            if not fn.getAttributeNode('path'):
+            if not file_path:
                continue
-            file_path = fn.getAttributeNode('path').nodeValue
+            for sfn in fn.iterchildren('file'):
-            subfile_nodes = fn.getElementsByTagName('file')
+                subfile_path = sfn.get('path')
-            for sfn in subfile_nodes:
+                if subfile_path:
-                if not sfn.getAttributeNode('path'):
+                    self.Ignore(file_path, subfile_path)
                    continue
                subfile_path = sfn.getAttributeNode('path').nodeValue
                self.Ignore(file_path,subfile_path)
-    def save_to_xml(self,outfile):
+    def save_to_xml(self, outfile):
        """Create a XML file that can be used by load_from_xml.
        outfile can be a file object or a filename.
        """
-        doc = xml.dom.minidom.Document()
+        root = etree.Element('ignore_list')
-        root = doc.appendChild(doc.createElement('ignore_list'))
+        for filename, subfiles in self._ignored.items():
-        for file,subfiles in self._ignored.items():
+            file_node = etree.SubElement(root, 'file')
-            file_node = root.appendChild(doc.createElement('file'))
+            file_node.set('path', filename)
-            if isinstance(file,unicode):
+            for subfilename in subfiles:
-                file = file.encode('utf-8')
+                subfile_node = etree.SubElement(file_node, 'file')
-            file_node.setAttribute('path',file)
+                subfile_node.set('path', subfilename)
-            for subfile in subfiles:
+        tree = etree.ElementTree(root)
                subfile_node = file_node.appendChild(doc.createElement('file'))
                if isinstance(subfile,unicode):
                    subfile = subfile.encode('utf-8')
                subfile_node.setAttribute('path',subfile)
        with FileOrPath(outfile, 'wb') as fp:
-            doc.writexml(fp,'\t','\t','\n',encoding='utf-8')
+            tree.write(fp, encoding='utf-8')
--- a/core/results.py
+++ b/core/results.py
@ -8,16 +8,14 @@
 import logging
 import re
-from xml.sax import handler, make_parser, SAXException
+from lxml import etree
 from xml.sax.saxutils import XMLGenerator
 from xml.sax.xmlreader import AttributesImpl
 from . import engine
 from hsutil.job import nulljob
 from hsutil.markable import Markable
-from hsutil.misc import flatten, cond, nonone
+from hsutil.misc import flatten, nonone
 from hsutil.str import format_size
-from hsutil.files import open_if_filename
+from hsutil.files import FileOrPath
 class Results(Markable):
    #---Override
@ -168,42 +166,54 @@ class Results(Markable):
    is_markable = _is_markable
    def load_from_xml(self, infile, get_file, j=nulljob):
        def do_match(ref_file, other_files, group):
            if not other_files:
                return
            for other_file in other_files:
                group.add_match(engine.get_match(ref_file, other_file))
            do_match(other_files[0], other_files[1:], group)
        self.apply_filter(None)
        handler = _ResultsHandler(get_file)
        try:
-            parser = make_parser()
+            root = etree.parse(infile).getroot()
-        except Exception as e:
+        except Exception:
            # This special handling is to try to figure out the cause of #47
            # We don't silently return, because we want the user to send error report.
            logging.exception(e)
            try:
                import xml.parsers.expat
                logging.warning('importing xml.parsers.expat went ok, WTF?')
            except Exception as e:
                # This log should give a little more details about the cause of this all
                logging.exception(e)
                raise
            raise
        parser.setContentHandler(handler)
        try:
            infile, must_close = open_if_filename(infile)
        except IOError:
            return
-        BUFSIZE = 1024 * 1024 # 1mb buffer
+        group_elems = list(root.iterchildren('group'))
-        infile.seek(0, 2)
+        groups = []
-        j.start_job(infile.tell() // BUFSIZE)
+        marked = set()
-        infile.seek(0, 0)
+        for group_elem in j.iter_with_progress(group_elems, every=100):
            group = engine.Group()
            dupes = []
            for file_elem in group_elem.iterchildren('file'):
                path = file_elem.get('path')
                words = file_elem.get('words', '')
                if not path:
                    continue
                file = get_file(path)
                if file is None:
                    continue
                file.words = words.split(',')
                file.is_ref = file_elem.get('is_ref') == 'y'
                dupes.append(file)
                if file_elem.get('marked') == 'y':
                    marked.add(file)
            for match_elem in group_elem.iterchildren('match'):
                try:
-            while True:
+                    attrs = match_elem.attrib
-                data = infile.read(BUFSIZE)
+                    first_file = dupes[int(attrs['first'])]
-                if not data:
+                    second_file = dupes[int(attrs['second'])]
-                    break
+                    percentage = int(attrs['percentage'])
-                parser.feed(data)
+                    group.add_match(engine.Match(first_file, second_file, percentage))
                except (IndexError, KeyError, ValueError): # Covers missing attr, non-int values and indexes out of bounds
                    pass
            if (not group.matches) and (len(dupes) >= 2):
                do_match(dupes[0], dupes[1:], group)
            group.prioritize(lambda x: dupes.index(x))
            if len(group):
                groups.append(group)    
            j.add_progress()
-        except SAXException:
+        self.groups = groups
-            return
+        for dupe_file in marked:
        self.groups = handler.groups
        for dupe_file in handler.marked:
            self.mark(dupe_file)
    def make_ref(self, dupe):
@ -256,13 +266,10 @@ class Results(Markable):
    def save_to_xml(self, outfile):
        self.apply_filter(None)
-        outfile, must_close = open_if_filename(outfile, 'wb')
+        root = etree.Element('results')
-        writer = XMLGenerator(outfile, 'utf-8')
+        # writer = XMLGenerator(outfile, 'utf-8')
        writer.startDocument()
        empty_attrs = AttributesImpl({})
        writer.startElement('results', empty_attrs)
        for g in self.groups:
-            writer.startElement('group', empty_attrs)
+            group_elem = etree.SubElement(root, 'group')
            dupe2index = {}
            for index, d in enumerate(g):
                dupe2index[d] = index
@ -270,27 +277,19 @@ class Results(Markable):
                    words = engine.unpack_fields(d.words)
                except AttributeError:
                    words = ()
-                attrs = AttributesImpl({
+                file_elem = etree.SubElement(group_elem, 'file')
-                    'path': unicode(d.path),
+                file_elem.set('path', unicode(d.path))
-                    'is_ref': cond(d.is_ref, 'y', 'n'),
+                file_elem.set('is_ref', ('y' if d.is_ref else 'n'))
-                    'words': ','.join(words),
+                file_elem.set('words', ','.join(words))
-                    'marked': cond(self.is_marked(d), 'y', 'n')
+                file_elem.set('marked', ('y' if self.is_marked(d) else 'n'))
                })
                writer.startElement('file', attrs)
                writer.endElement('file')
            for match in g.matches:
-                attrs = AttributesImpl({
+                match_elem = etree.SubElement(group_elem, 'match')
-                    'first': str(dupe2index[match.first]),
+                match_elem.set('first', unicode(dupe2index[match.first]))
-                    'second': str(dupe2index[match.second]),
+                match_elem.set('second', unicode(dupe2index[match.second]))
-                    'percentage': str(int(match.percentage)),
+                match_elem.set('percentage', unicode(int(match.percentage)))
-                })
+        tree = etree.ElementTree(root)
-                writer.startElement('match', attrs)
+        with FileOrPath(outfile, 'wb') as fp:
-                writer.endElement('match')
+            tree.write(fp, encoding='utf-8')
            writer.endElement('group')
        writer.endElement('results')
        writer.endDocument()
        if must_close:
            outfile.close()
    def sort_dupes(self, key, asc=True, delta=False):
        if not self.__dupes:
@ -310,60 +309,3 @@ class Results(Markable):
    dupes     = property(__get_dupe_list)
    groups    = property(__get_groups, __set_groups)
    stat_line = property(__get_stat_line)
 class _ResultsHandler(handler.ContentHandler):
    def __init__(self, get_file):
        self.group = None
        self.dupes = None
        self.marked = set()
        self.groups = []
        self.get_file = get_file
    def startElement(self, name, attrs):
        if name == 'group':
            self.group = engine.Group()
            self.dupes = []
            return
        if (name == 'file') and (self.group is not None):
            if not (('path' in attrs) and ('words' in attrs)):
                return
            path = attrs['path']
            file = self.get_file(path)
            if file is None:
                return
            file.words = attrs['words'].split(',')
            file.is_ref = attrs.get('is_ref') == 'y'
            self.dupes.append(file)
            if attrs.get('marked') == 'y':
                self.marked.add(file)
        if (name == 'match') and (self.group is not None):
            try:
                first_file = self.dupes[int(attrs['first'])]
                second_file = self.dupes[int(attrs['second'])]
                percentage = int(attrs['percentage'])
                self.group.add_match(engine.Match(first_file, second_file, percentage))
            except (IndexError, KeyError, ValueError): # Covers missing attr, non-int values and indexes out of bounds
                pass
    def endElement(self, name):
        def do_match(ref_file, other_files, group):
            if not other_files:
                return
            for other_file in other_files:
                group.add_match(engine.get_match(ref_file, other_file))
            do_match(other_files[0], other_files[1:], group)
        if name == 'group':
            group = self.group
            self.group = None
            dupes = self.dupes
            self.dupes = []
            if group is None:
                return
            if len(dupes) < 2:
                return
            if not group.matches: # <match> elements not present, do it manually, without %
                do_match(dupes[0], dupes[1:], group)
            group.prioritize(lambda x: dupes.index(x))
            self.groups.append(group)
--- a/core/tests/app_test.py
+++ b/core/tests/app_test.py
@ -248,7 +248,7 @@ class TCDupeGuruWithResults(TestCase):
        self.rtree.selected_paths = paths
        self.app.remove_selected()
        # The first 2 dupes have been removed. The 3rd one is a ref. it stays there, in first pos.
-        eq_(self.rtree.selected_paths, [[0]]) # no exception
+        eq_(self.rtree.selected_paths, [[0, 0]]) # no exception
    def test_selectResultNodePaths(self):
        app = self.app
@ -366,10 +366,7 @@ class TCDupeGuruWithResults(TestCase):
        app = self.app
        self.rtree.selected_paths = [[0, 0], [1, 0]]
        app.remove_selected()
-        eq_(len(app.results.dupes), 1)
+        eq_(len(app.results.dupes), 1) # the first path is now selected
        app.remove_selected()
        eq_(len(app.results.dupes), 1)
        self.rtree.selected_path = [0, 0]
        app.remove_selected()
        eq_(len(app.results.dupes), 0)
--- a/core/tests/engine_test.py
+++ b/core/tests/engine_test.py
@ -229,10 +229,9 @@ class TCbuild_word_dict(TestCase):
        self.log = []
        s = "foo bar"
        build_word_dict([NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j)
        # We don't have intermediate log because iter_with_progress is called with every > 1
        self.assertEqual(0,self.log[0])
-        self.assertEqual(33,self.log[1])
+        self.assertEqual(100,self.log[1])
        self.assertEqual(66,self.log[2])
        self.assertEqual(100,self.log[3])
 class TCmerge_similar_words(TestCase):
--- a/core/tests/ignore_test.py
+++ b/core/tests/ignore_test.py
@ -7,7 +7,7 @@
 # http://www.hardcoded.net/licenses/hs_license
 import cStringIO
-import xml.dom.minidom
+from lxml import etree
 from nose.tools import eq_
@ -62,26 +62,25 @@ def test_save_to_xml():
    f = cStringIO.StringIO()
    il.save_to_xml(f)
    f.seek(0)
-    doc = xml.dom.minidom.parse(f)
+    doc = etree.parse(f)
-    root = doc.documentElement
+    root = doc.getroot()
-    eq_('ignore_list',root.nodeName)
+    eq_(root.tag, 'ignore_list')
-    children = [c for c in root.childNodes if c.localName]
+    eq_(len(root), 2)
-    eq_(2,len(children))
+    eq_(len([c for c in root if c.tag == 'file']), 2)
-    eq_(2,len([c for c in children if c.nodeName == 'file']))
+    f1, f2 = root[:]
-    f1,f2 = children
+    subchildren = [c for c in f1 if c.tag == 'file'] + [c for c in f2 if c.tag == 'file']
-    subchildren = [c for c in f1.childNodes if c.localName == 'file'] +\
+    eq_(len(subchildren), 3)
        [c for c in f2.childNodes if c.localName == 'file']
    eq_(3,len(subchildren))
 def test_SaveThenLoad():
    il = IgnoreList()
-    il.Ignore('foo','bar')
+    il.Ignore('foo', 'bar')
-    il.Ignore('foo','bleh')
+    il.Ignore('foo', 'bleh')
-    il.Ignore('bleh','bar')
+    il.Ignore('bleh', 'bar')
-    il.Ignore(u'\u00e9','bar')
+    il.Ignore(u'\u00e9', 'bar')
    f = cStringIO.StringIO()
    il.save_to_xml(f)
    f.seek(0)
    f.seek(0)
    il = IgnoreList()
    il.load_from_xml(f)
    eq_(4,len(il))
@ -129,9 +128,9 @@ def test_filter():
    assert not il.AreIgnored('foo','bar')
    assert il.AreIgnored('bar','baz')
-def test_save_with_non_ascii_non_unicode_items():
+def test_save_with_non_ascii_items():
    il = IgnoreList()
-    il.Ignore('\xac','\xbf')
+    il.Ignore(u'\xac', u'\xbf')
    f = cStringIO.StringIO()
    try:
        il.save_to_xml(f)
--- a/core/tests/results_test.py
+++ b/core/tests/results_test.py
@ -7,10 +7,9 @@
 # which should be included with this package. The terms are also available at 
 # http://www.hardcoded.net/licenses/hs_license
 import unittest
 import StringIO
 import xml.dom.minidom
 import os.path as op
 from lxml import etree
 from hsutil.path import Path
 from hsutil.testcase import TestCase
@ -18,7 +17,7 @@ from hsutil.misc import first
 from . import engine_test, data
 from .. import engine
-from ..results import *
+from ..results import Results
 class NamedObject(engine_test.NamedObject):
    path = property(lambda x:Path('basepath') + x.name)
@ -65,9 +64,9 @@ class TCResultsEmpty(TestCase):
        f = StringIO.StringIO()
        self.results.save_to_xml(f)
        f.seek(0)
-        doc = xml.dom.minidom.parse(f)
+        doc = etree.parse(f)
-        root = doc.documentElement
+        root = doc.getroot()
-        self.assertEqual('results',root.nodeName)
+        self.assertEqual('results', root.tag)
 class TCResultsWithSomeGroups(TestCase):
@ -321,16 +320,16 @@ class TCResultsMarkings(TestCase):
        f = StringIO.StringIO()
        self.results.save_to_xml(f)
        f.seek(0)
-        doc = xml.dom.minidom.parse(f)
+        doc = etree.parse(f)
-        root = doc.documentElement
+        root = doc.getroot()
-        g1,g2 = root.getElementsByTagName('group')
+        g1, g2 = root.iterchildren('group')
-        d1,d2,d3 = g1.getElementsByTagName('file')
+        d1, d2, d3 = g1.iterchildren('file')
-        self.assertEqual('n',d1.getAttributeNode('marked').nodeValue)
+        self.assertEqual('n', d1.get('marked'))
-        self.assertEqual('n',d2.getAttributeNode('marked').nodeValue)
+        self.assertEqual('n', d2.get('marked'))
-        self.assertEqual('y',d3.getAttributeNode('marked').nodeValue)
+        self.assertEqual('y', d3.get('marked'))
-        d1,d2 = g2.getElementsByTagName('file')
+        d1, d2 = g2.iterchildren('file')
-        self.assertEqual('n',d1.getAttributeNode('marked').nodeValue)
+        self.assertEqual('n', d1.get('marked'))
-        self.assertEqual('y',d2.getAttributeNode('marked').nodeValue)
+        self.assertEqual('y', d2.get('marked'))
    def test_LoadXML(self):
        def get_file(path):
@ -366,38 +365,35 @@ class TCResultsXML(TestCase):
        f = StringIO.StringIO()
        self.results.save_to_xml(f)
        f.seek(0)
-        doc = xml.dom.minidom.parse(f)
+        doc = etree.parse(f)
-        root = doc.documentElement
+        root = doc.getroot()
-        self.assertEqual('results',root.nodeName)
+        self.assertEqual('results', root.tag)
-        children = [c for c in root.childNodes if c.localName]
+        self.assertEqual(2, len(root))
-        self.assertEqual(2,len(children))
+        self.assertEqual(2, len([c for c in root if c.tag == 'group']))
-        self.assertEqual(2,len([c for c in children if c.nodeName == 'group']))
+        g1, g2 = root
-        g1,g2 = children
+        self.assertEqual(6,len(g1))
-        children = [c for c in g1.childNodes if c.localName]
+        self.assertEqual(3,len([c for c in g1 if c.tag == 'file']))
-        self.assertEqual(6,len(children))
+        self.assertEqual(3,len([c for c in g1 if c.tag == 'match']))
-        self.assertEqual(3,len([c for c in children if c.nodeName == 'file']))
+        d1, d2, d3 = [c for c in g1 if c.tag == 'file']
-        self.assertEqual(3,len([c for c in children if c.nodeName == 'match']))
+        self.assertEqual(op.join('basepath','foo bar'),d1.get('path'))
-        d1,d2,d3 = [c for c in children if c.nodeName == 'file']
+        self.assertEqual(op.join('basepath','bar bleh'),d2.get('path'))
-        self.assertEqual(op.join('basepath','foo bar'),d1.getAttributeNode('path').nodeValue)
+        self.assertEqual(op.join('basepath','foo bleh'),d3.get('path'))
-        self.assertEqual(op.join('basepath','bar bleh'),d2.getAttributeNode('path').nodeValue)
+        self.assertEqual('y',d1.get('is_ref'))
-        self.assertEqual(op.join('basepath','foo bleh'),d3.getAttributeNode('path').nodeValue)
+        self.assertEqual('n',d2.get('is_ref'))
-        self.assertEqual('y',d1.getAttributeNode('is_ref').nodeValue)
+        self.assertEqual('n',d3.get('is_ref'))
-        self.assertEqual('n',d2.getAttributeNode('is_ref').nodeValue)
+        self.assertEqual('foo,bar',d1.get('words'))
-        self.assertEqual('n',d3.getAttributeNode('is_ref').nodeValue)
+        self.assertEqual('bar,bleh',d2.get('words'))
-        self.assertEqual('foo,bar',d1.getAttributeNode('words').nodeValue)
+        self.assertEqual('foo,bleh',d3.get('words'))
-        self.assertEqual('bar,bleh',d2.getAttributeNode('words').nodeValue)
+        self.assertEqual(3,len(g2))
-        self.assertEqual('foo,bleh',d3.getAttributeNode('words').nodeValue)
+        self.assertEqual(2,len([c for c in g2 if c.tag == 'file']))
-        children = [c for c in g2.childNodes if c.localName]
+        self.assertEqual(1,len([c for c in g2 if c.tag == 'match']))
-        self.assertEqual(3,len(children))
+        d1, d2 = [c for c in g2 if c.tag == 'file']
-        self.assertEqual(2,len([c for c in children if c.nodeName == 'file']))
+        self.assertEqual(op.join('basepath','ibabtu'),d1.get('path'))
-        self.assertEqual(1,len([c for c in children if c.nodeName == 'match']))
+        self.assertEqual(op.join('basepath','ibabtu'),d2.get('path'))
-        d1,d2 = [c for c in children if c.nodeName == 'file']
+        self.assertEqual('n',d1.get('is_ref'))
-        self.assertEqual(op.join('basepath','ibabtu'),d1.getAttributeNode('path').nodeValue)
+        self.assertEqual('n',d2.get('is_ref'))
-        self.assertEqual(op.join('basepath','ibabtu'),d2.getAttributeNode('path').nodeValue)
+        self.assertEqual('ibabtu',d1.get('words'))
-        self.assertEqual('n',d1.getAttributeNode('is_ref').nodeValue)
+        self.assertEqual('ibabtu',d2.get('words'))
        self.assertEqual('n',d2.getAttributeNode('is_ref').nodeValue)
        self.assertEqual('ibabtu',d1.getAttributeNode('words').nodeValue)
        self.assertEqual('ibabtu',d2.getAttributeNode('words').nodeValue)
    def test_LoadXML(self):
        def get_file(path):
@ -460,41 +456,41 @@ class TCResultsXML(TestCase):
        def get_file(path):
            return [f for f in self.objects if str(f.path) == path][0]
-        doc = xml.dom.minidom.Document()
+        root = etree.Element('foobar') #The root element shouldn't matter, really.
-        root = doc.appendChild(doc.createElement('foobar')) #The root element shouldn't matter, really.
+        group_node = etree.SubElement(root, 'group')
-        group_node = root.appendChild(doc.createElement('group'))
+        dupe_node = etree.SubElement(group_node, 'file') #Perfectly correct file
-        dupe_node = group_node.appendChild(doc.createElement('file')) #Perfectly correct file
+        dupe_node.set('path', op.join('basepath','foo bar'))
-        dupe_node.setAttribute('path',op.join('basepath','foo bar'))
+        dupe_node.set('is_ref', 'y')
-        dupe_node.setAttribute('is_ref','y')
+        dupe_node.set('words', 'foo,bar')
-        dupe_node.setAttribute('words','foo,bar')
+        dupe_node = etree.SubElement(group_node, 'file') #is_ref missing, default to 'n'
-        dupe_node = group_node.appendChild(doc.createElement('file')) #is_ref missing, default to 'n'
+        dupe_node.set('path',op.join('basepath','foo bleh'))
-        dupe_node.setAttribute('path',op.join('basepath','foo bleh'))
+        dupe_node.set('words','foo,bleh')
-        dupe_node.setAttribute('words','foo,bleh')
+        dupe_node = etree.SubElement(group_node, 'file') #words are missing, valid.
-        dupe_node = group_node.appendChild(doc.createElement('file')) #words are missing, invalid.
+        dupe_node.set('path',op.join('basepath','bar bleh'))
-        dupe_node.setAttribute('path',op.join('basepath','bar bleh'))
+        dupe_node = etree.SubElement(group_node, 'file') #path is missing, invalid.
-        dupe_node = group_node.appendChild(doc.createElement('file')) #path is missing, invalid.
+        dupe_node.set('words','foo,bleh')
-        dupe_node.setAttribute('words','foo,bleh')
+        dupe_node = etree.SubElement(group_node, 'foobar') #Invalid element name
-        dupe_node = group_node.appendChild(doc.createElement('foobar')) #Invalid element name
+        dupe_node.set('path',op.join('basepath','bar bleh'))
-        dupe_node.setAttribute('path',op.join('basepath','bar bleh'))
+        dupe_node.set('is_ref','y')
-        dupe_node.setAttribute('is_ref','y')
+        dupe_node.set('words','bar,bleh')
-        dupe_node.setAttribute('words','bar,bleh')
+        match_node = etree.SubElement(group_node, 'match') # match pointing to a bad index
-        match_node = group_node.appendChild(doc.createElement('match')) # match pointing to a bad index
+        match_node.set('first', '42')
-        match_node.setAttribute('first', '42')
+        match_node.set('second', '45')
-        match_node.setAttribute('second', '45')
+        match_node = etree.SubElement(group_node, 'match') # match with missing attrs
-        match_node = group_node.appendChild(doc.createElement('match')) # match with missing attrs
+        match_node = etree.SubElement(group_node, 'match') # match with non-int values
-        match_node = group_node.appendChild(doc.createElement('match')) # match with non-int values
+        match_node.set('first', 'foo')
-        match_node.setAttribute('first', 'foo')
+        match_node.set('second', 'bar')
-        match_node.setAttribute('second', 'bar')
+        match_node.set('percentage', 'baz')
-        match_node.setAttribute('percentage', 'baz')
+        group_node = etree.SubElement(root, 'foobar') #invalid group
-        group_node = root.appendChild(doc.createElement('foobar')) #invalid group
+        group_node = etree.SubElement(root, 'group') #empty group
        group_node = root.appendChild(doc.createElement('group')) #empty group
        f = StringIO.StringIO()
-        doc.writexml(f,'\t','\t','\n',encoding='utf-8')
+        tree = etree.ElementTree(root)
        tree.write(f, encoding='utf-8')
        f.seek(0)
        r = Results(data)
-        r.load_from_xml(f,get_file)
+        r.load_from_xml(f, get_file)
        self.assertEqual(1,len(r.groups))
-        self.assertEqual(2,len(r.groups[0]))
+        self.assertEqual(3,len(r.groups[0]))
    def test_xml_non_ascii(self):
        def get_file(path):