From c8827769b421f323ecad1e00e53a5e3c1544491a Mon Sep 17 00:00:00 2001 From: Virgil Dupras Date: Sun, 15 Aug 2010 14:42:55 +0200 Subject: [PATCH] Removed dependency on lxml (it made the final package much bigger, and building it on windows is not fun). --- README | 1 - cocoa/me/dg_cocoa.py | 2 +- cocoa/pe/dg_cocoa.py | 2 +- cocoa/se/dg_cocoa.py | 2 +- core/directories.py | 18 ++++++++--------- core/ignore.py | 18 +++++++++-------- core/results.py | 20 +++++++++---------- core/tests/ignore_test.py | 5 ++--- core/tests/results_test.py | 40 +++++++++++++++++++------------------- core_pe/app_cocoa.py | 20 ++++++++++++------- debian_me/control | 2 +- debian_pe/control | 2 +- debian_se/control | 2 +- qt/base/cxfreeze_fix.py | 2 +- 14 files changed, 71 insertions(+), 65 deletions(-) diff --git a/README b/README index cec48985..6b47adab 100644 --- a/README +++ b/README @@ -29,7 +29,6 @@ General dependencies - Send2Trash3k (http://hg.hardcoded.net/send2trash3k) - hsutil3k (http://hg.hardcoded.net/hsutil3k) - hsaudiotag3k (for ME) (http://hg.hardcoded.net/hsaudiotag3k) -- lxml, to read and write XML files. (http://codespeak.net/lxml/) - Markdown, to generate help files. (http://pypi.python.org/pypi/Markdown) - PyYaml, for help files and the build system. (http://pyyaml.org/) - py.test, to run unit tests. 
(http://codespeak.net/py/dist/test/) diff --git a/cocoa/me/dg_cocoa.py b/cocoa/me/dg_cocoa.py index 91ceeacb..eae2b626 100644 --- a/cocoa/me/dg_cocoa.py +++ b/cocoa/me/dg_cocoa.py @@ -13,7 +13,7 @@ from core.scanner import ScanType # Fix py2app imports which chokes on relative imports and other stuff from core_me import app_cocoa, data, fs, scanner from hsaudiotag import aiff, flac, genres, id3v1, id3v2, mp4, mpeg, ogg, wma -from lxml import etree, _elementpath +import xml.etree.ElementPath import gzip class PyDupeGuru(PyDupeGuruBase): diff --git a/cocoa/pe/dg_cocoa.py b/cocoa/pe/dg_cocoa.py index 7b6d1907..8608a0f1 100644 --- a/cocoa/pe/dg_cocoa.py +++ b/cocoa/pe/dg_cocoa.py @@ -11,7 +11,7 @@ from core_pe import app_cocoa as app_pe_cocoa import hsutil.conflict import core.engine, core.fs, core.app import core_pe.block, core_pe.cache, core_pe.matchbase, core_pe.data, core_pe._block_osx -import lxml.etree, lxml._elementpath +import xml.etree.ElementPath import gzip import aem.kae import appscript.defaultterminology diff --git a/cocoa/se/dg_cocoa.py b/cocoa/se/dg_cocoa.py index b4dfc23d..8415bd89 100644 --- a/cocoa/se/dg_cocoa.py +++ b/cocoa/se/dg_cocoa.py @@ -14,7 +14,7 @@ from core_se.app_cocoa import DupeGuru import hsutil.conflict import core.engine, core.fs, core.app import core_se.fs, core_se.data -import lxml.etree, lxml._elementpath +import xml.etree.ElementPath import gzip class PyDupeGuru(PyDupeGuruBase): diff --git a/core/directories.py b/core/directories.py index fa7202bf..19365409 100644 --- a/core/directories.py +++ b/core/directories.py @@ -6,7 +6,7 @@ # which should be included with this package. 
The terms are also available at # http://www.hardcoded.net/licenses/hs_license -from lxml import etree +from xml.etree import ElementTree as ET from hsutil import io from hsutil.files import FileOrPath @@ -126,10 +126,10 @@ class Directories(object): def load_from_file(self, infile): try: - root = etree.parse(infile).getroot() - except: + root = ET.parse(infile).getroot() + except Exception: return - for rdn in root.iterchildren('root_directory'): + for rdn in root.getiterator('root_directory'): attrib = rdn.attrib if 'path' not in attrib: continue @@ -138,7 +138,7 @@ class Directories(object): self.add_path(Path(path)) except (AlreadyThereError, InvalidPathError): pass - for sn in root.iterchildren('state'): + for sn in root.getiterator('state'): attrib = sn.attrib if not ('path' in attrib and 'value' in attrib): continue @@ -148,15 +148,15 @@ class Directories(object): def save_to_file(self, outfile): with FileOrPath(outfile, 'wb') as fp: - root = etree.Element('directories') + root = ET.Element('directories') for root_path in self: - root_path_node = etree.SubElement(root, 'root_directory') + root_path_node = ET.SubElement(root, 'root_directory') root_path_node.set('path', str(root_path)) for path, state in self.states.items(): - state_node = etree.SubElement(root, 'state') + state_node = ET.SubElement(root, 'state') state_node.set('path', str(path)) state_node.set('value', str(state)) - tree = etree.ElementTree(root) + tree = ET.ElementTree(root) tree.write(fp, encoding='utf-8') def set_state(self, path, state): diff --git a/core/ignore.py b/core/ignore.py index 6cdb5395..d15abc99 100644 --- a/core/ignore.py +++ b/core/ignore.py @@ -6,7 +6,7 @@ # which should be included with this package. The terms are also available at # http://www.hardcoded.net/licenses/hs_license -from lxml import etree +from xml.etree import ElementTree as ET from hsutil.files import FileOrPath @@ -77,14 +77,16 @@ class IgnoreList(object): infile can be a file object or a filename. 
""" try: - root = etree.parse(infile).getroot() + root = ET.parse(infile).getroot() except Exception: return - for fn in root.iterchildren('file'): + file_elems = (e for e in root if e.tag == 'file') + for fn in file_elems: file_path = fn.get('path') if not file_path: continue - for sfn in fn.iterchildren('file'): + subfile_elems = (e for e in fn if e.tag == 'file') + for sfn in subfile_elems: subfile_path = sfn.get('path') if subfile_path: self.Ignore(file_path, subfile_path) @@ -94,14 +96,14 @@ class IgnoreList(object): outfile can be a file object or a filename. """ - root = etree.Element('ignore_list') + root = ET.Element('ignore_list') for filename, subfiles in self._ignored.items(): - file_node = etree.SubElement(root, 'file') + file_node = ET.SubElement(root, 'file') file_node.set('path', filename) for subfilename in subfiles: - subfile_node = etree.SubElement(file_node, 'file') + subfile_node = ET.SubElement(file_node, 'file') subfile_node.set('path', subfilename) - tree = etree.ElementTree(root) + tree = ET.ElementTree(root) with FileOrPath(outfile, 'wb') as fp: tree.write(fp, encoding='utf-8') diff --git a/core/results.py b/core/results.py index 09b4f8dd..f04901c7 100644 --- a/core/results.py +++ b/core/results.py @@ -8,7 +8,7 @@ import logging import re -from lxml import etree +from xml.etree import ElementTree as ET from . 
import engine from hscommon.job import nulljob @@ -178,16 +178,16 @@ class Results(Markable): self.apply_filter(None) try: - root = etree.parse(infile).getroot() + root = ET.parse(infile).getroot() except Exception: return - group_elems = list(root.iterchildren('group')) + group_elems = list(root.getiterator('group')) groups = [] marked = set() for group_elem in j.iter_with_progress(group_elems, every=100): group = engine.Group() dupes = [] - for file_elem in group_elem.iterchildren('file'): + for file_elem in group_elem.getiterator('file'): path = file_elem.get('path') words = file_elem.get('words', '') if not path: @@ -200,7 +200,7 @@ class Results(Markable): dupes.append(file) if file_elem.get('marked') == 'y': marked.add(file) - for match_elem in group_elem.iterchildren('match'): + for match_elem in group_elem.getiterator('match'): try: attrs = match_elem.attrib first_file = dupes[int(attrs['first'])] @@ -277,10 +277,10 @@ class Results(Markable): def save_to_xml(self, outfile): self.apply_filter(None) - root = etree.Element('results') + root = ET.Element('results') # writer = XMLGenerator(outfile, 'utf-8') for g in self.groups: - group_elem = etree.SubElement(root, 'group') + group_elem = ET.SubElement(root, 'group') dupe2index = {} for index, d in enumerate(g): dupe2index[d] = index @@ -288,7 +288,7 @@ class Results(Markable): words = engine.unpack_fields(d.words) except AttributeError: words = () - file_elem = etree.SubElement(group_elem, 'file') + file_elem = ET.SubElement(group_elem, 'file') try: file_elem.set('path', str(d.path)) file_elem.set('words', ','.join(words)) @@ -297,11 +297,11 @@ class Results(Markable): file_elem.set('is_ref', ('y' if d.is_ref else 'n')) file_elem.set('marked', ('y' if self.is_marked(d) else 'n')) for match in g.matches: - match_elem = etree.SubElement(group_elem, 'match') + match_elem = ET.SubElement(group_elem, 'match') match_elem.set('first', str(dupe2index[match.first])) match_elem.set('second', 
str(dupe2index[match.second])) match_elem.set('percentage', str(int(match.percentage))) - tree = etree.ElementTree(root) + tree = ET.ElementTree(root) with FileOrPath(outfile, 'wb') as fp: tree.write(fp, encoding='utf-8') self.is_modified = False diff --git a/core/tests/ignore_test.py b/core/tests/ignore_test.py index 598cacf6..ab680d99 100644 --- a/core/tests/ignore_test.py +++ b/core/tests/ignore_test.py @@ -7,7 +7,7 @@ # http://www.hardcoded.net/licenses/hs_license import io -from lxml import etree +from xml.etree import ElementTree as ET from hsutil.testutil import eq_ @@ -62,7 +62,7 @@ def test_save_to_xml(): f = io.BytesIO() il.save_to_xml(f) f.seek(0) - doc = etree.parse(f) + doc = ET.parse(f) root = doc.getroot() eq_(root.tag, 'ignore_list') eq_(len(root), 2) @@ -80,7 +80,6 @@ def test_SaveThenLoad(): f = io.BytesIO() il.save_to_xml(f) f.seek(0) - f.seek(0) il = IgnoreList() il.load_from_xml(f) eq_(4,len(il)) diff --git a/core/tests/results_test.py b/core/tests/results_test.py index 212f0b29..06d83545 100644 --- a/core/tests/results_test.py +++ b/core/tests/results_test.py @@ -10,7 +10,7 @@ import io import os.path as op -from lxml import etree +from xml.etree import ElementTree as ET from hsutil.path import Path from hsutil.testutil import eq_ @@ -66,7 +66,7 @@ class TCResultsEmpty(TestCase): f = io.BytesIO() self.results.save_to_xml(f) f.seek(0) - doc = etree.parse(f) + doc = ET.parse(f) root = doc.getroot() eq_('results', root.tag) @@ -380,14 +380,14 @@ class TCResultsMarkings(TestCase): f = io.BytesIO() self.results.save_to_xml(f) f.seek(0) - doc = etree.parse(f) + doc = ET.parse(f) root = doc.getroot() - g1, g2 = root.iterchildren('group') - d1, d2, d3 = g1.iterchildren('file') + g1, g2 = root.getiterator('group') + d1, d2, d3 = g1.getiterator('file') eq_('n', d1.get('marked')) eq_('n', d2.get('marked')) eq_('y', d3.get('marked')) - d1, d2 = g2.iterchildren('file') + d1, d2 = g2.getiterator('file') eq_('n', d1.get('marked')) eq_('y', d2.get('marked')) 
@@ -425,7 +425,7 @@ class TCResultsXML(TestCase): f = io.BytesIO() self.results.save_to_xml(f) f.seek(0) - doc = etree.parse(f) + doc = ET.parse(f) root = doc.getroot() eq_('results', root.tag) eq_(2, len(root)) @@ -516,35 +516,35 @@ class TCResultsXML(TestCase): def get_file(path): return [f for f in self.objects if str(f.path) == path][0] - root = etree.Element('foobar') #The root element shouldn't matter, really. - group_node = etree.SubElement(root, 'group') - dupe_node = etree.SubElement(group_node, 'file') #Perfectly correct file + root = ET.Element('foobar') #The root element shouldn't matter, really. + group_node = ET.SubElement(root, 'group') + dupe_node = ET.SubElement(group_node, 'file') #Perfectly correct file dupe_node.set('path', op.join('basepath','foo bar')) dupe_node.set('is_ref', 'y') dupe_node.set('words', 'foo,bar') - dupe_node = etree.SubElement(group_node, 'file') #is_ref missing, default to 'n' + dupe_node = ET.SubElement(group_node, 'file') #is_ref missing, default to 'n' dupe_node.set('path',op.join('basepath','foo bleh')) dupe_node.set('words','foo,bleh') - dupe_node = etree.SubElement(group_node, 'file') #words are missing, valid. + dupe_node = ET.SubElement(group_node, 'file') #words are missing, valid. dupe_node.set('path',op.join('basepath','bar bleh')) - dupe_node = etree.SubElement(group_node, 'file') #path is missing, invalid. + dupe_node = ET.SubElement(group_node, 'file') #path is missing, invalid. 
dupe_node.set('words','foo,bleh') - dupe_node = etree.SubElement(group_node, 'foobar') #Invalid element name + dupe_node = ET.SubElement(group_node, 'foobar') #Invalid element name dupe_node.set('path',op.join('basepath','bar bleh')) dupe_node.set('is_ref','y') dupe_node.set('words','bar,bleh') - match_node = etree.SubElement(group_node, 'match') # match pointing to a bad index + match_node = ET.SubElement(group_node, 'match') # match pointing to a bad index match_node.set('first', '42') match_node.set('second', '45') - match_node = etree.SubElement(group_node, 'match') # match with missing attrs - match_node = etree.SubElement(group_node, 'match') # match with non-int values + match_node = ET.SubElement(group_node, 'match') # match with missing attrs + match_node = ET.SubElement(group_node, 'match') # match with non-int values match_node.set('first', 'foo') match_node.set('second', 'bar') match_node.set('percentage', 'baz') - group_node = etree.SubElement(root, 'foobar') #invalid group - group_node = etree.SubElement(root, 'group') #empty group + group_node = ET.SubElement(root, 'foobar') #invalid group + group_node = ET.SubElement(root, 'group') #empty group f = io.BytesIO() - tree = etree.ElementTree(root) + tree = ET.ElementTree(root) tree.write(f, encoding='utf-8') f.seek(0) r = Results(data) diff --git a/core_pe/app_cocoa.py b/core_pe/app_cocoa.py index 10dd2c30..375c79b4 100644 --- a/core_pe/app_cocoa.py +++ b/core_pe/app_cocoa.py @@ -8,12 +8,13 @@ import os.path as op import plistlib +import logging +import re -from lxml import etree from appscript import app, k, CommandError from hsutil import io -from hsutil.str import get_file_ext +from hsutil.str import get_file_ext, remove_invalid_xml from hsutil.path import Path from hscommon.cocoa import as_fetch from hscommon.cocoa.objcmin import NSUserDefaults, NSURL @@ -67,11 +68,16 @@ def get_iphoto_database_path(): def get_iphoto_pictures(plistpath): if not io.exists(plistpath): return [] - # We make the xml go 
through lxml so that it can fix broken xml which iPhoto sometimes produces. - parser = etree.XMLParser(recover=True) - root = etree.parse(io.open(plistpath), parser=parser).getroot() - s = etree.tostring(root) - plist = plistlib.readPlistFromString(s) + s = io.open(plistpath, 'rt', encoding='utf-8').read() + # There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading + s = remove_invalid_xml(s, replace_with='') + # It seems that iPhoto sometimes doesn't properly escape & chars. The regexp below is to find + # any & char that is not a &-based entity (&amp;, &quot;, etc.). based on TextMate's XML + # bundle's regexp + s, count = re.subn(r'&(?![a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+;)', '', s) + if count: + logging.warning("%d invalid XML entities replacement made", count) + plist = plistlib.readPlistFromBytes(s.encode('utf-8')) result = [] for photo_data in plist['Master Image List'].values(): if photo_data['MediaType'] != 'Image': diff --git a/debian_me/control b/debian_me/control index 7e141760..a8056cce 100644 --- a/debian_me/control +++ b/debian_me/control @@ -8,5 +8,5 @@ Homepage: http://www.hardcoded.net Package: dupeguru-me Architecture: any -Depends: python (>= 2.6), python-qt4 (>= 4.6), python-lxml (>= 2.1) +Depends: python (>= 2.6), python-qt4 (>= 4.6) Description: dupeGuru Music Edition diff --git a/debian_pe/control b/debian_pe/control index d7f296f1..9ba5993b 100644 --- a/debian_pe/control +++ b/debian_pe/control @@ -8,5 +8,5 @@ Homepage: http://www.hardcoded.net Package: dupeguru-pe Architecture: any -Depends: python (>= 2.6), python-qt4 (>= 4.6), python-lxml (>= 2.1), python-imaging (>= 1.1.6) +Depends: python (>= 2.6), python-qt4 (>= 4.6), python-imaging (>= 1.1.6) Description: dupeGuru Picture Edition diff --git a/debian_se/control b/debian_se/control index 67bd00a1..e7235229 100644 --- a/debian_se/control +++ b/debian_se/control @@ -8,5 +8,5 @@ Homepage: http://www.hardcoded.net Package: dupeguru-se Architecture: any -Depends: 
python (>= 2.6), python-qt4 (>= 4.6), python-lxml (>= 2.1) +Depends: python (>= 2.6), python-qt4 (>= 4.6) Description: dupeGuru diff --git a/qt/base/cxfreeze_fix.py b/qt/base/cxfreeze_fix.py index 0b33a2f1..e6a8a0b2 100644 --- a/qt/base/cxfreeze_fix.py +++ b/qt/base/cxfreeze_fix.py @@ -1,7 +1,7 @@ # cxfreeze has some problems detecting all dependencies. # This modules explicitly import those problematic modules. -import lxml._elementpath +import xml.etree.ElementPath import gzip import os