Mirror of https://github.com/arsenetar/dupeguru.git (synced 2026-01-23 07:01:39 +00:00)

Initial commit.

--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402

py/__init__.py (new file, 1 line, empty)
@@ -0,0 +1 @@

py/app.py (new file, 229 lines)
@@ -0,0 +1,229 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.app
Created By: Virgil Dupras
Created On: 2006/11/11
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
$Revision: 4388 $
Copyright 2006 Hardcoded Software (http://www.hardcoded.net)
"""
import os
import os.path as op
import logging

from hsfs import IT_ATTRS, IT_EXTRA
from hsutil import job, io, files
from hsutil.path import Path
from hsutil.reg import RegistrableApplication, RegistrationRequired
from hsutil.misc import flatten, first
from hsutil.str import escape

import directories
import results
import scanner

JOB_SCAN = 'job_scan'
JOB_LOAD = 'job_load'
JOB_MOVE = 'job_move'
JOB_COPY = 'job_copy'
JOB_DELETE = 'job_delete'

class NoScannableFileError(Exception):
    pass

class AllFilesAreRefError(Exception):
    pass

class DupeGuru(RegistrableApplication):
    def __init__(self, data_module, appdata, appid):
        RegistrableApplication.__init__(self, appid)
        self.appdata = appdata
        if not op.exists(self.appdata):
            os.makedirs(self.appdata)
        self.data = data_module
        self.directories = directories.Directories()
        self.results = results.Results(data_module)
        self.scanner = scanner.Scanner()
        self.action_count = 0
        self.last_op_error_count = 0
        self.options = {
            'escape_filter_regexp': True,
            'clean_empty_dirs': False,
        }

    def _demo_check(self):
        if self.registered:
            return
        count = self.results.mark_count
        if count + self.action_count > 10:
            raise RegistrationRequired()
        else:
            self.action_count += count

    def _do_delete(self, j):
        def op(dupe):
            j.add_progress()
            return self._do_delete_dupe(dupe)

        j.start_job(self.results.mark_count)
        self.last_op_error_count = self.results.perform_on_marked(op, True)

    def _do_delete_dupe(self, dupe):
        if not io.exists(dupe.path):
            dupe.parent = None
            return True
        self._recycle_dupe(dupe)
        self.clean_empty_dirs(dupe.path[:-1])
        if not io.exists(dupe.path):
            dupe.parent = None
            return True
        logging.warning(u"Could not send {0} to trash.".format(unicode(dupe.path)))
        return False

    def _do_load(self, j):
        self.directories.LoadFromFile(op.join(self.appdata, 'last_directories.xml'))
        j = j.start_subjob([1, 9])
        self.results.load_from_xml(op.join(self.appdata, 'last_results.xml'), self._get_file, j)
        files = flatten(g[:] for g in self.results.groups)
        for file in j.iter_with_progress(files, 'Reading metadata %d/%d'):
            file._read_all_info(sections=[IT_ATTRS, IT_EXTRA])

    def _get_file(self, str_path):
        p = Path(str_path)
        for d in self.directories:
            if p not in d.path:
                continue
            result = d.find_path(p[d.path:])
            if result is not None:
                return result

    @staticmethod
    def _recycle_dupe(dupe):
        raise NotImplementedError()

    def _start_job(self, jobid, func):
        # func(j)
        raise NotImplementedError()

    def AddDirectory(self, d):
        try:
            self.directories.add_path(Path(d))
            return 0
        except directories.AlreadyThereError:
            return 1
        except directories.InvalidPathError:
            return 2

    def AddToIgnoreList(self, dupe):
        g = self.results.get_group_of_duplicate(dupe)
        for other in g:
            if other is not dupe:
                self.scanner.ignore_list.Ignore(unicode(other.path), unicode(dupe.path))

    def ApplyFilter(self, filter):
        self.results.apply_filter(None)
        if self.options['escape_filter_regexp']:
            filter = escape(filter, '()[]\\.|+?^')
            filter = escape(filter, '*', '.')
        self.results.apply_filter(filter)

    def clean_empty_dirs(self, path):
        if self.options['clean_empty_dirs']:
            while files.delete_if_empty(path, ['.DS_Store']):
                path = path[:-1]

    def CopyOrMove(self, dupe, copy, destination, dest_type):
        """
        copy: True = Copy False = Move
        destination: string.
        dest_type: 0 = right in destination.
                   1 = relative re-creation.
                   2 = absolute re-creation.
        """
        source_path = dupe.path
        location_path = dupe.root.path
        dest_path = Path(destination)
        if dest_type == 2:
            dest_path = dest_path + source_path[1:-1] #Remove drive letter and filename
        elif dest_type == 1:
            dest_path = dest_path + source_path[location_path:-1]
        if not io.exists(dest_path):
            io.makedirs(dest_path)
        try:
            if copy:
                files.copy(source_path, dest_path)
            else:
                files.move(source_path, dest_path)
                self.clean_empty_dirs(source_path[:-1])
        except (IOError, OSError) as e:
            operation = 'Copy' if copy else 'Move'
            logging.warning('%s operation failed on %s. Error: %s' % (operation, unicode(dupe.path), unicode(e)))
            return False
        return True

    def copy_or_move_marked(self, copy, destination, recreate_path):
        def do(j):
            def op(dupe):
                j.add_progress()
                return self.CopyOrMove(dupe, copy, destination, recreate_path)

            j.start_job(self.results.mark_count)
            self.last_op_error_count = self.results.perform_on_marked(op, not copy)

        self._demo_check()
        jobid = JOB_COPY if copy else JOB_MOVE
        self._start_job(jobid, do)

    def delete_marked(self):
        self._demo_check()
        self._start_job(JOB_DELETE, self._do_delete)

    def load(self):
        self._start_job(JOB_LOAD, self._do_load)
        self.LoadIgnoreList()

    def LoadIgnoreList(self):
        p = op.join(self.appdata, 'ignore_list.xml')
        self.scanner.ignore_list.load_from_xml(p)

    def make_reference(self, duplicates):
        changed_groups = set()
        for dupe in duplicates:
            g = self.results.get_group_of_duplicate(dupe)
            if g not in changed_groups:
                self.results.make_ref(dupe)
                changed_groups.add(g)

    def Save(self):
        self.directories.SaveToFile(op.join(self.appdata, 'last_directories.xml'))
        self.results.save_to_xml(op.join(self.appdata, 'last_results.xml'))

    def SaveIgnoreList(self):
        p = op.join(self.appdata, 'ignore_list.xml')
        self.scanner.ignore_list.save_to_xml(p)

    def start_scanning(self):
        def do(j):
            j.set_progress(0, 'Collecting files to scan')
            files = list(self.directories.get_files())
            logging.info('Scanning %d files' % len(files))
            self.results.groups = self.scanner.GetDupeGroups(files, j)

        files = self.directories.get_files()
        first_file = first(files)
        if first_file is None:
            raise NoScannableFileError()
        if first_file.is_ref and all(f.is_ref for f in files):
            raise AllFilesAreRefError()
        self.results.groups = []
        self._start_job(JOB_SCAN, do)

    #--- Properties
    @property
    def stat_line(self):
        result = self.results.stat_line
        if self.scanner.discarded_file_count:
            result = '%s (%d discarded)' % (result, self.scanner.discarded_file_count)
        return result

py/app_cocoa.py (new file, 304 lines)
@@ -0,0 +1,304 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.app_cocoa
Created By: Virgil Dupras
Created On: 2006/11/11
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2006 Hardcoded Software (http://www.hardcoded.net)
"""
from AppKit import *
import logging
import os.path as op

import hsfs as fs
from hsfs.phys.bundle import Bundle
from hsutil.cocoa import install_exception_hook
from hsutil.str import get_file_ext
from hsutil import io, cocoa, job
from hsutil.reg import RegistrationRequired

import export, app, data

JOBID2TITLE = {
    app.JOB_SCAN: "Scanning for duplicates",
    app.JOB_LOAD: "Loading",
    app.JOB_MOVE: "Moving",
    app.JOB_COPY: "Copying",
    app.JOB_DELETE: "Sending to Trash",
}

class DGDirectory(fs.phys.Directory):
    def _create_sub_dir(self,name,with_parent = True):
        ext = get_file_ext(name)
        if ext == 'app':
            if with_parent:
                parent = self
            else:
                parent = None
            return Bundle(parent,name)
        else:
            return super(DGDirectory,self)._create_sub_dir(name,with_parent)


def demo_method(method):
    def wrapper(self, *args, **kwargs):
        try:
            return method(self, *args, **kwargs)
        except RegistrationRequired:
            NSNotificationCenter.defaultCenter().postNotificationName_object_('RegistrationRequired', self)

    return wrapper

class DupeGuru(app.DupeGuru):
    def __init__(self, data_module, appdata_subdir, appid):
        LOGGING_LEVEL = logging.DEBUG if NSUserDefaults.standardUserDefaults().boolForKey_('debug') else logging.WARNING
        logging.basicConfig(level=LOGGING_LEVEL, format='%(levelname)s %(message)s')
        logging.debug('started in debug mode')
        install_exception_hook()
        if data_module is None:
            data_module = data
        appdata = op.expanduser(op.join('~', '.hsoftdata', appdata_subdir))
        app.DupeGuru.__init__(self, data_module, appdata, appid)
        self.progress = cocoa.ThreadedJobPerformer()
        self.directories.dirclass = DGDirectory
        self.display_delta_values = False
        self.selected_dupes = []
        self.RefreshDetailsTable(None,None)

    #--- Override
    @staticmethod
    def _recycle_dupe(dupe):
        if not io.exists(dupe.path):
            dupe.parent = None
            return True
        directory = unicode(dupe.parent.path)
        filename = dupe.name
        result, tag = NSWorkspace.sharedWorkspace().performFileOperation_source_destination_files_tag_(
            NSWorkspaceRecycleOperation, directory, '', [filename])
        if not io.exists(dupe.path):
            dupe.parent = None
            return True
        logging.warning('Could not send %s to trash. tag: %d' % (unicode(dupe.path), tag))
        return False

    def _start_job(self, jobid, func):
        try:
            j = self.progress.create_job()
            self.progress.run_threaded(func, args=(j, ))
        except job.JobInProgressError:
            NSNotificationCenter.defaultCenter().postNotificationName_object_('JobInProgress', self)
        else:
            ud = {'desc': JOBID2TITLE[jobid], 'jobid':jobid}
            NSNotificationCenter.defaultCenter().postNotificationName_object_userInfo_('JobStarted', self, ud)

    #---Helpers
    def GetObjects(self,node_path):
        #returns a tuple g,d
        try:
            g = self.results.groups[node_path[0]]
            if len(node_path) == 2:
                return (g,self.results.groups[node_path[0]].dupes[node_path[1]])
            else:
                return (g,None)
        except IndexError:
            return (None,None)

    def GetDirectory(self,node_path,curr_dir=None):
        if not node_path:
            return curr_dir
        if curr_dir is not None:
            l = curr_dir.dirs
        else:
            l = self.directories
        d = l[node_path[0]]
        return self.GetDirectory(node_path[1:],d)

    def RefreshDetailsTable(self,dupe,group):
        l1 = self.data.GetDisplayInfo(dupe,group,False)
        if group is not None:
            l2 = self.data.GetDisplayInfo(group.ref,group,False)
        else:
            l2 = l1 #To have a list of empty '---' values
        names = [c['display'] for c in self.data.COLUMNS]
        self.details_table = zip(names,l1,l2)

    #---Public
    def AddSelectedToIgnoreList(self):
        for dupe in self.selected_dupes:
            self.AddToIgnoreList(dupe)

    copy_or_move_marked = demo_method(app.DupeGuru.copy_or_move_marked)
    delete_marked = demo_method(app.DupeGuru.delete_marked)

    def ExportToXHTML(self,column_ids,xslt_path,css_path):
        columns = []
        for index,column in enumerate(self.data.COLUMNS):
            display = column['display']
            enabled = str(index) in column_ids
            columns.append((display,enabled))
        xml_path = op.join(self.appdata,'results_export.xml')
        self.results.save_to_xml(xml_path,self.data.GetDisplayInfo)
        return export.export_to_xhtml(xml_path,xslt_path,css_path,columns)

    def MakeSelectedReference(self):
        self.make_reference(self.selected_dupes)

    def OpenSelected(self):
        if self.selected_dupes:
            path = unicode(self.selected_dupes[0].path)
            NSWorkspace.sharedWorkspace().openFile_(path)

    def PurgeIgnoreList(self):
        self.scanner.ignore_list.Filter(lambda f,s:op.exists(f) and op.exists(s))

    def RefreshDetailsWithSelected(self):
        if self.selected_dupes:
            self.RefreshDetailsTable(
                self.selected_dupes[0],
                self.results.get_group_of_duplicate(self.selected_dupes[0])
            )
        else:
            self.RefreshDetailsTable(None,None)

    def RemoveDirectory(self,index):
        try:
            del self.directories[index]
        except IndexError:
            pass

    def RemoveSelected(self):
        self.results.remove_duplicates(self.selected_dupes)

    def RenameSelected(self,newname):
        try:
            d = self.selected_dupes[0]
            d = d.move(d.parent,newname)
            return True
        except (IndexError,fs.FSError),e:
            logging.warning("dupeGuru Warning: %s" % str(e))
            return False

    def RevealSelected(self):
        if self.selected_dupes:
            path = unicode(self.selected_dupes[0].path)
            NSWorkspace.sharedWorkspace().selectFile_inFileViewerRootedAtPath_(path,'')

    def start_scanning(self):
        self.RefreshDetailsTable(None, None)
        try:
            app.DupeGuru.start_scanning(self)
            return 0
        except app.NoScannableFileError:
            return 3
        except app.AllFilesAreRefError:
            return 1

    def SelectResultNodePaths(self,node_paths):
        def extract_dupe(t):
            g,d = t
            if d is not None:
                return d
            else:
                if g is not None:
                    return g.ref

        selected = [extract_dupe(self.GetObjects(p)) for p in node_paths]
        self.selected_dupes = [dupe for dupe in selected if dupe is not None]

    def SelectPowerMarkerNodePaths(self,node_paths):
        rows = [p[0] for p in node_paths]
        self.selected_dupes = [
            self.results.dupes[row] for row in rows if row in xrange(len(self.results.dupes))
        ]

    def SetDirectoryState(self,node_path,state):
        d = self.GetDirectory(node_path)
        self.directories.SetState(d.path,state)

    def sort_dupes(self,key,asc):
        self.results.sort_dupes(key,asc,self.display_delta_values)

    def sort_groups(self,key,asc):
        self.results.sort_groups(key,asc)

    def ToggleSelectedMarkState(self):
        for dupe in self.selected_dupes:
            self.results.mark_toggle(dupe)

    #---Data
    def GetOutlineViewMaxLevel(self, tag):
        if tag == 0:
            return 2
        elif tag == 1:
            return 0
        elif tag == 2:
            return 1

    def GetOutlineViewChildCounts(self, tag, node_path):
        if self.progress._job_running:
            return []
        if tag == 0: #Normal results
            assert not node_path # no other value is possible
            return [len(g.dupes) for g in self.results.groups]
        elif tag == 1: #Directories
            dirs = self.GetDirectory(node_path).dirs if node_path else self.directories
            return [d.dircount for d in dirs]
        else: #Power Marker
            assert not node_path # no other value is possible
            return [0 for d in self.results.dupes]

    def GetOutlineViewValues(self, tag, node_path):
        if self.progress._job_running:
            return
        if not node_path:
            return
        if tag in (0,2): #Normal results / Power Marker
            if tag == 0:
                g, d = self.GetObjects(node_path)
                if d is None:
                    d = g.ref
            else:
                d = self.results.dupes[node_path[0]]
                g = self.results.get_group_of_duplicate(d)
            result = self.data.GetDisplayInfo(d, g, self.display_delta_values)
            return result
        elif tag == 1: #Directories
            d = self.GetDirectory(node_path)
            return [
                d.name,
                self.directories.GetState(d.path)
            ]

    def GetOutlineViewMarked(self, tag, node_path):
        # 0=unmarked 1=marked 2=unmarkable
        if self.progress._job_running:
            return
        if not node_path:
            return 2
        if tag == 1: #Directories
            return 2
        if tag == 0: #Normal results
            g, d = self.GetObjects(node_path)
        else: #Power Marker
            d = self.results.dupes[node_path[0]]
        if (d is None) or (not self.results.is_markable(d)):
            return 2
        elif self.results.is_marked(d):
            return 1
        else:
            return 0

    def GetTableViewCount(self, tag):
        if self.progress._job_running:
            return 0
        return len(self.details_table)

    def GetTableViewMarkedIndexes(self,tag):
        return []

    def GetTableViewValues(self,tag,row):
        return self.details_table[row]

py/app_cocoa_test.py (new file, 320 lines)
@@ -0,0 +1,320 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.tests.app_cocoa
Created By: Virgil Dupras
Created On: 2006/11/11
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-29 17:51:41 +0200 (Fri, 29 May 2009) $
$Revision: 4409 $
Copyright 2006 Hardcoded Software (http://www.hardcoded.net)
"""
import tempfile
import shutil
import logging

from hsutil.path import Path
from hsutil.testcase import TestCase
from hsutil.decorators import log_calls
import hsfs.phys
import os.path as op

from . import engine, data
try:
    from .app_cocoa import DupeGuru as DupeGuruBase, DGDirectory
except ImportError:
    from nose.plugins.skip import SkipTest
    raise SkipTest("These tests can only be run on OS X")
from .results_test import GetTestGroups

class DupeGuru(DupeGuruBase):
    def __init__(self):
        DupeGuruBase.__init__(self, data, '/tmp', appid=4)

    def _start_job(self, jobid, func):
        func(nulljob)


def r2np(rows):
    #Transforms a list of rows [1,2,3] into a list of node paths [[1],[2],[3]]
    return [[i] for i in rows]

class TCDupeGuru(TestCase):
    def setUp(self):
        self.app = DupeGuru()
        self.objects,self.matches,self.groups = GetTestGroups()
        self.app.results.groups = self.groups

    def test_GetObjects(self):
        app = self.app
        objects = self.objects
        groups = self.groups
        g,d = app.GetObjects([0])
        self.assert_(g is groups[0])
        self.assert_(d is None)
        g,d = app.GetObjects([0,0])
        self.assert_(g is groups[0])
        self.assert_(d is objects[1])
        g,d = app.GetObjects([1,0])
        self.assert_(g is groups[1])
        self.assert_(d is objects[4])

    def test_GetObjects_after_sort(self):
        app = self.app
        objects = self.objects
        groups = self.groups[:] #To keep the old order in memory
        app.sort_groups(0,False) #0 = Filename
        #Now, the group order is supposed to be reversed
        g,d = app.GetObjects([0,0])
        self.assert_(g is groups[1])
        self.assert_(d is objects[4])

    def test_GetObjects_out_of_range(self):
        app = self.app
        self.assertEqual((None,None),app.GetObjects([2]))
        self.assertEqual((None,None),app.GetObjects([]))
        self.assertEqual((None,None),app.GetObjects([1,2]))

    def test_selectResultNodePaths(self):
        app = self.app
        objects = self.objects
        app.SelectResultNodePaths([[0,0],[0,1]])
        self.assertEqual(2,len(app.selected_dupes))
        self.assert_(app.selected_dupes[0] is objects[1])
        self.assert_(app.selected_dupes[1] is objects[2])

    def test_selectResultNodePaths_with_ref(self):
        app = self.app
        objects = self.objects
        app.SelectResultNodePaths([[0,0],[0,1],[1]])
        self.assertEqual(3,len(app.selected_dupes))
        self.assert_(app.selected_dupes[0] is objects[1])
        self.assert_(app.selected_dupes[1] is objects[2])
        self.assert_(app.selected_dupes[2] is self.groups[1].ref)

    def test_selectResultNodePaths_empty(self):
        self.app.SelectResultNodePaths([])
        self.assertEqual(0,len(self.app.selected_dupes))

    def test_selectResultNodePaths_after_sort(self):
        app = self.app
        objects = self.objects
        groups = self.groups[:] #To keep the old order in memory
        app.sort_groups(0,False) #0 = Filename
        #Now, the group order is supposed to be reversed
        app.SelectResultNodePaths([[0,0],[1],[1,0]])
        self.assertEqual(3,len(app.selected_dupes))
        self.assert_(app.selected_dupes[0] is objects[4])
        self.assert_(app.selected_dupes[1] is groups[0].ref)
        self.assert_(app.selected_dupes[2] is objects[1])

    def test_selectResultNodePaths_out_of_range(self):
        app = self.app
        app.SelectResultNodePaths([[0,0],[0,1],[1],[1,1],[2]])
        self.assertEqual(3,len(app.selected_dupes))

    def test_selectPowerMarkerRows(self):
        app = self.app
        objects = self.objects
        app.SelectPowerMarkerNodePaths(r2np([0,1,2]))
        self.assertEqual(3,len(app.selected_dupes))
        self.assert_(app.selected_dupes[0] is objects[1])
        self.assert_(app.selected_dupes[1] is objects[2])
        self.assert_(app.selected_dupes[2] is objects[4])

    def test_selectPowerMarkerRows_empty(self):
        self.app.SelectPowerMarkerNodePaths([])
        self.assertEqual(0,len(self.app.selected_dupes))

    def test_selectPowerMarkerRows_after_sort(self):
        app = self.app
        objects = self.objects
        app.sort_dupes(0,False) #0 = Filename
        app.SelectPowerMarkerNodePaths(r2np([0,1,2]))
        self.assertEqual(3,len(app.selected_dupes))
        self.assert_(app.selected_dupes[0] is objects[4])
        self.assert_(app.selected_dupes[1] is objects[2])
        self.assert_(app.selected_dupes[2] is objects[1])

    def test_selectPowerMarkerRows_out_of_range(self):
        app = self.app
        app.SelectPowerMarkerNodePaths(r2np([0,1,2,3]))
        self.assertEqual(3,len(app.selected_dupes))

    def test_toggleSelectedMark(self):
        app = self.app
        objects = self.objects
        app.ToggleSelectedMarkState()
        self.assertEqual(0,app.results.mark_count)
        app.SelectPowerMarkerNodePaths(r2np([0,2]))
        app.ToggleSelectedMarkState()
        self.assertEqual(2,app.results.mark_count)
        self.assert_(not app.results.is_marked(objects[0]))
        self.assert_(app.results.is_marked(objects[1]))
        self.assert_(not app.results.is_marked(objects[2]))
        self.assert_(not app.results.is_marked(objects[3]))
        self.assert_(app.results.is_marked(objects[4]))

    def test_refreshDetailsWithSelected(self):
        def mock_refresh(dupe,group):
            self.called = True
            if self.app.selected_dupes:
                self.assert_(dupe is self.app.selected_dupes[0])
                self.assert_(group is self.app.results.get_group_of_duplicate(dupe))
            else:
                self.assert_(dupe is None)
                self.assert_(group is None)

        self.app.RefreshDetailsTable = mock_refresh
        self.called = False
        self.app.SelectPowerMarkerNodePaths(r2np([0,2]))
        self.app.RefreshDetailsWithSelected()
        self.assert_(self.called)
        self.called = False
        self.app.SelectPowerMarkerNodePaths([])
        self.app.RefreshDetailsWithSelected()
        self.assert_(self.called)

    def test_makeSelectedReference(self):
        app = self.app
        objects = self.objects
        groups = self.groups
        app.SelectPowerMarkerNodePaths(r2np([0,2]))
        app.MakeSelectedReference()
        self.assert_(groups[0].ref is objects[1])
        self.assert_(groups[1].ref is objects[4])

    def test_makeSelectedReference_by_selecting_two_dupes_in_the_same_group(self):
        app = self.app
        objects = self.objects
        groups = self.groups
        app.SelectPowerMarkerNodePaths(r2np([0,1,2]))
        #Only 0 and 2 must go ref, not 1 because it is a part of the same group
        app.MakeSelectedReference()
        self.assert_(groups[0].ref is objects[1])
        self.assert_(groups[1].ref is objects[4])

    def test_removeSelected(self):
        app = self.app
        app.SelectPowerMarkerNodePaths(r2np([0,2]))
        app.RemoveSelected()
        self.assertEqual(1,len(app.results.dupes))
        app.RemoveSelected()
        self.assertEqual(1,len(app.results.dupes))
        app.SelectPowerMarkerNodePaths(r2np([0,2]))
        app.RemoveSelected()
        self.assertEqual(0,len(app.results.dupes))

    def test_addDirectory_simple(self):
        app = self.app
        self.assertEqual(0,app.AddDirectory(self.datadirpath()))
        self.assertEqual(1,len(app.directories))

    def test_addDirectory_already_there(self):
        app = self.app
        self.assertEqual(0,app.AddDirectory(self.datadirpath()))
        self.assertEqual(1,app.AddDirectory(self.datadirpath()))

    def test_addDirectory_does_not_exist(self):
        app = self.app
        self.assertEqual(2,app.AddDirectory('/does_not_exist'))

    def test_ignore(self):
        app = self.app
        app.SelectPowerMarkerNodePaths(r2np([2])) #The dupe of the second, 2 sized group
        app.AddSelectedToIgnoreList()
        self.assertEqual(1,len(app.scanner.ignore_list))
        app.SelectPowerMarkerNodePaths(r2np([0])) #first dupe of the 3 dupes group
        app.AddSelectedToIgnoreList()
        #BOTH the ref and the other dupe should have been added
        self.assertEqual(3,len(app.scanner.ignore_list))

    def test_purgeIgnoreList(self):
        app = self.app
        p1 = self.filepath('zerofile')
        p2 = self.filepath('zerofill')
        dne = '/does_not_exist'
        app.scanner.ignore_list.Ignore(dne,p1)
        app.scanner.ignore_list.Ignore(p2,dne)
        app.scanner.ignore_list.Ignore(p1,p2)
        app.PurgeIgnoreList()
        self.assertEqual(1,len(app.scanner.ignore_list))
        self.assert_(app.scanner.ignore_list.AreIgnored(p1,p2))
        self.assert_(not app.scanner.ignore_list.AreIgnored(dne,p1))

    def test_only_unicode_is_added_to_ignore_list(self):
        def FakeIgnore(first,second):
            if not isinstance(first,unicode):
                self.fail()
            if not isinstance(second,unicode):
                self.fail()

        app = self.app
        app.scanner.ignore_list.Ignore = FakeIgnore
        app.SelectPowerMarkerNodePaths(r2np([2])) #The dupe of the second, 2 sized group
        app.AddSelectedToIgnoreList()

    def test_dirclass(self):
        self.assert_(self.app.directories.dirclass is DGDirectory)


class TCDupeGuru_renameSelected(TestCase):
    def setUp(self):
        p = Path(tempfile.mkdtemp())
        fp = open(str(p + 'foo bar 1'),mode='w')
        fp.close()
        fp = open(str(p + 'foo bar 2'),mode='w')
        fp.close()
        fp = open(str(p + 'foo bar 3'),mode='w')
        fp.close()
        refdir = hsfs.phys.Directory(None,str(p))
        matches = engine.MatchFactory().getmatches(refdir.files)
        groups = engine.get_groups(matches)
        g = groups[0]
        g.prioritize(lambda x:x.name)
        app = DupeGuru()
        app.results.groups = groups
        self.app = app
        self.groups = groups
        self.p = p
        self.refdir = refdir

    def tearDown(self):
        shutil.rmtree(str(self.p))

    def test_simple(self):
        app = self.app
        refdir = self.refdir
        g = self.groups[0]
        app.SelectPowerMarkerNodePaths(r2np([0]))
        self.assert_(app.RenameSelected('renamed'))
        self.assert_('renamed' in refdir)
        self.assert_('foo bar 2' not in refdir)
        self.assert_(g.dupes[0] is refdir['renamed'])
        self.assert_(g.dupes[0] in refdir)

    def test_none_selected(self):
        app = self.app
        refdir = self.refdir
        g = self.groups[0]
        app.SelectPowerMarkerNodePaths([])
        self.mock(logging, 'warning', log_calls(lambda msg: None))
        self.assert_(not app.RenameSelected('renamed'))
        msg = logging.warning.calls[0]['msg']
        self.assertEqual('dupeGuru Warning: list index out of range', msg)
        self.assert_('renamed' not in refdir)
        self.assert_('foo bar 2' in refdir)
        self.assert_(g.dupes[0] is refdir['foo bar 2'])

    def test_name_already_exists(self):
        app = self.app
        refdir = self.refdir
        g = self.groups[0]
        app.SelectPowerMarkerNodePaths(r2np([0]))
        self.mock(logging, 'warning', log_calls(lambda msg: None))
        self.assert_(not app.RenameSelected('foo bar 1'))
        msg = logging.warning.calls[0]['msg']
        self.assert_(msg.startswith('dupeGuru Warning: \'foo bar 2\' already exists in'))
        self.assert_('foo bar 1' in refdir)
        self.assert_('foo bar 2' in refdir)
        self.assert_(g.dupes[0] is refdir['foo bar 2'])

py/app_me_cocoa.py (new file, 68 lines)
@@ -0,0 +1,68 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.app_me_cocoa
Created By: Virgil Dupras
Created On: 2006/11/16
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2006 Hardcoded Software (http://www.hardcoded.net)
"""
import os.path as op
import logging
from appscript import app, k, CommandError
import time

from hsutil.cocoa import as_fetch
import hsfs.phys.music

import app_cocoa, data_me, scanner

JOB_REMOVE_DEAD_TRACKS = 'jobRemoveDeadTracks'
JOB_SCAN_DEAD_TRACKS = 'jobScanDeadTracks'

app_cocoa.JOBID2TITLE.update({
    JOB_REMOVE_DEAD_TRACKS: "Removing dead tracks from your iTunes Library",
    JOB_SCAN_DEAD_TRACKS: "Scanning the iTunes Library",
})

class DupeGuruME(app_cocoa.DupeGuru):
    def __init__(self):
        app_cocoa.DupeGuru.__init__(self, data_me, 'dupeguru_me', appid=1)
        self.scanner = scanner.ScannerME()
        self.directories.dirclass = hsfs.phys.music.Directory
        self.dead_tracks = []

    def remove_dead_tracks(self):
        def do(j):
            a = app('iTunes')
            for index, track in enumerate(j.iter_with_progress(self.dead_tracks)):
                if index % 100 == 0:
                    time.sleep(.1)
                try:
                    track.delete()
                except CommandError as e:
                    logging.warning('Error while trying to remove a track from iTunes: %s' % unicode(e))

        self._start_job(JOB_REMOVE_DEAD_TRACKS, do)

    def scan_dead_tracks(self):
        def do(j):
            a = app('iTunes')
            try:
                [source] = [s for s in a.sources() if s.kind() == k.library]
                [library] = source.library_playlists()
            except ValueError:
                logging.warning('Some unexpected iTunes configuration encountered')
                return
            self.dead_tracks = []
            tracks = as_fetch(library.file_tracks, k.file_track)
            for index, track in enumerate(j.iter_with_progress(tracks)):
                if index % 100 == 0:
                    time.sleep(.1)
                if track.location() == k.missing_value:
                    self.dead_tracks.append(track)
            logging.info('Found %d dead tracks' % len(self.dead_tracks))

        self._start_job(JOB_SCAN_DEAD_TRACKS, do)

py/app_pe_cocoa.py (new file, 212 lines)
@@ -0,0 +1,212 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.app_pe_cocoa
Created By: Virgil Dupras
Created On: 2006/11/13
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2006 Hardcoded Software (http://www.hardcoded.net)
"""
import os
import os.path as op
import logging
import plistlib

import objc
from Foundation import *
from AppKit import *
from appscript import app, k

from hsutil import job, io
import hsfs as fs
from hsfs import phys
from hsutil import files
from hsutil.str import get_file_ext
from hsutil.path import Path
from hsutil.cocoa import as_fetch

import app_cocoa, data_pe, directories, picture.matchbase
from picture.cache import string_to_colors, Cache

mainBundle = NSBundle.mainBundle()
PictureBlocks = mainBundle.classNamed_('PictureBlocks')
assert PictureBlocks is not None

class Photo(phys.File):
    cls_info_map = {
        'size': fs.IT_ATTRS,
        'ctime': fs.IT_ATTRS,
        'mtime': fs.IT_ATTRS,
        'md5': fs.IT_MD5,
        'md5partial': fs.IT_MD5,
        'dimensions': fs.IT_EXTRA,
    }

    def _initialize_info(self,section):
        super(Photo, self)._initialize_info(section)
        if section == fs.IT_EXTRA:
            self._info.update({
                'dimensions': (0,0),
            })

    def _read_info(self,section):
        super(Photo, self)._read_info(section)
        if section == fs.IT_EXTRA:
            size = PictureBlocks.getImageSize_(unicode(self.path))
            self._info['dimensions'] = (size.width, size.height)

    def get_blocks(self, block_count_per_side):
        try:
            blocks = PictureBlocks.getBlocksFromImagePath_blockCount_scanArea_(unicode(self.path), block_count_per_side, 0)
        except Exception, e:
            raise IOError('The reading of "%s" failed with "%s"' % (unicode(self.path), unicode(e)))
        if not blocks:
            raise IOError('The picture %s could not be read' % unicode(self.path))
        return string_to_colors(blocks)


class IPhoto(Photo):
    def __init__(self, parent, whole_path):
        super(IPhoto, self).__init__(parent, whole_path[-1])
        self.whole_path = whole_path

    def _build_path(self):
        return self.whole_path

    @property
    def display_path(self):
        return super(IPhoto, self)._build_path()


class Directory(phys.Directory):
    cls_file_class = Photo
    cls_supported_exts = ('png', 'jpg', 'jpeg', 'gif', 'psd', 'bmp', 'tiff', 'nef', 'cr2')

    def _fetch_subitems(self):
        subdirs, subfiles = super(Directory,self)._fetch_subitems()
        return subdirs, [name for name in subfiles if get_file_ext(name) in self.cls_supported_exts]


class IPhotoLibrary(fs.Directory):
    def __init__(self, plistpath):
        self.plistpath = plistpath
        self.refpath = plistpath[:-1]
        # the AlbumData.xml file lives right in the library path
        super(IPhotoLibrary, self).__init__(None, 'iPhoto Library')

    def _update_photo(self, photo_data):
        if photo_data['MediaType'] != 'Image':
            return
        photo_path = Path(photo_data['ImagePath'])
        subpath = photo_path[len(self.refpath):-1]
        subdir = self
        for element in subpath:
            try:
                subdir = subdir[element]
            except KeyError:
                subdir = fs.Directory(subdir, element)
        IPhoto(subdir, photo_path)

    def update(self):
        self.clear()
        s = open(unicode(self.plistpath)).read()
        # There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
        s = s.replace('\x10', '')
        plist = plistlib.readPlistFromString(s)
        for photo_data in plist['Master Image List'].values():
            self._update_photo(photo_data)

    def force_update(self): # Don't update
        pass


class DupeGuruPE(app_cocoa.DupeGuru):
    def __init__(self):
        app_cocoa.DupeGuru.__init__(self, data_pe, 'dupeguru_pe', appid=5)
        self.scanner.match_factory = picture.matchbase.AsyncMatchFactory()
        self.directories.dirclass = Directory
        self.directories.special_dirclasses[Path('iPhoto Library')] = lambda _, __: self._create_iphoto_library()
        p = op.join(self.appdata, 'cached_pictures.db')
        self.scanner.match_factory.cached_blocks = Cache(p)

    def _create_iphoto_library(self):
        ud = NSUserDefaults.standardUserDefaults()
        prefs = ud.persistentDomainForName_('com.apple.iApps')
        plisturl = NSURL.URLWithString_(prefs['iPhotoRecentDatabases'][0])
        plistpath = Path(plisturl.path())
        return IPhotoLibrary(plistpath)

    def _do_delete(self, j):
        def op(dupe):
            j.add_progress()
            return self._do_delete_dupe(dupe)

        marked = [dupe for dupe in self.results.dupes if self.results.is_marked(dupe)]
        self.path2iphoto = {}
        if any(isinstance(dupe, IPhoto) for dupe in marked):
            a = app('iPhoto')
            a.select(a.photo_library_album())
            photos = as_fetch(a.photo_library_album().photos, k.item)
            for photo in photos:
                self.path2iphoto[photo.image_path()] = photo
        self.last_op_error_count = self.results.perform_on_marked(op, True)
        del self.path2iphoto

    def _do_delete_dupe(self, dupe):
        if isinstance(dupe, IPhoto):
            photo = self.path2iphoto[unicode(dupe.path)]
            app('iPhoto').remove(photo)
            return True
        else:
            return app_cocoa.DupeGuru._do_delete_dupe(self, dupe)

    def _do_load(self, j):
        self.directories.LoadFromFile(op.join(self.appdata, 'last_directories.xml'))
        for d in self.directories:
            if isinstance(d, IPhotoLibrary):
                d.update()
        self.results.load_from_xml(op.join(self.appdata, 'last_results.xml'), self._get_file, j)

    def _get_file(self, str_path):
        p = Path(str_path)
        for d in self.directories:
            result = None
            if p in d.path:
                result = d.find_path(p[d.path:])
            if isinstance(d, IPhotoLibrary) and p in d.refpath:
                result = d.find_path(p[d.refpath:])
            if result is not None:
                return result

    def AddDirectory(self, d):
        try:
            added = self.directories.add_path(Path(d))
            if d == 'iPhoto Library':
                added.update()
            return 0
        except directories.AlreadyThereError:
            return 1

    def CopyOrMove(self, dupe, copy, destination, dest_type):
        if isinstance(dupe, IPhoto):
            copy = True
        return app_cocoa.DupeGuru.CopyOrMove(self, dupe, copy, destination, dest_type)

    def start_scanning(self):
        for directory in self.directories:
            if isinstance(directory, IPhotoLibrary):
                self.directories.SetState(directory.refpath, directories.STATE_EXCLUDED)
        return app_cocoa.DupeGuru.start_scanning(self)

    def selected_dupe_path(self):
        if not self.selected_dupes:
            return None
        return self.selected_dupes[0].path

    def selected_dupe_ref_path(self):
        if not self.selected_dupes:
            return None
        ref = self.results.get_group_of_duplicate(self.selected_dupes[0]).ref
        return ref.path

py/app_se_cocoa.py (new file, 13 lines)
@@ -0,0 +1,13 @@
#!/usr/bin/env python
# Unit Name: app_se_cocoa
# Created By: Virgil Dupras
# Created On: 2009-05-24
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)

import app_cocoa, data

class DupeGuru(app_cocoa.DupeGuru):
    def __init__(self):
        app_cocoa.DupeGuru.__init__(self, data, 'dupeguru', appid=4)

py/app_test.py (new file, 137 lines)
@@ -0,0 +1,137 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.tests.app
Created By: Virgil Dupras
Created On: 2007-06-23
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
$Revision: 4388 $
Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
"""
import unittest
import os

from hsutil.testcase import TestCase
from hsutil import io
from hsutil.path import Path
from hsutil.decorators import log_calls
import hsfs as fs
import hsfs.phys
import hsutil.files
from hsutil.job import nulljob

from . import data, app
from .app import DupeGuru as DupeGuruBase

class DupeGuru(DupeGuruBase):
    def __init__(self):
        DupeGuruBase.__init__(self, data, '/tmp', appid=4)

    def _start_job(self, jobid, func):
        func(nulljob)


class TCDupeGuru(TestCase):
    cls_tested_module = app
    def test_ApplyFilter_calls_results_apply_filter(self):
        app = DupeGuru()
        self.mock(app.results, 'apply_filter', log_calls(app.results.apply_filter))
        app.ApplyFilter('foo')
        self.assertEqual(2, len(app.results.apply_filter.calls))
        call = app.results.apply_filter.calls[0]
        self.assert_(call['filter_str'] is None)
        call = app.results.apply_filter.calls[1]
        self.assertEqual('foo', call['filter_str'])

    def test_ApplyFilter_escapes_regexp(self):
        app = DupeGuru()
        self.mock(app.results, 'apply_filter', log_calls(app.results.apply_filter))
        app.ApplyFilter('()[]\\.|+?^abc')
        call = app.results.apply_filter.calls[1]
        self.assertEqual('\\(\\)\\[\\]\\\\\\.\\|\\+\\?\\^abc', call['filter_str'])
        app.ApplyFilter('(*)') # In "simple mode", we want the * to behave as a wilcard
        call = app.results.apply_filter.calls[3]
        self.assertEqual('\(.*\)', call['filter_str'])
        app.options['escape_filter_regexp'] = False
        app.ApplyFilter('(abc)')
        call = app.results.apply_filter.calls[5]
        self.assertEqual('(abc)', call['filter_str'])

    def test_CopyOrMove(self):
        # The goal here is just to have a test for a previous blowup I had. I know my test coverage
        # for this unit is pathetic. What's done is done. My approach now is to add tests for
        # every change I want to make. The blowup was caused by a missing import.
        dupe_parent = fs.Directory(None, 'foo')
        dupe = fs.File(dupe_parent, 'bar')
        dupe.copy = log_calls(lambda dest, newname: None)
        self.mock(hsutil.files, 'copy', log_calls(lambda source_path, dest_path: None))
        self.mock(os, 'makedirs', lambda path: None) # We don't want the test to create that fake directory
        self.mock(fs.phys, 'Directory', fs.Directory) # We don't want an error because makedirs didn't work
        app = DupeGuru()
        app.CopyOrMove(dupe, True, 'some_destination', 0)
        self.assertEqual(1, len(hsutil.files.copy.calls))
        call = hsutil.files.copy.calls[0]
        self.assertEqual('some_destination', call['dest_path'])
        self.assertEqual(dupe.path, call['source_path'])

    def test_CopyOrMove_clean_empty_dirs(self):
        tmppath = Path(self.tmpdir())
        sourcepath = tmppath + 'source'
        io.mkdir(sourcepath)
        io.open(sourcepath + 'myfile', 'w')
        tmpdir = hsfs.phys.Directory(None, unicode(tmppath))
        myfile = tmpdir['source']['myfile']
        app = DupeGuru()
        self.mock(app, 'clean_empty_dirs', log_calls(lambda path: None))
        app.CopyOrMove(myfile, False, tmppath + 'dest', 0)
        calls = app.clean_empty_dirs.calls
        self.assertEqual(1, len(calls))
        self.assertEqual(sourcepath, calls[0]['path'])

    def test_Scan_with_objects_evaluating_to_false(self):
        # At some point, any() was used in a wrong way that made Scan() wrongly return 1
        app = DupeGuru()
        f1, f2 = [fs.File(None, 'foo') for i in range(2)]
        f1.is_ref, f2.is_ref = (False, False)
        assert not (bool(f1) and bool(f2))
        app.directories.get_files = lambda: [f1, f2]
        app.directories._dirs.append('this is just so Scan() doesnt return 3')
        app.start_scanning() # no exception


class TCDupeGuru_clean_empty_dirs(TestCase):
    cls_tested_module = app
    def setUp(self):
        self.mock(hsutil.files, 'delete_if_empty', log_calls(lambda path, files_to_delete=[]: None))
        self.app = DupeGuru()

    def test_option_off(self):
        self.app.clean_empty_dirs(Path('/foo/bar'))
        self.assertEqual(0, len(hsutil.files.delete_if_empty.calls))

    def test_option_on(self):
        self.app.options['clean_empty_dirs'] = True
        self.app.clean_empty_dirs(Path('/foo/bar'))
        calls = hsutil.files.delete_if_empty.calls
        self.assertEqual(1, len(calls))
        self.assertEqual(Path('/foo/bar'), calls[0]['path'])
        self.assertEqual(['.DS_Store'], calls[0]['files_to_delete'])

    def test_recurse_up(self):
        # delete_if_empty must be recursively called up in the path until it returns False
        @log_calls
        def mock_delete_if_empty(path, files_to_delete=[]):
            return len(path) > 1

        self.mock(hsutil.files, 'delete_if_empty', mock_delete_if_empty)
        self.app.options['clean_empty_dirs'] = True
        self.app.clean_empty_dirs(Path('not-empty/empty/empty'))
        calls = hsutil.files.delete_if_empty.calls
        self.assertEqual(3, len(calls))
        self.assertEqual(Path('not-empty/empty/empty'), calls[0]['path'])
        self.assertEqual(Path('not-empty/empty'), calls[1]['path'])
        self.assertEqual(Path('not-empty'), calls[2]['path'])


if __name__ == '__main__':
    unittest.main()

py/data.py (new file, 105 lines)
@@ -0,0 +1,105 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.data
Created By: Virgil Dupras
Created On: 2006/03/15
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""

from hsutil.str import format_time, FT_DECIMAL, format_size

import time

def format_path(p):
    return unicode(p[:-1])

def format_timestamp(t, delta):
    if delta:
        return format_time(t, FT_DECIMAL)
    else:
        if t > 0:
            return time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(t))
        else:
            return '---'

def format_words(w):
    def do_format(w):
        if isinstance(w, list):
            return '(%s)' % ', '.join(do_format(item) for item in w)
        else:
            return w.replace('\n', ' ')

    return ', '.join(do_format(item) for item in w)

def format_perc(p):
    return "%0.0f" % p

def format_dupe_count(c):
    return str(c) if c else '---'

def cmp_value(value):
    return value.lower() if isinstance(value, basestring) else value

COLUMNS = [
    {'attr':'name','display':'Filename'},
    {'attr':'path','display':'Directory'},
    {'attr':'size','display':'Size (KB)'},
    {'attr':'extension','display':'Kind'},
    {'attr':'ctime','display':'Creation'},
    {'attr':'mtime','display':'Modification'},
    {'attr':'percentage','display':'Match %'},
    {'attr':'words','display':'Words Used'},
    {'attr':'dupe_count','display':'Dupe Count'},
]

def GetDisplayInfo(dupe, group, delta=False):
    if (dupe is None) or (group is None):
        return ['---'] * len(COLUMNS)
    size = dupe.size
    ctime = dupe.ctime
    mtime = dupe.mtime
    m = group.get_match_of(dupe)
    if m:
        percentage = m.percentage
        dupe_count = 0
        if delta:
            r = group.ref
            size -= r.size
            ctime -= r.ctime
            mtime -= r.mtime
    else:
        percentage = group.percentage
        dupe_count = len(group.dupes)
    return [
        dupe.name,
        format_path(dupe.path),
        format_size(size, 0, 1, False),
        dupe.extension,
        format_timestamp(ctime, delta and m),
        format_timestamp(mtime, delta and m),
        format_perc(percentage),
        format_words(dupe.words),
        format_dupe_count(dupe_count)
    ]

def GetDupeSortKey(dupe, get_group, key, delta):
    if key == 6:
        m = get_group().get_match_of(dupe)
        return m.percentage
    if key == 8:
        return 0
    r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
    if delta and (key in (2, 4, 5)):
        r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
    return r

def GetGroupSortKey(group, key):
    if key == 6:
        return group.percentage
    if key == 8:
        return len(group)
    return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

py/data_me.py (new file, 100 lines)
@@ -0,0 +1,100 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.data
Created By: Virgil Dupras
Created On: 2006/03/15
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""

from hsutil.str import format_time, FT_MINUTES, format_size
from .data import (format_path, format_timestamp, format_words, format_perc,
    format_dupe_count, cmp_value)

COLUMNS = [
    {'attr':'name','display':'Filename'},
    {'attr':'path','display':'Directory'},
    {'attr':'size','display':'Size (MB)'},
    {'attr':'duration','display':'Time'},
    {'attr':'bitrate','display':'Bitrate'},
    {'attr':'samplerate','display':'Sample Rate'},
    {'attr':'extension','display':'Kind'},
    {'attr':'ctime','display':'Creation'},
    {'attr':'mtime','display':'Modification'},
    {'attr':'title','display':'Title'},
    {'attr':'artist','display':'Artist'},
    {'attr':'album','display':'Album'},
    {'attr':'genre','display':'Genre'},
    {'attr':'year','display':'Year'},
    {'attr':'track','display':'Track Number'},
    {'attr':'comment','display':'Comment'},
    {'attr':'percentage','display':'Match %'},
    {'attr':'words','display':'Words Used'},
    {'attr':'dupe_count','display':'Dupe Count'},
]

def GetDisplayInfo(dupe, group, delta=False):
    if (dupe is None) or (group is None):
        return ['---'] * len(COLUMNS)
    size = dupe.size
    duration = dupe.duration
    bitrate = dupe.bitrate
    samplerate = dupe.samplerate
    ctime = dupe.ctime
    mtime = dupe.mtime
    m = group.get_match_of(dupe)
    if m:
        percentage = m.percentage
        dupe_count = 0
        if delta:
            r = group.ref
            size -= r.size
            duration -= r.duration
            bitrate -= r.bitrate
            samplerate -= r.samplerate
            ctime -= r.ctime
            mtime -= r.mtime
    else:
        percentage = group.percentage
        dupe_count = len(group.dupes)
    return [
        dupe.name,
        format_path(dupe.path),
        format_size(size, 2, 2, False),
        format_time(duration, FT_MINUTES),
        str(bitrate),
        str(samplerate),
        dupe.extension,
        format_timestamp(ctime,delta and m),
        format_timestamp(mtime,delta and m),
        dupe.title,
        dupe.artist,
        dupe.album,
        dupe.genre,
        dupe.year,
        str(dupe.track),
        dupe.comment,
        format_perc(percentage),
        format_words(dupe.words),
        format_dupe_count(dupe_count)
    ]

def GetDupeSortKey(dupe, get_group, key, delta):
    if key == 16:
        m = get_group().get_match_of(dupe)
        return m.percentage
    if key == 18:
        return 0
    r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
    if delta and (key in (2, 3, 4, 7, 8)):
        r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
    return r

def GetGroupSortKey(group, key):
    if key == 16:
        return group.percentage
    if key == 18:
        return len(group)
    return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

py/data_pe.py (new file, 77 lines)
@@ -0,0 +1,77 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.data
Created By: Virgil Dupras
Created On: 2006/03/15
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
from hsutil.str import format_size
from .data import format_path, format_timestamp, format_perc, format_dupe_count, cmp_value

def format_dimensions(dimensions):
    return '%d x %d' % (dimensions[0], dimensions[1])

COLUMNS = [
    {'attr':'name','display':'Filename'},
    {'attr':'path','display':'Directory'},
    {'attr':'size','display':'Size (KB)'},
    {'attr':'extension','display':'Kind'},
    {'attr':'dimensions','display':'Dimensions'},
    {'attr':'ctime','display':'Creation'},
    {'attr':'mtime','display':'Modification'},
    {'attr':'percentage','display':'Match %'},
    {'attr':'dupe_count','display':'Dupe Count'},
]

def GetDisplayInfo(dupe,group,delta=False):
    if (dupe is None) or (group is None):
        return ['---'] * len(COLUMNS)
    size = dupe.size
    ctime = dupe.ctime
    mtime = dupe.mtime
    m = group.get_match_of(dupe)
    if m:
        percentage = m.percentage
        dupe_count = 0
        if delta:
            r = group.ref
            size -= r.size
            ctime -= r.ctime
            mtime -= r.mtime
    else:
        percentage = group.percentage
        dupe_count = len(group.dupes)
    dupe_path = getattr(dupe, 'display_path', dupe.path)
    return [
        dupe.name,
        format_path(dupe_path),
        format_size(size, 0, 1, False),
        dupe.extension,
        format_dimensions(dupe.dimensions),
        format_timestamp(ctime, delta and m),
        format_timestamp(mtime, delta and m),
        format_perc(percentage),
        format_dupe_count(dupe_count)
    ]

def GetDupeSortKey(dupe, get_group, key, delta):
    if key == 7:
        m = get_group().get_match_of(dupe)
        return m.percentage
    if key == 8:
        return 0
    r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
    if delta and (key in (2, 5, 6)):
        r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
    return r

def GetGroupSortKey(group, key):
    if key == 7:
        return group.percentage
    if key == 8:
        return len(group)
    return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

py/directories.py (new file, 161 lines; listing truncated here)
@@ -0,0 +1,161 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.directories
Created By: Virgil Dupras
Created On: 2006/02/27
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
$Revision: 4388 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import xml.dom.minidom

from hsfs import phys
import hsfs as fs
from hsutil.files import FileOrPath
from hsutil.path import Path

(STATE_NORMAL,
 STATE_REFERENCE,
 STATE_EXCLUDED) = range(3)

class AlreadyThereError(Exception):
    """The path being added is already in the directory list"""

class InvalidPathError(Exception):
    """The path being added is invalid"""

class Directories(object):
    #---Override
    def __init__(self):
        self._dirs = []
        self.states = {}
        self.dirclass = phys.Directory
        self.special_dirclasses = {}

    def __contains__(self,path):
        for d in self._dirs:
            if path in d.path:
                return True
        return False

    def __delitem__(self,key):
        self._dirs.__delitem__(key)

    def __getitem__(self,key):
        return self._dirs.__getitem__(key)

    def __len__(self):
        return len(self._dirs)

    #---Private
    def _get_files(self, from_dir, state=STATE_NORMAL):
        state = self.states.get(from_dir.path, state)
        result = []
        for subdir in from_dir.dirs:
            for file in self._get_files(subdir, state):
                yield file
        if state != STATE_EXCLUDED:
            for file in from_dir.files:
                file.is_ref = state == STATE_REFERENCE
                yield file

    #---Public
    def add_path(self, path):
        """Adds 'path' to self, if not already there.

        Raises AlreadyThereError if 'path' is already in self. If path is a directory containing
        some of the directories already present in self, 'path' will be added, but all directories
        under it will be removed. Can also raise InvalidPathError if 'path' does not exist.
        """
        if path in self:
            raise AlreadyThereError
        self._dirs = [d for d in self._dirs if d.path not in path]
        try:
            dirclass = self.special_dirclasses.get(path, self.dirclass)
            d = dirclass(None, unicode(path))
            d[:] #If an InvalidPath exception has to be raised, it will be raised here
            self._dirs.append(d)
            return d
        except fs.InvalidPath:
            raise InvalidPathError

    def get_files(self):
        """Returns a list of all files that are not excluded.

        Returned files also have their 'is_ref' attr set.
        """
        for d in self._dirs:
            d.force_update()
            try:
                for file in self._get_files(d):
                    yield file
            except fs.InvalidPath:
                pass

    def GetState(self, path):
        """Returns the state of 'path' (One of the STATE_* const.)

        Raises LookupError if 'path' is not in self.
        """
        if path not in self:
            raise LookupError("The path '%s' is not in the directory list." % str(path))
        try:
            return self.states[path]
        except KeyError:
            if path[-1].startswith('.'): # hidden
                return STATE_EXCLUDED
            parent = path[:-1]
            if parent in self:
                return self.GetState(parent)
            else:
                return STATE_NORMAL

    def LoadFromFile(self,infile):
        try:
            doc = xml.dom.minidom.parse(infile)
        except:
            return
        root_dir_nodes = doc.getElementsByTagName('root_directory')
        for rdn in root_dir_nodes:
|
||||
if not rdn.getAttributeNode('path'):
|
||||
continue
|
||||
path = rdn.getAttributeNode('path').nodeValue
|
||||
try:
|
||||
self.add_path(Path(path))
|
||||
except (AlreadyThereError,InvalidPathError):
|
||||
pass
|
||||
state_nodes = doc.getElementsByTagName('state')
|
||||
for sn in state_nodes:
|
||||
if not (sn.getAttributeNode('path') and sn.getAttributeNode('value')):
|
||||
continue
|
||||
path = sn.getAttributeNode('path').nodeValue
|
||||
state = sn.getAttributeNode('value').nodeValue
|
||||
self.SetState(Path(path), int(state))
|
||||
|
||||
def Remove(self,directory):
|
||||
self._dirs.remove(directory)
|
||||
|
||||
def SaveToFile(self,outfile):
|
||||
with FileOrPath(outfile, 'wb') as fp:
|
||||
doc = xml.dom.minidom.Document()
|
||||
root = doc.appendChild(doc.createElement('directories'))
|
||||
for root_dir in self:
|
||||
root_dir_node = root.appendChild(doc.createElement('root_directory'))
|
||||
root_dir_node.setAttribute('path', unicode(root_dir.path).encode('utf-8'))
|
||||
for path,state in self.states.iteritems():
|
||||
state_node = root.appendChild(doc.createElement('state'))
|
||||
state_node.setAttribute('path', unicode(path).encode('utf-8'))
|
||||
state_node.setAttribute('value', str(state))
|
||||
doc.writexml(fp,'\t','\t','\n',encoding='utf-8')
|
||||
|
||||
def SetState(self,path,state):
|
||||
try:
|
||||
if self.GetState(path) == state:
|
||||
return
|
||||
self.states[path] = state
|
||||
if (self.GetState(path[:-1]) == state) and (not path[-1].startswith('.')):
|
||||
del self.states[path]
|
||||
except LookupError:
|
||||
pass
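
To make the intent of the class concrete, here is a minimal usage sketch. It assumes hsfs/hsutil are installed and the py package is importable; the folder names are hypothetical and would have to exist on disk.

from hsutil.path import Path
from py import directories

d = directories.Directories()
d.add_path(Path('/Users/me/Pictures'))                 # scanned normally
d.SetState(Path('/Users/me/Pictures/Originals'),
           directories.STATE_REFERENCE)                # matched, but never deleted
d.SetState(Path('/Users/me/Pictures/Cache'),
           directories.STATE_EXCLUDED)                 # skipped entirely

for f in d.get_files():       # every non-excluded file, with is_ref already set
    print f.path, f.is_ref

d.SaveToFile('last_directories.xml')   # LoadFromFile() reads the same format back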
280
py/directories_test.py
Normal file
@@ -0,0 +1,280 @@
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: dupeguru.tests.directories
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/02/27
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-29 08:51:14 +0200 (Fri, 29 May 2009) $
|
||||
$Revision: 4398 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import unittest
|
||||
import os.path as op
|
||||
import os
|
||||
import time
|
||||
import shutil
|
||||
|
||||
from hsutil import job, io
|
||||
from hsutil.path import Path
|
||||
from hsutil.testcase import TestCase
|
||||
import hsfs.phys
|
||||
from hsfs.phys import phys_test
|
||||
|
||||
from directories import *
|
||||
|
||||
testpath = Path(TestCase.datadirpath())
|
||||
|
||||
class TCDirectories(TestCase):
|
||||
def test_empty(self):
|
||||
d = Directories()
|
||||
self.assertEqual(0,len(d))
|
||||
self.assert_('foobar' not in d)
|
||||
|
||||
def test_add_path(self):
|
||||
d = Directories()
|
||||
p = testpath + 'utils'
|
||||
added = d.add_path(p)
|
||||
self.assertEqual(1,len(d))
|
||||
self.assert_(p in d)
|
||||
self.assert_((p + 'foobar') in d)
|
||||
self.assert_(p[:-1] not in d)
|
||||
self.assertEqual(p,added.path)
|
||||
self.assert_(d[0] is added)
|
||||
p = self.tmppath()
|
||||
d.add_path(p)
|
||||
self.assertEqual(2,len(d))
|
||||
self.assert_(p in d)
|
||||
|
||||
def test_AddPath_when_path_is_already_there(self):
|
||||
d = Directories()
|
||||
p = testpath + 'utils'
|
||||
d.add_path(p)
|
||||
self.assertRaises(AlreadyThereError, d.add_path, p)
|
||||
self.assertRaises(AlreadyThereError, d.add_path, p + 'foobar')
|
||||
self.assertEqual(1, len(d))
|
||||
|
||||
def test_AddPath_containing_paths_already_there(self):
|
||||
d = Directories()
|
||||
d.add_path(testpath + 'utils')
|
||||
self.assertEqual(1, len(d))
|
||||
added = d.add_path(testpath)
|
||||
self.assertEqual(1, len(d))
|
||||
self.assert_(added is d[0])
|
||||
|
||||
def test_AddPath_non_latin(self):
|
||||
p = Path(self.tmpdir())
|
||||
to_add = p + u'unicode\u201a'
|
||||
os.mkdir(unicode(to_add))
|
||||
d = Directories()
|
||||
try:
|
||||
d.add_path(to_add)
|
||||
except UnicodeDecodeError:
|
||||
self.fail()
|
||||
|
||||
def test_del(self):
|
||||
d = Directories()
|
||||
d.add_path(testpath + 'utils')
|
||||
try:
|
||||
del d[1]
|
||||
self.fail()
|
||||
except IndexError:
|
||||
pass
|
||||
d.add_path(self.tmppath())
|
||||
del d[1]
|
||||
self.assertEqual(1, len(d))
|
||||
|
||||
def test_states(self):
|
||||
d = Directories()
|
||||
p = testpath + 'utils'
|
||||
d.add_path(p)
|
||||
self.assertEqual(STATE_NORMAL,d.GetState(p))
|
||||
d.SetState(p,STATE_REFERENCE)
|
||||
self.assertEqual(STATE_REFERENCE,d.GetState(p))
|
||||
self.assertEqual(STATE_REFERENCE,d.GetState(p + 'dir1'))
|
||||
self.assertEqual(1,len(d.states))
|
||||
self.assertEqual(p,d.states.keys()[0])
|
||||
self.assertEqual(STATE_REFERENCE,d.states[p])
|
||||
|
||||
def test_GetState_with_path_not_there(self):
|
||||
d = Directories()
|
||||
d.add_path(testpath + 'utils')
|
||||
self.assertRaises(LookupError,d.GetState,testpath)
|
||||
|
||||
def test_states_remain_when_larger_directory_eat_smaller_ones(self):
|
||||
d = Directories()
|
||||
p = testpath + 'utils'
|
||||
d.add_path(p)
|
||||
d.SetState(p,STATE_EXCLUDED)
|
||||
d.add_path(testpath)
|
||||
d.SetState(testpath,STATE_REFERENCE)
|
||||
self.assertEqual(STATE_EXCLUDED,d.GetState(p))
|
||||
self.assertEqual(STATE_EXCLUDED,d.GetState(p + 'dir1'))
|
||||
self.assertEqual(STATE_REFERENCE,d.GetState(testpath))
|
||||
|
||||
def test_SetState_keep_state_dict_size_to_minimum(self):
|
||||
d = Directories()
|
||||
p = Path(phys_test.create_fake_fs(self.tmpdir()))
|
||||
d.add_path(p)
|
||||
d.SetState(p,STATE_REFERENCE)
|
||||
d.SetState(p + 'dir1',STATE_REFERENCE)
|
||||
self.assertEqual(1,len(d.states))
|
||||
self.assertEqual(STATE_REFERENCE,d.GetState(p + 'dir1'))
|
||||
d.SetState(p + 'dir1',STATE_NORMAL)
|
||||
self.assertEqual(2,len(d.states))
|
||||
self.assertEqual(STATE_NORMAL,d.GetState(p + 'dir1'))
|
||||
d.SetState(p + 'dir1',STATE_REFERENCE)
|
||||
self.assertEqual(1,len(d.states))
|
||||
self.assertEqual(STATE_REFERENCE,d.GetState(p + 'dir1'))
|
||||
|
||||
def test_get_files(self):
|
||||
d = Directories()
|
||||
p = Path(phys_test.create_fake_fs(self.tmpdir()))
|
||||
d.add_path(p)
|
||||
d.SetState(p + 'dir1',STATE_REFERENCE)
|
||||
d.SetState(p + 'dir2',STATE_EXCLUDED)
|
||||
files = d.get_files()
|
||||
self.assertEqual(5, len(list(files)))
|
||||
for f in files:
|
||||
if f.parent.path == p + 'dir1':
|
||||
self.assert_(f.is_ref)
|
||||
else:
|
||||
self.assert_(not f.is_ref)
|
||||
|
||||
def test_get_files_with_inherited_exclusion(self):
|
||||
d = Directories()
|
||||
p = testpath + 'utils'
|
||||
d.add_path(p)
|
||||
d.SetState(p,STATE_EXCLUDED)
|
||||
self.assertEqual([], list(d.get_files()))
|
||||
|
||||
def test_save_and_load(self):
|
||||
d1 = Directories()
|
||||
d2 = Directories()
|
||||
p1 = self.tmppath()
|
||||
p2 = self.tmppath()
|
||||
d1.add_path(p1)
|
||||
d1.add_path(p2)
|
||||
d1.SetState(p1, STATE_REFERENCE)
|
||||
d1.SetState(p1 + 'dir1',STATE_EXCLUDED)
|
||||
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
|
||||
d1.SaveToFile(tmpxml)
|
||||
d2.LoadFromFile(tmpxml)
|
||||
self.assertEqual(2, len(d2))
|
||||
self.assertEqual(STATE_REFERENCE,d2.GetState(p1))
|
||||
self.assertEqual(STATE_EXCLUDED,d2.GetState(p1 + 'dir1'))
|
||||
|
||||
def test_invalid_path(self):
|
||||
d = Directories()
|
||||
p = Path('does_not_exist')
|
||||
self.assertRaises(InvalidPathError, d.add_path, p)
|
||||
self.assertEqual(0, len(d))
|
||||
|
||||
def test_SetState_on_invalid_path(self):
|
||||
d = Directories()
|
||||
try:
|
||||
d.SetState(Path('foobar',),STATE_NORMAL)
|
||||
except LookupError:
|
||||
self.fail()
|
||||
|
||||
def test_default_dirclass(self):
|
||||
self.assert_(Directories().dirclass is hsfs.phys.Directory)
|
||||
|
||||
def test_dirclass(self):
|
||||
class MySpecialDirclass(hsfs.phys.Directory): pass
|
||||
d = Directories()
|
||||
d.dirclass = MySpecialDirclass
|
||||
d.add_path(testpath)
|
||||
self.assert_(isinstance(d[0], MySpecialDirclass))
|
||||
|
||||
def test_LoadFromFile_with_invalid_path(self):
|
||||
#This test simulates a load from file resulting in a
|
||||
#InvalidPathError being raised. The other directories must still be loaded.
|
||||
d1 = Directories()
|
||||
d1.add_path(testpath + 'utils')
|
||||
#Will raise InvalidPath upon loading
|
||||
d1.add_path(self.tmppath()).name = 'does_not_exist'
|
||||
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
|
||||
d1.SaveToFile(tmpxml)
|
||||
d2 = Directories()
|
||||
d2.LoadFromFile(tmpxml)
|
||||
self.assertEqual(1, len(d2))
|
||||
|
||||
def test_LoadFromFile_with_same_paths(self):
|
||||
#This test simulates a load from file resulting in a
|
||||
#AlreadyThereError being raised. The other directories must still be loaded.
|
||||
d1 = Directories()
|
||||
p1 = self.tmppath()
|
||||
p2 = self.tmppath()
|
||||
d1.add_path(p1)
|
||||
d1.add_path(p2)
|
||||
#Will raise AlreadyThereError upon loading
|
||||
d1.add_path(self.tmppath()).name = unicode(p1)
|
||||
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
|
||||
d1.SaveToFile(tmpxml)
|
||||
d2 = Directories()
|
||||
d2.LoadFromFile(tmpxml)
|
||||
self.assertEqual(2, len(d2))
|
||||
|
||||
def test_Remove(self):
|
||||
d = Directories()
|
||||
d1 = d.add_path(self.tmppath())
|
||||
d2 = d.add_path(self.tmppath())
|
||||
d.Remove(d1)
|
||||
self.assertEqual(1, len(d))
|
||||
self.assert_(d[0] is d2)
|
||||
|
||||
def test_unicode_save(self):
|
||||
d = Directories()
|
||||
p1 = self.tmppath() + u'hello\xe9'
|
||||
io.mkdir(p1)
|
||||
io.mkdir(p1 + u'foo\xe9')
|
||||
d.add_path(p1)
|
||||
d.SetState(d[0][0].path, STATE_EXCLUDED)
|
||||
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
|
||||
try:
|
||||
d.SaveToFile(tmpxml)
|
||||
except UnicodeDecodeError:
|
||||
self.fail()
|
||||
|
||||
def test_get_files_refreshes_its_directories(self):
|
||||
d = Directories()
|
||||
p = Path(phys_test.create_fake_fs(self.tmpdir()))
|
||||
d.add_path(p)
|
||||
files = d.get_files()
|
||||
self.assertEqual(6, len(list(files)))
|
||||
time.sleep(1)
|
||||
os.remove(str(p + ('dir1','file1.test')))
|
||||
files = d.get_files()
|
||||
self.assertEqual(5, len(list(files)))
|
||||
|
||||
def test_get_files_does_not_choke_on_non_existing_directories(self):
|
||||
d = Directories()
|
||||
p = Path(self.tmpdir())
|
||||
d.add_path(p)
|
||||
io.rmtree(p)
|
||||
self.assertEqual([], list(d.get_files()))
|
||||
|
||||
def test_GetState_returns_excluded_by_default_for_hidden_directories(self):
|
||||
d = Directories()
|
||||
p = Path(self.tmpdir())
|
||||
hidden_dir_path = p + '.foo'
|
||||
io.mkdir(p + '.foo')
|
||||
d.add_path(p)
|
||||
self.assertEqual(d.GetState(hidden_dir_path), STATE_EXCLUDED)
|
||||
# But it can be overridden
|
||||
d.SetState(hidden_dir_path, STATE_NORMAL)
|
||||
self.assertEqual(d.GetState(hidden_dir_path), STATE_NORMAL)
|
||||
|
||||
def test_special_dirclasses(self):
|
||||
# if a path is in special_dirclasses, use this class instead
|
||||
class MySpecialDirclass(hsfs.phys.Directory): pass
|
||||
d = Directories()
|
||||
p1 = self.tmppath()
|
||||
p2 = self.tmppath()
|
||||
d.special_dirclasses[p1] = MySpecialDirclass
|
||||
self.assert_(isinstance(d.add_path(p2), hsfs.phys.Directory))
|
||||
self.assert_(isinstance(d.add_path(p1), MySpecialDirclass))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
360
py/engine.py
Normal file
@@ -0,0 +1,360 @@
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: dupeguru.engine
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/01/29
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: $
|
||||
$Revision: $
|
||||
Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
from __future__ import division
|
||||
import difflib
|
||||
import logging
|
||||
import string
|
||||
from collections import defaultdict, namedtuple
|
||||
from unicodedata import normalize
|
||||
|
||||
from hsutil.str import multi_replace
|
||||
from hsutil import job
|
||||
|
||||
(WEIGHT_WORDS,
|
||||
MATCH_SIMILAR_WORDS,
|
||||
NO_FIELD_ORDER) = range(3)
|
||||
|
||||
JOB_REFRESH_RATE = 100
|
||||
|
||||
def getwords(s):
|
||||
if isinstance(s, unicode):
|
||||
s = normalize('NFD', s)
|
||||
s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", ' ').lower()
|
||||
s = ''.join(c for c in s if c in string.ascii_letters + string.digits + string.whitespace)
|
||||
return filter(None, s.split(' ')) # filter() is to remove empty elements
|
||||
|
||||
def getfields(s):
|
||||
fields = [getwords(field) for field in s.split(' - ')]
|
||||
return filter(None, fields)
|
||||
|
||||
def unpack_fields(fields):
|
||||
result = []
|
||||
for field in fields:
|
||||
if isinstance(field, list):
|
||||
result += field
|
||||
else:
|
||||
result.append(field)
|
||||
return result
|
||||
|
||||
def compare(first, second, flags=()):
|
||||
"""Returns the % of words that match between first and second
|
||||
|
||||
The result is a int in the range 0..100.
|
||||
First and second can be either a string or a list.
|
||||
"""
|
||||
if not (first and second):
|
||||
return 0
|
||||
if any(isinstance(element, list) for element in first):
|
||||
return compare_fields(first, second, flags)
|
||||
second = second[:] #We must use a copy of second because we remove items from it
|
||||
match_similar = MATCH_SIMILAR_WORDS in flags
|
||||
weight_words = WEIGHT_WORDS in flags
|
||||
joined = first + second
|
||||
total_count = (sum(len(word) for word in joined) if weight_words else len(joined))
|
||||
match_count = 0
|
||||
in_order = True
|
||||
for word in first:
|
||||
if match_similar and (word not in second):
|
||||
similar = difflib.get_close_matches(word, second, 1, 0.8)
|
||||
if similar:
|
||||
word = similar[0]
|
||||
if word in second:
|
||||
if second[0] != word:
|
||||
in_order = False
|
||||
second.remove(word)
|
||||
match_count += (len(word) if weight_words else 1)
|
||||
result = round(((match_count * 2) / total_count) * 100)
|
||||
if (result == 100) and (not in_order):
|
||||
result = 99 # We cannot consider a match exact unless the ordering is the same
|
||||
return result
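
As a quick illustration of the scoring above (a sketch, assuming py.engine is importable with hsutil installed), two names sharing two of their three words score 67; with WEIGHT_WORDS the matched words are weighted by their length instead of counted once each.

from py.engine import getwords, compare, WEIGHT_WORDS

a = getwords('My Holiday 2006')   # ['my', 'holiday', '2006']
b = getwords('my holiday 2007')
print compare(a, b)                   # 67.0: 2 matched words out of 6 words total
print compare(a, b, (WEIGHT_WORDS,))  # 69.0: matched words weighted by their length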
|
||||
|
||||
def compare_fields(first, second, flags=()):
|
||||
"""Returns the score for the lowest matching fields.
|
||||
|
||||
first and second must be lists of lists of string.
|
||||
"""
|
||||
if len(first) != len(second):
|
||||
return 0
|
||||
if NO_FIELD_ORDER in flags:
|
||||
results = []
|
||||
#We don't want to remove field directly in the list. We must work on a copy.
|
||||
second = second[:]
|
||||
for field1 in first:
|
||||
max = 0
|
||||
matched_field = None
|
||||
for field2 in second:
|
||||
r = compare(field1, field2, flags)
|
||||
if r > max:
|
||||
max = r
|
||||
matched_field = field2
|
||||
results.append(max)
|
||||
if matched_field:
|
||||
second.remove(matched_field)
|
||||
else:
|
||||
results = [compare(word1, word2, flags) for word1, word2 in zip(first, second)]
|
||||
return min(results) if results else 0
|
||||
|
||||
def build_word_dict(objects, j=job.nulljob):
|
||||
"""Returns a dict of objects mapped by their words.
|
||||
|
||||
objects must have a 'words' attribute being a list of strings or a list of lists of strings.
|
||||
|
||||
The result will be a dict with words as keys, lists of objects as values.
|
||||
"""
|
||||
result = defaultdict(set)
|
||||
for object in j.iter_with_progress(objects, 'Prepared %d/%d files', JOB_REFRESH_RATE):
|
||||
for word in unpack_fields(object.words):
|
||||
result[word].add(object)
|
||||
return result
|
||||
|
||||
def merge_similar_words(word_dict):
|
||||
"""Take all keys in word_dict that are similar, and merge them together.
|
||||
"""
|
||||
keys = word_dict.keys()
|
||||
keys.sort(key=len)# we want the shortest word to stay
|
||||
while keys:
|
||||
key = keys.pop(0)
|
||||
similars = difflib.get_close_matches(key, keys, 100, 0.8)
|
||||
if not similars:
|
||||
continue
|
||||
objects = word_dict[key]
|
||||
for similar in similars:
|
||||
objects |= word_dict[similar]
|
||||
del word_dict[similar]
|
||||
keys.remove(similar)
|
||||
|
||||
def reduce_common_words(word_dict, threshold):
|
||||
"""Remove all objects from word_dict values where the object count >= threshold
|
||||
|
||||
The exception to this removal are the objects where all the words of the object are common.
|
||||
Because if we remove them, we will miss some duplicates!
|
||||
"""
|
||||
uncommon_words = set(word for word, objects in word_dict.items() if len(objects) < threshold)
|
||||
for word, objects in word_dict.items():
|
||||
if len(objects) < threshold:
|
||||
continue
|
||||
reduced = set()
|
||||
for o in objects:
|
||||
if not any(w in uncommon_words for w in unpack_fields(o.words)):
|
||||
reduced.add(o)
|
||||
if reduced:
|
||||
word_dict[word] = reduced
|
||||
else:
|
||||
del word_dict[word]
|
||||
|
||||
Match = namedtuple('Match', 'first second percentage')
|
||||
def get_match(first, second, flags=()):
|
||||
#it is assumed here that first and second both have a "words" attribute
|
||||
percentage = compare(first.words, second.words, flags)
|
||||
return Match(first, second, percentage)
|
||||
|
||||
class MatchFactory(object):
|
||||
common_word_threshold = 50
|
||||
match_similar_words = False
|
||||
min_match_percentage = 0
|
||||
weight_words = False
|
||||
no_field_order = False
|
||||
limit = 5000000
|
||||
|
||||
def getmatches(self, objects, j=job.nulljob):
|
||||
j = j.start_subjob(2)
|
||||
sj = j.start_subjob(2)
|
||||
for o in objects:
|
||||
if not hasattr(o, 'words'):
|
||||
o.words = getwords(o.name)
|
||||
word_dict = build_word_dict(objects, sj)
|
||||
reduce_common_words(word_dict, self.common_word_threshold)
|
||||
if self.match_similar_words:
|
||||
merge_similar_words(word_dict)
|
||||
match_flags = []
|
||||
if self.weight_words:
|
||||
match_flags.append(WEIGHT_WORDS)
|
||||
if self.match_similar_words:
|
||||
match_flags.append(MATCH_SIMILAR_WORDS)
|
||||
if self.no_field_order:
|
||||
match_flags.append(NO_FIELD_ORDER)
|
||||
j.start_job(len(word_dict), '0 matches found')
|
||||
compared = defaultdict(set)
|
||||
result = []
|
||||
try:
|
||||
# This whole 'popping' thing is there to avoid taking too much memory at the same time.
|
||||
while word_dict:
|
||||
items = word_dict.popitem()[1]
|
||||
while items:
|
||||
ref = items.pop()
|
||||
compared_already = compared[ref]
|
||||
to_compare = items - compared_already
|
||||
compared_already |= to_compare
|
||||
for other in to_compare:
|
||||
m = get_match(ref, other, match_flags)
|
||||
if m.percentage >= self.min_match_percentage:
|
||||
result.append(m)
|
||||
if len(result) >= self.limit:
|
||||
return result
|
||||
j.add_progress(desc='%d matches found' % len(result))
|
||||
except MemoryError:
|
||||
# This is the place where the memory usage is at its peak during the scan.
|
||||
# Just continue the process with an incomplete list of matches.
|
||||
del compared # This should give us enough room to call logging.
|
||||
logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
|
||||
return result
|
||||
return result
|
||||
|
||||
|
||||
class Group(object):
|
||||
#---Override
|
||||
def __init__(self):
|
||||
self._clear()
|
||||
|
||||
def __contains__(self, item):
|
||||
return item in self.unordered
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.ordered.__getitem__(key)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.ordered)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ordered)
|
||||
|
||||
#---Private
|
||||
def _clear(self):
|
||||
self._percentage = None
|
||||
self._matches_for_ref = None
|
||||
self.matches = set()
|
||||
self.candidates = defaultdict(set)
|
||||
self.ordered = []
|
||||
self.unordered = set()
|
||||
|
||||
def _get_matches_for_ref(self):
|
||||
if self._matches_for_ref is None:
|
||||
ref = self.ref
|
||||
self._matches_for_ref = [match for match in self.matches if ref in match]
|
||||
return self._matches_for_ref
|
||||
|
||||
#---Public
|
||||
def add_match(self, match):
|
||||
def add_candidate(item, match):
|
||||
matches = self.candidates[item]
|
||||
matches.add(match)
|
||||
if self.unordered <= matches:
|
||||
self.ordered.append(item)
|
||||
self.unordered.add(item)
|
||||
|
||||
if match in self.matches:
|
||||
return
|
||||
self.matches.add(match)
|
||||
first, second, _ = match
|
||||
if first not in self.unordered:
|
||||
add_candidate(first, second)
|
||||
if second not in self.unordered:
|
||||
add_candidate(second, first)
|
||||
self._percentage = None
|
||||
self._matches_for_ref = None
|
||||
|
||||
def clean_matches(self):
|
||||
self.matches = set(m for m in self.matches if (m.first in self.unordered) and (m.second in self.unordered))
|
||||
self.candidates = defaultdict(set)
|
||||
|
||||
def get_match_of(self, item):
|
||||
if item is self.ref:
|
||||
return
|
||||
for m in self._get_matches_for_ref():
|
||||
if item in m:
|
||||
return m
|
||||
|
||||
def prioritize(self, key_func, tie_breaker=None):
|
||||
# tie_breaker(ref, dupe) --> True if dupe should be ref
|
||||
self.ordered.sort(key=key_func)
|
||||
if tie_breaker is None:
|
||||
return
|
||||
ref = self.ref
|
||||
key_value = key_func(ref)
|
||||
for dupe in self.dupes:
|
||||
if key_func(dupe) != key_value:
|
||||
break
|
||||
if tie_breaker(ref, dupe):
|
||||
ref = dupe
|
||||
if ref is not self.ref:
|
||||
self.switch_ref(ref)
|
||||
|
||||
def remove_dupe(self, item, clean_matches=True):
|
||||
try:
|
||||
self.ordered.remove(item)
|
||||
self.unordered.remove(item)
|
||||
self._percentage = None
|
||||
self._matches_for_ref = None
|
||||
if (len(self) > 1) and any(not getattr(item, 'is_ref', False) for item in self):
|
||||
if clean_matches:
|
||||
self.matches = set(m for m in self.matches if item not in m)
|
||||
else:
|
||||
self._clear()
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
def switch_ref(self, with_dupe):
|
||||
try:
|
||||
self.ordered.remove(with_dupe)
|
||||
self.ordered.insert(0, with_dupe)
|
||||
self._percentage = None
|
||||
self._matches_for_ref = None
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
dupes = property(lambda self: self[1:])
|
||||
|
||||
@property
|
||||
def percentage(self):
|
||||
if self._percentage is None:
|
||||
if self.dupes:
|
||||
matches = self._get_matches_for_ref()
|
||||
self._percentage = sum(match.percentage for match in matches) // len(matches)
|
||||
else:
|
||||
self._percentage = 0
|
||||
return self._percentage
|
||||
|
||||
@property
|
||||
def ref(self):
|
||||
if self:
|
||||
return self[0]
|
||||
|
||||
|
||||
def get_groups(matches, j=job.nulljob):
|
||||
matches.sort(key=lambda match: -match.percentage)
|
||||
dupe2group = {}
|
||||
groups = []
|
||||
for match in j.iter_with_progress(matches, 'Grouped %d/%d matches', JOB_REFRESH_RATE):
|
||||
first, second, _ = match
|
||||
first_group = dupe2group.get(first)
|
||||
second_group = dupe2group.get(second)
|
||||
if first_group:
|
||||
if second_group:
|
||||
if first_group is second_group:
|
||||
target_group = first_group
|
||||
else:
|
||||
continue
|
||||
else:
|
||||
target_group = first_group
|
||||
dupe2group[second] = target_group
|
||||
else:
|
||||
if second_group:
|
||||
target_group = second_group
|
||||
dupe2group[first] = target_group
|
||||
else:
|
||||
target_group = Group()
|
||||
groups.append(target_group)
|
||||
dupe2group[first] = target_group
|
||||
dupe2group[second] = target_group
|
||||
target_group.add_match(match)
|
||||
for group in groups:
|
||||
group.clean_matches()
|
||||
return groups
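
Tying the module together, a minimal end-to-end sketch (assuming py.engine is importable; NamedThing is a hypothetical stand-in for dupeGuru's file objects, which only need a name attribute here):

from py.engine import MatchFactory, get_groups

class NamedThing(object):
    def __init__(self, name):
        self.name = name

files = [NamedThing('foo bar'), NamedThing('bar bleh'), NamedThing('foo bar 1')]
factory = MatchFactory()
factory.min_match_percentage = 50      # drop weak matches before grouping

matches = factory.getmatches(files)    # words are computed with getwords() as needed
groups = get_groups(matches)
for group in groups:
    print group.ref.name, [d.name for d in group.dupes]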
822
py/engine_test.py
Normal file
@@ -0,0 +1,822 @@
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: dupeguru.engine_test
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/01/29
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: $
|
||||
$Revision: $
|
||||
Copyright 2004-2008 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import unittest
|
||||
import sys
|
||||
|
||||
from hsutil import job
|
||||
from hsutil.decorators import log_calls
|
||||
from hsutil.testcase import TestCase
|
||||
|
||||
from . import engine
|
||||
from .engine import *
|
||||
|
||||
class NamedObject(object):
|
||||
def __init__(self, name="foobar", with_words=False):
|
||||
self.name = name
|
||||
if with_words:
|
||||
self.words = getwords(name)
|
||||
|
||||
|
||||
def get_match_triangle():
|
||||
o1 = NamedObject(with_words=True)
|
||||
o2 = NamedObject(with_words=True)
|
||||
o3 = NamedObject(with_words=True)
|
||||
m1 = get_match(o1,o2)
|
||||
m2 = get_match(o1,o3)
|
||||
m3 = get_match(o2,o3)
|
||||
return [m1, m2, m3]
|
||||
|
||||
def get_test_group():
|
||||
m1, m2, m3 = get_match_triangle()
|
||||
result = Group()
|
||||
result.add_match(m1)
|
||||
result.add_match(m2)
|
||||
result.add_match(m3)
|
||||
return result
|
||||
|
||||
class TCgetwords(TestCase):
|
||||
def test_spaces(self):
|
||||
self.assertEqual(['a', 'b', 'c', 'd'], getwords("a b c d"))
|
||||
self.assertEqual(['a', 'b', 'c', 'd'], getwords(" a b c d "))
|
||||
|
||||
def test_splitter_chars(self):
|
||||
self.assertEqual(
|
||||
[chr(i) for i in xrange(ord('a'),ord('z')+1)],
|
||||
getwords("a-b_c&d+e(f)g;h\\i[j]k{l}m:n.o,p<q>r/s?t~u!v@w#x$y*z")
|
||||
)
|
||||
|
||||
def test_joiner_chars(self):
|
||||
self.assertEqual(["aec"], getwords(u"a'e\u0301c"))
|
||||
|
||||
def test_empty(self):
|
||||
self.assertEqual([], getwords(''))
|
||||
|
||||
def test_returns_lowercase(self):
|
||||
self.assertEqual(['foo', 'bar'], getwords('FOO BAR'))
|
||||
|
||||
def test_decompose_unicode(self):
|
||||
self.assertEqual(getwords(u'foo\xe9bar'), ['fooebar'])
|
||||
|
||||
|
||||
class TCgetfields(TestCase):
|
||||
def test_simple(self):
|
||||
self.assertEqual([['a', 'b'], ['c', 'd', 'e']], getfields('a b - c d e'))
|
||||
|
||||
def test_empty(self):
|
||||
self.assertEqual([], getfields(''))
|
||||
|
||||
    def test_cleans_empty_fields(self):
        expected = [['a', 'bc', 'def']]
        actual = getfields(' - a bc def')
        self.assertEqual(expected, actual)
        expected = [['bc', 'def']]
        # second case reconstructed: an input with a trailing empty field
        actual = getfields('bc def - ')
        self.assertEqual(expected, actual)

class TCunpack_fields(TestCase):
|
||||
def test_with_fields(self):
|
||||
expected = ['a', 'b', 'c', 'd', 'e', 'f']
|
||||
actual = unpack_fields([['a'], ['b', 'c'], ['d', 'e', 'f']])
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
def test_without_fields(self):
|
||||
expected = ['a', 'b', 'c', 'd', 'e', 'f']
|
||||
actual = unpack_fields(['a', 'b', 'c', 'd', 'e', 'f'])
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
def test_empty(self):
|
||||
self.assertEqual([], unpack_fields([]))
|
||||
|
||||
|
||||
class TCWordCompare(TestCase):
|
||||
def test_list(self):
|
||||
self.assertEqual(100, compare(['a', 'b', 'c', 'd'],['a', 'b', 'c', 'd']))
|
||||
self.assertEqual(86, compare(['a', 'b', 'c', 'd'],['a', 'b', 'c']))
|
||||
|
||||
def test_unordered(self):
|
||||
#Sometimes, users don't want too much fuzzy matching. When they set the slider
|
||||
#to 100, they don't expect a filename with the same words, but not the same order, to match.
|
||||
#Thus, we want to return 99 in that case.
|
||||
self.assertEqual(99, compare(['a', 'b', 'c', 'd'], ['d', 'b', 'c', 'a']))
|
||||
|
||||
def test_word_occurs_twice(self):
|
||||
#if a word occurs twice in first, but once in second, we want the word to be only counted once
|
||||
self.assertEqual(89, compare(['a', 'b', 'c', 'd', 'a'], ['d', 'b', 'c', 'a']))
|
||||
|
||||
def test_uses_copy_of_lists(self):
|
||||
first = ['foo', 'bar']
|
||||
second = ['bar', 'bleh']
|
||||
compare(first, second)
|
||||
self.assertEqual(['foo', 'bar'], first)
|
||||
self.assertEqual(['bar', 'bleh'], second)
|
||||
|
||||
def test_word_weight(self):
|
||||
self.assertEqual(int((6.0 / 13.0) * 100), compare(['foo', 'bar'], ['bar', 'bleh'], (WEIGHT_WORDS, )))
|
||||
|
||||
def test_similar_words(self):
|
||||
self.assertEqual(100, compare(['the', 'white', 'stripes'],['the', 'whites', 'stripe'], (MATCH_SIMILAR_WORDS, )))
|
||||
|
||||
def test_empty(self):
|
||||
self.assertEqual(0, compare([], []))
|
||||
|
||||
def test_with_fields(self):
|
||||
self.assertEqual(67, compare([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]))
|
||||
|
||||
def test_propagate_flags_with_fields(self):
|
||||
def mock_compare(first, second, flags):
|
||||
self.assertEqual((0, 1, 2, 3, 5), flags)
|
||||
|
||||
self.mock(engine, 'compare_fields', mock_compare)
|
||||
compare([['a']], [['a']], (0, 1, 2, 3, 5))
|
||||
|
||||
|
||||
class TCWordCompareWithFields(TestCase):
|
||||
def test_simple(self):
|
||||
self.assertEqual(67, compare_fields([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]))
|
||||
|
||||
def test_empty(self):
|
||||
self.assertEqual(0, compare_fields([], []))
|
||||
|
||||
def test_different_length(self):
|
||||
self.assertEqual(0, compare_fields([['a'], ['b']], [['a'], ['b'], ['c']]))
|
||||
|
||||
def test_propagates_flags(self):
|
||||
def mock_compare(first, second, flags):
|
||||
self.assertEqual((0, 1, 2, 3, 5), flags)
|
||||
|
||||
self.mock(engine, 'compare', mock_compare)
|
||||
compare_fields([['a']], [['a']],(0, 1, 2, 3, 5))
|
||||
|
||||
def test_order(self):
|
||||
first = [['a', 'b'], ['c', 'd', 'e']]
|
||||
second = [['c', 'd', 'f'], ['a', 'b']]
|
||||
self.assertEqual(0, compare_fields(first, second))
|
||||
|
||||
def test_no_order(self):
|
||||
first = [['a','b'],['c','d','e']]
|
||||
second = [['c','d','f'],['a','b']]
|
||||
self.assertEqual(67, compare_fields(first, second, (NO_FIELD_ORDER, )))
|
||||
first = [['a','b'],['a','b']] #a field can only be matched once.
|
||||
second = [['c','d','f'],['a','b']]
|
||||
self.assertEqual(0, compare_fields(first, second, (NO_FIELD_ORDER, )))
|
||||
first = [['a','b'],['a','b','c']]
|
||||
second = [['c','d','f'],['a','b']]
|
||||
self.assertEqual(33, compare_fields(first, second, (NO_FIELD_ORDER, )))
|
||||
|
||||
def test_compare_fields_without_order_doesnt_alter_fields(self):
|
||||
#The NO_ORDER comp type altered the fields!
|
||||
first = [['a','b'],['c','d','e']]
|
||||
second = [['c','d','f'],['a','b']]
|
||||
self.assertEqual(67, compare_fields(first, second, (NO_FIELD_ORDER, )))
|
||||
self.assertEqual([['a','b'],['c','d','e']],first)
|
||||
self.assertEqual([['c','d','f'],['a','b']],second)
|
||||
|
||||
|
||||
class TCbuild_word_dict(TestCase):
|
||||
def test_with_standard_words(self):
|
||||
l = [NamedObject('foo bar',True)]
|
||||
l.append(NamedObject('bar baz',True))
|
||||
l.append(NamedObject('baz bleh foo',True))
|
||||
d = build_word_dict(l)
|
||||
self.assertEqual(4,len(d))
|
||||
self.assertEqual(2,len(d['foo']))
|
||||
self.assert_(l[0] in d['foo'])
|
||||
self.assert_(l[2] in d['foo'])
|
||||
self.assertEqual(2,len(d['bar']))
|
||||
self.assert_(l[0] in d['bar'])
|
||||
self.assert_(l[1] in d['bar'])
|
||||
self.assertEqual(2,len(d['baz']))
|
||||
self.assert_(l[1] in d['baz'])
|
||||
self.assert_(l[2] in d['baz'])
|
||||
self.assertEqual(1,len(d['bleh']))
|
||||
self.assert_(l[2] in d['bleh'])
|
||||
|
||||
def test_unpack_fields(self):
|
||||
o = NamedObject('')
|
||||
o.words = [['foo','bar'],['baz']]
|
||||
d = build_word_dict([o])
|
||||
self.assertEqual(3,len(d))
|
||||
self.assertEqual(1,len(d['foo']))
|
||||
|
||||
def test_words_are_unaltered(self):
|
||||
o = NamedObject('')
|
||||
o.words = [['foo','bar'],['baz']]
|
||||
d = build_word_dict([o])
|
||||
self.assertEqual([['foo','bar'],['baz']],o.words)
|
||||
|
||||
def test_object_instances_can_only_be_once_in_words_object_list(self):
|
||||
o = NamedObject('foo foo',True)
|
||||
d = build_word_dict([o])
|
||||
self.assertEqual(1,len(d['foo']))
|
||||
|
||||
def test_job(self):
|
||||
def do_progress(p,d=''):
|
||||
self.log.append(p)
|
||||
return True
|
||||
|
||||
j = job.Job(1,do_progress)
|
||||
self.log = []
|
||||
s = "foo bar"
|
||||
build_word_dict([NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j)
|
||||
self.assertEqual(0,self.log[0])
|
||||
self.assertEqual(33,self.log[1])
|
||||
self.assertEqual(66,self.log[2])
|
||||
self.assertEqual(100,self.log[3])
|
||||
|
||||
|
||||
class TCmerge_similar_words(TestCase):
|
||||
def test_some_similar_words(self):
|
||||
d = {
|
||||
'foobar':set([1]),
|
||||
'foobar1':set([2]),
|
||||
'foobar2':set([3]),
|
||||
}
|
||||
merge_similar_words(d)
|
||||
self.assertEqual(1,len(d))
|
||||
self.assertEqual(3,len(d['foobar']))
|
||||
|
||||
|
||||
|
||||
class TCreduce_common_words(TestCase):
|
||||
def test_typical(self):
|
||||
d = {
|
||||
'foo': set([NamedObject('foo bar',True) for i in range(50)]),
|
||||
'bar': set([NamedObject('foo bar',True) for i in range(49)])
|
||||
}
|
||||
reduce_common_words(d, 50)
|
||||
self.assert_('foo' not in d)
|
||||
self.assertEqual(49,len(d['bar']))
|
||||
|
||||
def test_dont_remove_objects_with_only_common_words(self):
|
||||
d = {
|
||||
'common': set([NamedObject("common uncommon",True) for i in range(50)] + [NamedObject("common",True)]),
|
||||
'uncommon': set([NamedObject("common uncommon",True)])
|
||||
}
|
||||
reduce_common_words(d, 50)
|
||||
self.assertEqual(1,len(d['common']))
|
||||
self.assertEqual(1,len(d['uncommon']))
|
||||
|
||||
def test_values_still_are_set_instances(self):
|
||||
d = {
|
||||
'common': set([NamedObject("common uncommon",True) for i in range(50)] + [NamedObject("common",True)]),
|
||||
'uncommon': set([NamedObject("common uncommon",True)])
|
||||
}
|
||||
reduce_common_words(d, 50)
|
||||
self.assert_(isinstance(d['common'],set))
|
||||
self.assert_(isinstance(d['uncommon'],set))
|
||||
|
||||
def test_dont_raise_KeyError_when_a_word_has_been_removed(self):
|
||||
#If a word has been removed by the reduce, an object in a subsequent common word that
|
||||
#contains the word that has been removed would cause a KeyError.
|
||||
d = {
|
||||
'foo': set([NamedObject('foo bar baz',True) for i in range(50)]),
|
||||
'bar': set([NamedObject('foo bar baz',True) for i in range(50)]),
|
||||
'baz': set([NamedObject('foo bar baz',True) for i in range(49)])
|
||||
}
|
||||
try:
|
||||
reduce_common_words(d, 50)
|
||||
except KeyError:
|
||||
self.fail()
|
||||
|
||||
def test_unpack_fields(self):
|
||||
#object.words may be fields.
|
||||
def create_it():
|
||||
o = NamedObject('')
|
||||
o.words = [['foo','bar'],['baz']]
|
||||
return o
|
||||
|
||||
d = {
|
||||
'foo': set([create_it() for i in range(50)])
|
||||
}
|
||||
try:
|
||||
reduce_common_words(d, 50)
|
||||
except TypeError:
|
||||
self.fail("must support fields.")
|
||||
|
||||
def test_consider_a_reduced_common_word_common_even_after_reduction(self):
|
||||
#There was a bug in the code that caused a word that has already been reduced not to
|
||||
#be counted as a common word for subsequent words. For example, if 'foo' is processed
|
||||
#as a common word, keeping a "foo bar" file in it, and the 'bar' is processed, "foo bar"
|
||||
#would not stay in 'bar' because 'foo' is not a common word anymore.
|
||||
only_common = NamedObject('foo bar',True)
|
||||
d = {
|
||||
'foo': set([NamedObject('foo bar baz',True) for i in range(49)] + [only_common]),
|
||||
'bar': set([NamedObject('foo bar baz',True) for i in range(49)] + [only_common]),
|
||||
'baz': set([NamedObject('foo bar baz',True) for i in range(49)])
|
||||
}
|
||||
reduce_common_words(d, 50)
|
||||
self.assertEqual(1,len(d['foo']))
|
||||
self.assertEqual(1,len(d['bar']))
|
||||
self.assertEqual(49,len(d['baz']))
|
||||
|
||||
|
||||
class TCget_match(TestCase):
|
||||
def test_simple(self):
|
||||
o1 = NamedObject("foo bar",True)
|
||||
o2 = NamedObject("bar bleh",True)
|
||||
m = get_match(o1,o2)
|
||||
self.assertEqual(50,m.percentage)
|
||||
self.assertEqual(['foo','bar'],m.first.words)
|
||||
self.assertEqual(['bar','bleh'],m.second.words)
|
||||
self.assert_(m.first is o1)
|
||||
self.assert_(m.second is o2)
|
||||
|
||||
def test_in(self):
|
||||
o1 = NamedObject("foo",True)
|
||||
o2 = NamedObject("bar",True)
|
||||
m = get_match(o1,o2)
|
||||
self.assert_(o1 in m)
|
||||
self.assert_(o2 in m)
|
||||
self.assert_(object() not in m)
|
||||
|
||||
def test_word_weight(self):
|
||||
self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)
|
||||
|
||||
|
||||
class TCMatchFactory(TestCase):
|
||||
def test_empty(self):
|
||||
self.assertEqual([],MatchFactory().getmatches([]))
|
||||
|
||||
def test_defaults(self):
|
||||
mf = MatchFactory()
|
||||
self.assertEqual(50,mf.common_word_threshold)
|
||||
self.assertEqual(False,mf.weight_words)
|
||||
self.assertEqual(False,mf.match_similar_words)
|
||||
self.assertEqual(False,mf.no_field_order)
|
||||
self.assertEqual(0,mf.min_match_percentage)
|
||||
|
||||
def test_simple(self):
|
||||
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
|
||||
r = MatchFactory().getmatches(l)
|
||||
self.assertEqual(2,len(r))
|
||||
seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
|
||||
m = seek[0]
|
||||
self.assertEqual(['foo','bar'],m.first.words)
|
||||
self.assertEqual(['bar','bleh'],m.second.words)
|
||||
seek = [m for m in r if m.percentage == 33] #"foo bar" and "a b c foo"
|
||||
m = seek[0]
|
||||
self.assertEqual(['foo','bar'],m.first.words)
|
||||
self.assertEqual(['a','b','c','foo'],m.second.words)
|
||||
|
||||
def test_null_and_unrelated_objects(self):
|
||||
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
|
||||
r = MatchFactory().getmatches(l)
|
||||
self.assertEqual(1,len(r))
|
||||
m = r[0]
|
||||
self.assertEqual(50,m.percentage)
|
||||
self.assertEqual(['foo','bar'],m.first.words)
|
||||
self.assertEqual(['bar','bleh'],m.second.words)
|
||||
|
||||
def test_twice_the_same_word(self):
|
||||
l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
|
||||
r = MatchFactory().getmatches(l)
|
||||
self.assertEqual(1,len(r))
|
||||
|
||||
def test_twice_the_same_word_when_preworded(self):
|
||||
l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
|
||||
r = MatchFactory().getmatches(l)
|
||||
self.assertEqual(1,len(r))
|
||||
|
||||
def test_two_words_match(self):
|
||||
l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
|
||||
r = MatchFactory().getmatches(l)
|
||||
self.assertEqual(1,len(r))
|
||||
|
||||
def test_match_files_with_only_common_words(self):
|
||||
#If a word occurs more than 50 times, it is excluded from the matching process
|
||||
#The problem with the common_word_threshold is that the files containing only common
|
||||
#words will never be matched together. We *should* match them.
|
||||
mf = MatchFactory()
|
||||
mf.common_word_threshold = 50
|
||||
l = [NamedObject("foo") for i in range(50)]
|
||||
r = mf.getmatches(l)
|
||||
self.assertEqual(1225,len(r))
|
||||
|
||||
def test_use_words_already_there_if_there(self):
|
||||
o1 = NamedObject('foo')
|
||||
o2 = NamedObject('bar')
|
||||
o2.words = ['foo']
|
||||
self.assertEqual(1,len(MatchFactory().getmatches([o1,o2])))
|
||||
|
||||
def test_job(self):
|
||||
def do_progress(p,d=''):
|
||||
self.log.append(p)
|
||||
return True
|
||||
|
||||
j = job.Job(1,do_progress)
|
||||
self.log = []
|
||||
s = "foo bar"
|
||||
MatchFactory().getmatches([NamedObject(s),NamedObject(s),NamedObject(s)],j)
|
||||
self.assert_(len(self.log) > 2)
|
||||
self.assertEqual(0,self.log[0])
|
||||
self.assertEqual(100,self.log[-1])
|
||||
|
||||
def test_weight_words(self):
|
||||
mf = MatchFactory()
|
||||
mf.weight_words = True
|
||||
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
|
||||
m = mf.getmatches(l)[0]
|
||||
self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)
|
||||
|
||||
def test_similar_word(self):
|
||||
mf = MatchFactory()
|
||||
mf.match_similar_words = True
|
||||
l = [NamedObject("foobar"),NamedObject("foobars")]
|
||||
self.assertEqual(1,len(mf.getmatches(l)))
|
||||
self.assertEqual(100,mf.getmatches(l)[0].percentage)
|
||||
l = [NamedObject("foobar"),NamedObject("foo")]
|
||||
self.assertEqual(0,len(mf.getmatches(l))) #too far
|
||||
l = [NamedObject("bizkit"),NamedObject("bizket")]
|
||||
self.assertEqual(1,len(mf.getmatches(l)))
|
||||
l = [NamedObject("foobar"),NamedObject("foosbar")]
|
||||
self.assertEqual(1,len(mf.getmatches(l)))
|
||||
|
||||
def test_single_object_with_similar_words(self):
|
||||
mf = MatchFactory()
|
||||
mf.match_similar_words = True
|
||||
l = [NamedObject("foo foos")]
|
||||
self.assertEqual(0,len(mf.getmatches(l)))
|
||||
|
||||
def test_double_words_get_counted_only_once(self):
|
||||
mf = MatchFactory()
|
||||
l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
|
||||
m = mf.getmatches(l)[0]
|
||||
self.assertEqual(75,m.percentage)
|
||||
|
||||
def test_with_fields(self):
|
||||
mf = MatchFactory()
|
||||
o1 = NamedObject("foo bar - foo bleh")
|
||||
o2 = NamedObject("foo bar - bleh bar")
|
||||
o1.words = getfields(o1.name)
|
||||
o2.words = getfields(o2.name)
|
||||
m = mf.getmatches([o1, o2])[0]
|
||||
self.assertEqual(50, m.percentage)
|
||||
|
||||
def test_with_fields_no_order(self):
|
||||
mf = MatchFactory()
|
||||
mf.no_field_order = True
|
||||
o1 = NamedObject("foo bar - foo bleh")
|
||||
o2 = NamedObject("bleh bang - foo bar")
|
||||
o1.words = getfields(o1.name)
|
||||
o2.words = getfields(o2.name)
|
||||
m = mf.getmatches([o1, o2])[0]
|
||||
self.assertEqual(50 ,m.percentage)
|
||||
|
||||
def test_only_match_similar_when_the_option_is_set(self):
|
||||
mf = MatchFactory()
|
||||
mf.match_similar_words = False
|
||||
l = [NamedObject("foobar"),NamedObject("foobars")]
|
||||
self.assertEqual(0,len(mf.getmatches(l)))
|
||||
|
||||
def test_dont_recurse_do_match(self):
|
||||
# with nosetests, the stack is already deeper, so the limit has to be high enough not to fail falsely
|
||||
sys.setrecursionlimit(100)
|
||||
mf = MatchFactory()
|
||||
files = [NamedObject('foo bar') for i in range(101)]
|
||||
try:
|
||||
mf.getmatches(files)
|
||||
except RuntimeError:
|
||||
self.fail()
|
||||
finally:
|
||||
sys.setrecursionlimit(1000)
|
||||
|
||||
def test_min_match_percentage(self):
|
||||
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
|
||||
mf = MatchFactory()
|
||||
mf.min_match_percentage = 50
|
||||
r = mf.getmatches(l)
|
||||
self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match
|
||||
|
||||
def test_limit(self):
|
||||
l = [NamedObject(),NamedObject(),NamedObject()]
|
||||
mf = MatchFactory()
|
||||
mf.limit = 2
|
||||
r = mf.getmatches(l)
|
||||
self.assertEqual(2,len(r))
|
||||
|
||||
def test_MemoryError(self):
|
||||
@log_calls
|
||||
def mocked_match(first, second, flags):
|
||||
if len(mocked_match.calls) > 42:
|
||||
raise MemoryError()
|
||||
return Match(first, second, 0)
|
||||
|
||||
objects = [NamedObject() for i in range(10)] # results in 45 matches
|
||||
self.mock(engine, 'get_match', mocked_match)
|
||||
mf = MatchFactory()
|
||||
try:
|
||||
r = mf.getmatches(objects)
|
||||
except MemoryError:
|
||||
self.fail('MemoryError must be handled')
|
||||
self.assertEqual(42, len(r))
|
||||
|
||||
|
||||
class TCGroup(TestCase):
|
||||
def test_empty(self):
|
||||
g = Group()
|
||||
self.assertEqual(None,g.ref)
|
||||
self.assertEqual([],g.dupes)
|
||||
self.assertEqual(0,len(g.matches))
|
||||
|
||||
def test_add_match(self):
|
||||
g = Group()
|
||||
m = get_match(NamedObject("foo",True),NamedObject("bar",True))
|
||||
g.add_match(m)
|
||||
self.assert_(g.ref is m.first)
|
||||
self.assertEqual([m.second],g.dupes)
|
||||
self.assertEqual(1,len(g.matches))
|
||||
self.assert_(m in g.matches)
|
||||
|
||||
def test_multiple_add_match(self):
|
||||
g = Group()
|
||||
o1 = NamedObject("a",True)
|
||||
o2 = NamedObject("b",True)
|
||||
o3 = NamedObject("c",True)
|
||||
o4 = NamedObject("d",True)
|
||||
g.add_match(get_match(o1,o2))
|
||||
self.assert_(g.ref is o1)
|
||||
self.assertEqual([o2],g.dupes)
|
||||
self.assertEqual(1,len(g.matches))
|
||||
g.add_match(get_match(o1,o3))
|
||||
self.assertEqual([o2],g.dupes)
|
||||
self.assertEqual(2,len(g.matches))
|
||||
g.add_match(get_match(o2,o3))
|
||||
self.assertEqual([o2,o3],g.dupes)
|
||||
self.assertEqual(3,len(g.matches))
|
||||
g.add_match(get_match(o1,o4))
|
||||
self.assertEqual([o2,o3],g.dupes)
|
||||
self.assertEqual(4,len(g.matches))
|
||||
g.add_match(get_match(o2,o4))
|
||||
self.assertEqual([o2,o3],g.dupes)
|
||||
self.assertEqual(5,len(g.matches))
|
||||
g.add_match(get_match(o3,o4))
|
||||
self.assertEqual([o2,o3,o4],g.dupes)
|
||||
self.assertEqual(6,len(g.matches))
|
||||
|
||||
def test_len(self):
|
||||
g = Group()
|
||||
self.assertEqual(0,len(g))
|
||||
g.add_match(get_match(NamedObject("foo",True),NamedObject("bar",True)))
|
||||
self.assertEqual(2,len(g))
|
||||
|
||||
def test_add_same_match_twice(self):
|
||||
g = Group()
|
||||
m = get_match(NamedObject("foo",True),NamedObject("foo",True))
|
||||
g.add_match(m)
|
||||
self.assertEqual(2,len(g))
|
||||
self.assertEqual(1,len(g.matches))
|
||||
g.add_match(m)
|
||||
self.assertEqual(2,len(g))
|
||||
self.assertEqual(1,len(g.matches))
|
||||
|
||||
def test_in(self):
|
||||
g = Group()
|
||||
o1 = NamedObject("foo",True)
|
||||
o2 = NamedObject("bar",True)
|
||||
self.assert_(o1 not in g)
|
||||
g.add_match(get_match(o1,o2))
|
||||
self.assert_(o1 in g)
|
||||
self.assert_(o2 in g)
|
||||
|
||||
def test_remove(self):
|
||||
g = Group()
|
||||
o1 = NamedObject("foo",True)
|
||||
o2 = NamedObject("bar",True)
|
||||
o3 = NamedObject("bleh",True)
|
||||
g.add_match(get_match(o1,o2))
|
||||
g.add_match(get_match(o1,o3))
|
||||
g.add_match(get_match(o2,o3))
|
||||
self.assertEqual(3,len(g.matches))
|
||||
self.assertEqual(3,len(g))
|
||||
g.remove_dupe(o3)
|
||||
self.assertEqual(1,len(g.matches))
|
||||
self.assertEqual(2,len(g))
|
||||
g.remove_dupe(o1)
|
||||
self.assertEqual(0,len(g.matches))
|
||||
self.assertEqual(0,len(g))
|
||||
|
||||
def test_remove_with_ref_dupes(self):
|
||||
g = Group()
|
||||
o1 = NamedObject("foo",True)
|
||||
o2 = NamedObject("bar",True)
|
||||
o3 = NamedObject("bleh",True)
|
||||
g.add_match(get_match(o1,o2))
|
||||
g.add_match(get_match(o1,o3))
|
||||
g.add_match(get_match(o2,o3))
|
||||
o1.is_ref = True
|
||||
o2.is_ref = True
|
||||
g.remove_dupe(o3)
|
||||
self.assertEqual(0,len(g))
|
||||
|
||||
def test_switch_ref(self):
|
||||
o1 = NamedObject(with_words=True)
|
||||
o2 = NamedObject(with_words=True)
|
||||
g = Group()
|
||||
g.add_match(get_match(o1,o2))
|
||||
self.assert_(o1 is g.ref)
|
||||
g.switch_ref(o2)
|
||||
self.assert_(o2 is g.ref)
|
||||
self.assertEqual([o1],g.dupes)
|
||||
g.switch_ref(o2)
|
||||
self.assert_(o2 is g.ref)
|
||||
g.switch_ref(NamedObject('',True))
|
||||
self.assert_(o2 is g.ref)
|
||||
|
||||
def test_get_match_of(self):
|
||||
g = Group()
|
||||
for m in get_match_triangle():
|
||||
g.add_match(m)
|
||||
o = g.dupes[0]
|
||||
m = g.get_match_of(o)
|
||||
self.assert_(g.ref in m)
|
||||
self.assert_(o in m)
|
||||
self.assert_(g.get_match_of(NamedObject('',True)) is None)
|
||||
self.assert_(g.get_match_of(g.ref) is None)
|
||||
|
||||
def test_percentage(self):
|
||||
#percentage should return the avg percentage in relation to the ref
|
||||
m1,m2,m3 = get_match_triangle()
|
||||
m1 = Match(m1[0], m1[1], 100)
|
||||
m2 = Match(m2[0], m2[1], 50)
|
||||
m3 = Match(m3[0], m3[1], 33)
|
||||
g = Group()
|
||||
g.add_match(m1)
|
||||
g.add_match(m2)
|
||||
g.add_match(m3)
|
||||
self.assertEqual(75,g.percentage)
|
||||
g.switch_ref(g.dupes[0])
|
||||
self.assertEqual(66,g.percentage)
|
||||
g.remove_dupe(g.dupes[0])
|
||||
self.assertEqual(33,g.percentage)
|
||||
g.add_match(m1)
|
||||
g.add_match(m2)
|
||||
self.assertEqual(66,g.percentage)
|
||||
|
||||
def test_percentage_on_empty_group(self):
|
||||
g = Group()
|
||||
self.assertEqual(0,g.percentage)
|
||||
|
||||
def test_prioritize(self):
|
||||
m1,m2,m3 = get_match_triangle()
|
||||
o1 = m1.first
|
||||
o2 = m1.second
|
||||
o3 = m2.second
|
||||
o1.name = 'c'
|
||||
o2.name = 'b'
|
||||
o3.name = 'a'
|
||||
g = Group()
|
||||
g.add_match(m1)
|
||||
g.add_match(m2)
|
||||
g.add_match(m3)
|
||||
self.assert_(o1 is g.ref)
|
||||
g.prioritize(lambda x:x.name)
|
||||
self.assert_(o3 is g.ref)
|
||||
|
||||
def test_prioritize_with_tie_breaker(self):
|
||||
# if the ref has the same key as one or more of the dupes, run the tie_breaker func among them
|
||||
g = get_test_group()
|
||||
o1, o2, o3 = g.ordered
|
||||
tie_breaker = lambda ref, dupe: dupe is o3
|
||||
g.prioritize(lambda x:0, tie_breaker)
|
||||
self.assertTrue(g.ref is o3)
|
||||
|
||||
def test_prioritize_with_tie_breaker_runs_on_all_dupes(self):
|
||||
# Even if a dupe is chosen to switch with ref with a tie breaker, we still run the tie breaker
|
||||
# with other dupes and the newly chosen ref
|
||||
g = get_test_group()
|
||||
o1, o2, o3 = g.ordered
|
||||
o1.foo = 1
|
||||
o2.foo = 2
|
||||
o3.foo = 3
|
||||
tie_breaker = lambda ref, dupe: dupe.foo > ref.foo
|
||||
g.prioritize(lambda x:0, tie_breaker)
|
||||
self.assertTrue(g.ref is o3)
|
||||
|
||||
def test_prioritize_with_tie_breaker_runs_only_on_tie_dupes(self):
|
||||
# The tie breaker only runs on dupes that had the same value for the key_func
|
||||
g = get_test_group()
|
||||
o1, o2, o3 = g.ordered
|
||||
o1.foo = 2
|
||||
o2.foo = 2
|
||||
o3.foo = 1
|
||||
o1.bar = 1
|
||||
o2.bar = 2
|
||||
o3.bar = 3
|
||||
key_func = lambda x: -x.foo
|
||||
tie_breaker = lambda ref, dupe: dupe.bar > ref.bar
|
||||
g.prioritize(key_func, tie_breaker)
|
||||
self.assertTrue(g.ref is o2)
|
||||
|
||||
def test_list_like(self):
|
||||
g = Group()
|
||||
o1,o2 = (NamedObject("foo",True),NamedObject("bar",True))
|
||||
g.add_match(get_match(o1,o2))
|
||||
self.assert_(g[0] is o1)
|
||||
self.assert_(g[1] is o2)
|
||||
|
||||
def test_clean_matches(self):
|
||||
g = Group()
|
||||
o1,o2,o3 = (NamedObject("foo",True),NamedObject("bar",True),NamedObject("baz",True))
|
||||
g.add_match(get_match(o1,o2))
|
||||
g.add_match(get_match(o1,o3))
|
||||
g.clean_matches()
|
||||
self.assertEqual(1,len(g.matches))
|
||||
self.assertEqual(0,len(g.candidates))
|
||||
|
||||
|
||||
class TCget_groups(TestCase):
|
||||
def test_empty(self):
|
||||
r = get_groups([])
|
||||
self.assertEqual([],r)
|
||||
|
||||
def test_simple(self):
|
||||
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
|
||||
matches = MatchFactory().getmatches(l)
|
||||
m = matches[0]
|
||||
r = get_groups(matches)
|
||||
self.assertEqual(1,len(r))
|
||||
g = r[0]
|
||||
self.assert_(g.ref is m.first)
|
||||
self.assertEqual([m.second],g.dupes)
|
||||
|
||||
def test_group_with_multiple_matches(self):
|
||||
#This results in 3 matches
|
||||
l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
|
||||
matches = MatchFactory().getmatches(l)
|
||||
r = get_groups(matches)
|
||||
self.assertEqual(1,len(r))
|
||||
g = r[0]
|
||||
self.assertEqual(3,len(g))
|
||||
|
||||
def test_must_choose_a_group(self):
|
||||
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
|
||||
#There will be 2 groups here: group "a b" and group "c d"
|
||||
#"b c" can go either of them, but not both.
|
||||
matches = MatchFactory().getmatches(l)
|
||||
r = get_groups(matches)
|
||||
self.assertEqual(2,len(r))
|
||||
self.assertEqual(5,len(r[0])+len(r[1]))
|
||||
|
||||
def test_should_all_go_in_the_same_group(self):
|
||||
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
|
||||
#All four "a b" objects match each other, so they must all end up in the same group.
|
||||
matches = MatchFactory().getmatches(l)
|
||||
r = get_groups(matches)
|
||||
self.assertEqual(1,len(r))
|
||||
|
||||
def test_give_priority_to_matches_with_higher_percentage(self):
|
||||
o1 = NamedObject(with_words=True)
|
||||
o2 = NamedObject(with_words=True)
|
||||
o3 = NamedObject(with_words=True)
|
||||
m1 = Match(o1, o2, 1)
|
||||
m2 = Match(o2, o3, 2)
|
||||
r = get_groups([m1,m2])
|
||||
self.assertEqual(1,len(r))
|
||||
g = r[0]
|
||||
self.assertEqual(2,len(g))
|
||||
self.assert_(o1 not in g)
|
||||
self.assert_(o2 in g)
|
||||
self.assert_(o3 in g)
|
||||
|
||||
def test_four_sized_group(self):
|
||||
l = [NamedObject("foobar") for i in xrange(4)]
|
||||
m = MatchFactory().getmatches(l)
|
||||
r = get_groups(m)
|
||||
self.assertEqual(1,len(r))
|
||||
self.assertEqual(4,len(r[0]))
|
||||
|
||||
def test_referenced_by_ref2(self):
|
||||
o1 = NamedObject(with_words=True)
|
||||
o2 = NamedObject(with_words=True)
|
||||
o3 = NamedObject(with_words=True)
|
||||
m1 = get_match(o1,o2)
|
||||
m2 = get_match(o3,o1)
|
||||
m3 = get_match(o3,o2)
|
||||
r = get_groups([m1,m2,m3])
|
||||
self.assertEqual(3,len(r[0]))
|
||||
|
||||
def test_job(self):
|
||||
def do_progress(p,d=''):
|
||||
self.log.append(p)
|
||||
return True
|
||||
|
||||
self.log = []
|
||||
j = job.Job(1,do_progress)
|
||||
m1,m2,m3 = get_match_triangle()
|
||||
#101%: To make sure it is processed first so the job test works correctly
|
||||
m4 = Match(NamedObject('a',True), NamedObject('a',True), 101)
|
||||
get_groups([m1,m2,m3,m4],j)
|
||||
self.assertEqual(0,self.log[0])
|
||||
self.assertEqual(100,self.log[-1])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
67
py/export.py
Normal file
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: dupeguru.export
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/09/16
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
from xml.dom import minidom
|
||||
import tempfile
|
||||
import os.path as op
|
||||
import os
|
||||
from StringIO import StringIO
|
||||
|
||||
from hsutil.files import FileOrPath
|
||||
|
||||
def output_column_xml(outfile, columns):
|
||||
"""Creates a xml file outfile with the supplied columns.
|
||||
|
||||
outfile can be a filename or a file object.
|
||||
columns is a list of 2 sized tuples (display,enabled)
|
||||
"""
|
||||
doc = minidom.Document()
|
||||
root = doc.appendChild(doc.createElement('columns'))
|
||||
for display,enabled in columns:
|
||||
col_node = root.appendChild(doc.createElement('column'))
|
||||
col_node.setAttribute('display', display)
|
||||
col_node.setAttribute('enabled', {True:'y',False:'n'}[enabled])
|
||||
with FileOrPath(outfile, 'wb') as fp:
|
||||
doc.writexml(fp, '\t','\t','\n', encoding='utf-8')
|
||||
|
||||
def merge_css_into_xhtml(xhtml, css):
|
||||
with FileOrPath(xhtml, 'r+') as xhtml:
|
||||
with FileOrPath(css) as css:
|
||||
try:
|
||||
doc = minidom.parse(xhtml)
|
||||
except Exception:
|
||||
return False
|
||||
head = doc.getElementsByTagName('head')[0]
|
||||
links = head.getElementsByTagName('link')
|
||||
for link in links:
|
||||
if link.getAttribute('rel') == 'stylesheet':
|
||||
head.removeChild(link)
|
||||
style = head.appendChild(doc.createElement('style'))
|
||||
style.setAttribute('type','text/css')
|
||||
style.appendChild(doc.createTextNode(css.read()))
|
||||
xhtml.truncate(0)
|
||||
doc.writexml(xhtml, '\t','\t','\n', encoding='utf-8')
|
||||
xhtml.seek(0)
|
||||
return True
|
||||
|
||||
def export_to_xhtml(xml, xslt, css, columns, cmd='xsltproc --path "%(folder)s" "%(xslt)s" "%(xml)s"'):
|
||||
folder = op.split(xml)[0]
|
||||
output_column_xml(op.join(folder,'columns.xml'),columns)
|
||||
html = StringIO()
|
||||
cmd = cmd % {'folder': folder, 'xslt': xslt, 'xml': xml}
|
||||
html.write(os.popen(cmd).read())
|
||||
html.seek(0)
|
||||
merge_css_into_xhtml(html,css)
|
||||
html.seek(0)
|
||||
html_path = op.join(folder,'export.htm')
|
||||
html_file = open(html_path,'w')
|
||||
html_file.write(html.read().encode('utf-8'))
|
||||
html_file.close()
|
||||
return html_path
|
||||
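A minimal sketch of how output_column_xml above can be exercised (it mirrors the tests in py/export_test.py that follow; the import path is assumed):

from StringIO import StringIO
from xml.dom import minidom
from export import output_column_xml  # import path assumed

f = StringIO()
output_column_xml(f, [('Filename', True), ('Size', False)])
f.seek(0)
col1, col2 = minidom.parse(f).getElementsByTagName('column')
assert col1.getAttribute('display') == 'Filename'
assert col2.getAttribute('enabled') == 'n'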
91
py/export_test.py
Normal file
@@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: dupeguru.tests.export
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/09/16
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import unittest
|
||||
from xml.dom import minidom
|
||||
from StringIO import StringIO
|
||||
|
||||
from hsutil.testcase import TestCase
|
||||
|
||||
from .export import *
|
||||
from . import export
|
||||
|
||||
class TCoutput_columns_xml(TestCase):
|
||||
def test_empty_columns(self):
|
||||
f = StringIO()
|
||||
output_column_xml(f,[])
|
||||
f.seek(0)
|
||||
doc = minidom.parse(f)
|
||||
root = doc.documentElement
|
||||
self.assertEqual('columns',root.nodeName)
|
||||
self.assertEqual(0,len(root.childNodes))
|
||||
|
||||
def test_some_columns(self):
|
||||
f = StringIO()
|
||||
output_column_xml(f,[('foo',True),('bar',False),('baz',True)])
|
||||
f.seek(0)
|
||||
doc = minidom.parse(f)
|
||||
columns = doc.getElementsByTagName('column')
|
||||
self.assertEqual(3,len(columns))
|
||||
c1,c2,c3 = columns
|
||||
self.assertEqual('foo',c1.getAttribute('display'))
|
||||
self.assertEqual('bar',c2.getAttribute('display'))
|
||||
self.assertEqual('baz',c3.getAttribute('display'))
|
||||
self.assertEqual('y',c1.getAttribute('enabled'))
|
||||
self.assertEqual('n',c2.getAttribute('enabled'))
|
||||
self.assertEqual('y',c3.getAttribute('enabled'))
|
||||
|
||||
|
||||
class TCmerge_css_into_xhtml(TestCase):
|
||||
def test_main(self):
|
||||
css = StringIO()
|
||||
css.write('foobar')
|
||||
css.seek(0)
|
||||
xhtml = StringIO()
|
||||
xhtml.write("""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
<title>dupeGuru - Duplicate file scanner</title>
|
||||
<link rel="SHORTCUT ICON" href="/favicon.ico" />
|
||||
<link rel="stylesheet" href="../hardcoded.css" type="text/css" />
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>""")
|
||||
xhtml.seek(0)
|
||||
self.assert_(merge_css_into_xhtml(xhtml,css))
|
||||
xhtml.seek(0)
|
||||
doc = minidom.parse(xhtml)
|
||||
head = doc.getElementsByTagName('head')[0]
|
||||
#A style node should have been added in head.
|
||||
styles = head.getElementsByTagName('style')
|
||||
self.assertEqual(1,len(styles))
|
||||
style = styles[0]
|
||||
self.assertEqual('text/css',style.getAttribute('type'))
|
||||
self.assertEqual('foobar',style.firstChild.nodeValue.strip())
|
||||
#all <link rel="stylesheet"> should be removed
|
||||
self.assertEqual(1,len(head.getElementsByTagName('link')))
|
||||
|
||||
def test_empty(self):
|
||||
self.assert_(not merge_css_into_xhtml(StringIO(),StringIO()))
|
||||
|
||||
def test_malformed(self):
|
||||
xhtml = StringIO()
|
||||
xhtml.write("""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">""")
|
||||
xhtml.seek(0)
|
||||
self.assert_(not merge_css_into_xhtml(xhtml,StringIO()))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
28
py/gen.py
Normal file
@@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env python
|
||||
# Unit Name: gen
|
||||
# Created By: Virgil Dupras
|
||||
# Created On: 2009-05-26
|
||||
# $Id$
|
||||
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
|
||||
|
||||
import os
|
||||
import os.path as op
|
||||
|
||||
def move(src, dst):
|
||||
if not op.exists(src):
|
||||
return
|
||||
if op.exists(dst):
|
||||
os.remove(dst)
|
||||
print 'Moving %s --> %s' % (src, dst)
|
||||
os.rename(src, dst)
|
||||
|
||||
|
||||
os.chdir(op.join('modules', 'block'))
|
||||
os.system('python setup.py build_ext --inplace')
|
||||
os.chdir(op.join('..', 'cache'))
|
||||
os.system('python setup.py build_ext --inplace')
|
||||
os.chdir(op.join('..', '..'))
|
||||
move(op.join('modules', 'block', '_block.so'), op.join('picture', '_block.so'))
|
||||
move(op.join('modules', 'block', '_block.pyd'), op.join('picture', '_block.pyd'))
|
||||
move(op.join('modules', 'cache', '_cache.so'), op.join('picture', '_cache.so'))
|
||||
move(op.join('modules', 'cache', '_cache.pyd'), op.join('picture', '_cache.pyd'))
|
||||
117
py/ignore.py
Normal file
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: ignore
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/05/02
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
from hsutil.files import FileOrPath
|
||||
|
||||
import xml.dom.minidom
|
||||
|
||||
class IgnoreList(object):
|
||||
"""An ignore list implementation that is iterable, filterable and exportable to XML.
|
||||
|
||||
Call Ignore to add an ignore list entry, and AreIgnore to check if 2 items are in the list.
|
||||
When iterated, 2 sized tuples will be returned, the tuples containing 2 items ignored together.
|
||||
"""
|
||||
#---Override
|
||||
def __init__(self):
|
||||
self._ignored = {}
|
||||
self._count = 0
|
||||
|
||||
def __iter__(self):
|
||||
for first,seconds in self._ignored.iteritems():
|
||||
for second in seconds:
|
||||
yield (first,second)
|
||||
|
||||
def __len__(self):
|
||||
return self._count
|
||||
|
||||
#---Public
|
||||
def AreIgnored(self,first,second):
|
||||
def do_check(first,second):
|
||||
try:
|
||||
matches = self._ignored[first]
|
||||
return second in matches
|
||||
except KeyError:
|
||||
return False
|
||||
|
||||
return do_check(first,second) or do_check(second,first)
|
||||
|
||||
def Clear(self):
|
||||
self._ignored = {}
|
||||
self._count = 0
|
||||
|
||||
def Filter(self,func):
|
||||
"""Applies a filter on all ignored items, and remove all matches where func(first,second)
|
||||
doesn't return True.
|
||||
"""
|
||||
filtered = IgnoreList()
|
||||
for first,second in self:
|
||||
if func(first,second):
|
||||
filtered.Ignore(first,second)
|
||||
self._ignored = filtered._ignored
|
||||
self._count = filtered._count
|
||||
|
||||
def Ignore(self,first,second):
|
||||
if self.AreIgnored(first,second):
|
||||
return
|
||||
try:
|
||||
matches = self._ignored[first]
|
||||
matches.add(second)
|
||||
except KeyError:
|
||||
try:
|
||||
matches = self._ignored[second]
|
||||
matches.add(first)
|
||||
except KeyError:
|
||||
matches = set()
|
||||
matches.add(second)
|
||||
self._ignored[first] = matches
|
||||
self._count += 1
|
||||
|
||||
def load_from_xml(self,infile):
|
||||
"""Loads the ignore list from a XML created with save_to_xml.
|
||||
|
||||
infile can be a file object or a filename.
|
||||
"""
|
||||
try:
|
||||
doc = xml.dom.minidom.parse(infile)
|
||||
except Exception:
|
||||
return
|
||||
file_nodes = doc.getElementsByTagName('file')
|
||||
for fn in file_nodes:
|
||||
if not fn.getAttributeNode('path'):
|
||||
continue
|
||||
file_path = fn.getAttributeNode('path').nodeValue
|
||||
subfile_nodes = fn.getElementsByTagName('file')
|
||||
for sfn in subfile_nodes:
|
||||
if not sfn.getAttributeNode('path'):
|
||||
continue
|
||||
subfile_path = sfn.getAttributeNode('path').nodeValue
|
||||
self.Ignore(file_path,subfile_path)
|
||||
|
||||
def save_to_xml(self,outfile):
|
||||
"""Create a XML file that can be used by load_from_xml.
|
||||
|
||||
outfile can be a file object or a filename.
|
||||
"""
|
||||
doc = xml.dom.minidom.Document()
|
||||
root = doc.appendChild(doc.createElement('ignore_list'))
|
||||
for file,subfiles in self._ignored.items():
|
||||
file_node = root.appendChild(doc.createElement('file'))
|
||||
if isinstance(file,unicode):
|
||||
file = file.encode('utf-8')
|
||||
file_node.setAttribute('path',file)
|
||||
for subfile in subfiles:
|
||||
subfile_node = file_node.appendChild(doc.createElement('file'))
|
||||
if isinstance(subfile,unicode):
|
||||
subfile = subfile.encode('utf-8')
|
||||
subfile_node.setAttribute('path',subfile)
|
||||
with FileOrPath(outfile, 'wb') as fp:
|
||||
doc.writexml(fp,'\t','\t','\n',encoding='utf-8')
|
||||
|
||||
|
||||
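A quick sketch of the IgnoreList API described in the docstring above (the import path is assumed; the tests that follow use a relative import):

from ignore import IgnoreList  # import path assumed

il = IgnoreList()
il.Ignore('foo', 'bar')
assert il.AreIgnored('bar', 'foo')   # the pair is symmetrical
assert len(il) == 1
assert list(il) == [('foo', 'bar')]
il.Filter(lambda f, s: f != 'foo')   # keep only pairs for which the function returns True
assert len(il) == 0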
158
py/ignore_test.py
Normal file
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: ignore
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/05/02
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import unittest
|
||||
import cStringIO
|
||||
import xml.dom.minidom
|
||||
|
||||
from .ignore import *
|
||||
|
||||
class TCIgnoreList(unittest.TestCase):
|
||||
def test_empty(self):
|
||||
il = IgnoreList()
|
||||
self.assertEqual(0,len(il))
|
||||
self.assert_(not il.AreIgnored('foo','bar'))
|
||||
|
||||
def test_simple(self):
|
||||
il = IgnoreList()
|
||||
il.Ignore('foo','bar')
|
||||
self.assert_(il.AreIgnored('foo','bar'))
|
||||
self.assert_(il.AreIgnored('bar','foo'))
|
||||
self.assert_(not il.AreIgnored('foo','bleh'))
|
||||
self.assert_(not il.AreIgnored('bleh','bar'))
|
||||
self.assertEqual(1,len(il))
|
||||
|
||||
def test_multiple(self):
|
||||
il = IgnoreList()
|
||||
il.Ignore('foo','bar')
|
||||
il.Ignore('foo','bleh')
|
||||
il.Ignore('bleh','bar')
|
||||
il.Ignore('aybabtu','bleh')
|
||||
self.assert_(il.AreIgnored('foo','bar'))
|
||||
self.assert_(il.AreIgnored('bar','foo'))
|
||||
self.assert_(il.AreIgnored('foo','bleh'))
|
||||
self.assert_(il.AreIgnored('bleh','bar'))
|
||||
self.assert_(not il.AreIgnored('aybabtu','bar'))
|
||||
self.assertEqual(4,len(il))
|
||||
|
||||
def test_clear(self):
|
||||
il = IgnoreList()
|
||||
il.Ignore('foo','bar')
|
||||
il.Clear()
|
||||
self.assert_(not il.AreIgnored('foo','bar'))
|
||||
self.assert_(not il.AreIgnored('bar','foo'))
|
||||
self.assertEqual(0,len(il))
|
||||
|
||||
def test_add_same_twice(self):
|
||||
il = IgnoreList()
|
||||
il.Ignore('foo','bar')
|
||||
il.Ignore('bar','foo')
|
||||
self.assertEqual(1,len(il))
|
||||
|
||||
def test_save_to_xml(self):
|
||||
il = IgnoreList()
|
||||
il.Ignore('foo','bar')
|
||||
il.Ignore('foo','bleh')
|
||||
il.Ignore('bleh','bar')
|
||||
f = cStringIO.StringIO()
|
||||
il.save_to_xml(f)
|
||||
f.seek(0)
|
||||
doc = xml.dom.minidom.parse(f)
|
||||
root = doc.documentElement
|
||||
self.assertEqual('ignore_list',root.nodeName)
|
||||
children = [c for c in root.childNodes if c.localName]
|
||||
self.assertEqual(2,len(children))
|
||||
self.assertEqual(2,len([c for c in children if c.nodeName == 'file']))
|
||||
f1,f2 = children
|
||||
subchildren = [c for c in f1.childNodes if c.localName == 'file'] +\
|
||||
[c for c in f2.childNodes if c.localName == 'file']
|
||||
self.assertEqual(3,len(subchildren))
|
||||
|
||||
def test_SaveThenLoad(self):
|
||||
il = IgnoreList()
|
||||
il.Ignore('foo','bar')
|
||||
il.Ignore('foo','bleh')
|
||||
il.Ignore('bleh','bar')
|
||||
il.Ignore(u'\u00e9','bar')
|
||||
f = cStringIO.StringIO()
|
||||
il.save_to_xml(f)
|
||||
f.seek(0)
|
||||
il = IgnoreList()
|
||||
il.load_from_xml(f)
|
||||
self.assertEqual(4,len(il))
|
||||
self.assert_(il.AreIgnored(u'\u00e9','bar'))
|
||||
|
||||
def test_LoadXML_with_empty_file_tags(self):
|
||||
f = cStringIO.StringIO()
|
||||
f.write('<?xml version="1.0" encoding="utf-8"?><ignore_list><file><file/></file></ignore_list>')
|
||||
f.seek(0)
|
||||
il = IgnoreList()
|
||||
il.load_from_xml(f)
|
||||
self.assertEqual(0,len(il))
|
||||
|
||||
def test_AreIgnore_works_when_a_child_is_a_key_somewhere_else(self):
|
||||
il = IgnoreList()
|
||||
il.Ignore('foo','bar')
|
||||
il.Ignore('bar','baz')
|
||||
self.assert_(il.AreIgnored('bar','foo'))
|
||||
|
||||
|
||||
def test_no_dupes_when_a_child_is_a_key_somewhere_else(self):
|
||||
il = IgnoreList()
|
||||
il.Ignore('foo','bar')
|
||||
il.Ignore('bar','baz')
|
||||
il.Ignore('bar','foo')
|
||||
self.assertEqual(2,len(il))
|
||||
|
||||
def test_iterate(self):
|
||||
#It must be possible to iterate through the ignore list
|
||||
il = IgnoreList()
|
||||
expected = [('foo','bar'),('bar','baz'),('foo','baz')]
|
||||
for i in expected:
|
||||
il.Ignore(i[0],i[1])
|
||||
for i in il:
|
||||
expected.remove(i) #No exception should be raised
|
||||
self.assert_(not expected) #expected should be empty
|
||||
|
||||
def test_filter(self):
|
||||
il = IgnoreList()
|
||||
il.Ignore('foo','bar')
|
||||
il.Ignore('bar','baz')
|
||||
il.Ignore('foo','baz')
|
||||
il.Filter(lambda f,s: f == 'bar')
|
||||
self.assertEqual(1,len(il))
|
||||
self.assert_(not il.AreIgnored('foo','bar'))
|
||||
self.assert_(il.AreIgnored('bar','baz'))
|
||||
|
||||
def test_save_with_non_ascii_non_unicode_items(self):
|
||||
il = IgnoreList()
|
||||
il.Ignore('\xac','\xbf')
|
||||
f = cStringIO.StringIO()
|
||||
try:
|
||||
il.save_to_xml(f)
|
||||
except Exception,e:
|
||||
self.fail(str(e))
|
||||
|
||||
def test_len(self):
|
||||
il = IgnoreList()
|
||||
self.assertEqual(0,len(il))
|
||||
il.Ignore('foo','bar')
|
||||
self.assertEqual(1,len(il))
|
||||
|
||||
def test_nonzero(self):
|
||||
il = IgnoreList()
|
||||
self.assert_(not il)
|
||||
il.Ignore('foo','bar')
|
||||
self.assert_(il)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
93
py/modules/block/block.pyx
Normal file
@@ -0,0 +1,93 @@
|
||||
# Created By: Virgil Dupras
|
||||
# Created On: 2009-04-23
|
||||
# $Id$
|
||||
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
|
||||
|
||||
cdef extern from "stdlib.h":
|
||||
int abs(int n) # required so that abs() is applied on ints, not python objects
|
||||
|
||||
class NoBlocksError(Exception):
|
||||
"""avgdiff/maxdiff has been called with empty lists"""
|
||||
|
||||
class DifferentBlockCountError(Exception):
|
||||
"""avgdiff/maxdiff has been called with 2 block lists of different size."""
|
||||
|
||||
|
||||
cdef object getblock(object image):
|
||||
"""Returns a 3 sized tuple containing the mean color of 'image'.
|
||||
|
||||
image: a PIL image or crop.
|
||||
"""
|
||||
cdef int pixel_count, red, green, blue, r, g, b
|
||||
if image.size[0]:
|
||||
pixel_count = image.size[0] * image.size[1]
|
||||
red = green = blue = 0
|
||||
for r, g, b in image.getdata():
|
||||
red += r
|
||||
green += g
|
||||
blue += b
|
||||
return (red // pixel_count, green // pixel_count, blue // pixel_count)
|
||||
else:
|
||||
return (0, 0, 0)
|
||||
|
||||
def getblocks2(image, int block_count_per_side):
|
||||
"""Returns a list of blocks (3 sized tuples).
|
||||
|
||||
image: A PIL image to base the blocks on.
|
||||
block_count_per_side: This integer determine the number of blocks the function will return.
|
||||
If it is 10, for example, 100 blocks will be returns (10 width, 10 height). The blocks will not
|
||||
necessarely cover square areas. The area covered by each block will be proportional to the image
|
||||
itself.
|
||||
"""
|
||||
if not image.size[0]:
|
||||
return []
|
||||
cdef int width, height, block_width, block_height, ih, iw, top, bottom, left, right
|
||||
width, height = image.size
|
||||
block_width = max(width // block_count_per_side, 1)
|
||||
block_height = max(height // block_count_per_side, 1)
|
||||
result = []
|
||||
for ih in range(block_count_per_side):
|
||||
top = min(ih * block_height, height - block_height)
|
||||
bottom = top + block_height
|
||||
for iw in range(block_count_per_side):
|
||||
left = min(iw * block_width, width - block_width)
|
||||
right = left + block_width
|
||||
box = (left, top, right, bottom)
|
||||
crop = image.crop(box)
|
||||
result.append(getblock(crop))
|
||||
return result
|
||||
|
||||
cdef int diff(first, second):
|
||||
"""Returns the difference between the first block and the second.
|
||||
|
||||
It returns an absolute sum of the 3 differences (RGB).
|
||||
"""
|
||||
cdef int r1, g1, b1, r2, g2, b2
|
||||
r1, g1, b1 = first
|
||||
r2, g2, b2 = second
|
||||
return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)
|
||||
|
||||
def avgdiff(first, second, int limit, int min_iterations):
|
||||
"""Returns the average diff between first blocks and seconds.
|
||||
|
||||
If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
|
||||
iterations have been made in the blocks.
|
||||
"""
|
||||
cdef int count, sum, i, iteration_count
|
||||
count = len(first)
|
||||
if count != len(second):
|
||||
raise DifferentBlockCountError()
|
||||
if not count:
|
||||
raise NoBlocksError()
|
||||
sum = 0
|
||||
for i in range(count):
|
||||
iteration_count = i + 1
|
||||
item1 = first[i]
|
||||
item2 = second[i]
|
||||
sum += diff(item1, item2)
|
||||
if sum > limit * iteration_count and iteration_count >= min_iterations:
|
||||
return limit + 1
|
||||
result = sum // count
|
||||
if (not result) and sum:
|
||||
result = 1
|
||||
return result
|
||||
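A small sketch of how getblocks2 and avgdiff combine into a similarity score, the way the picture matcher in py/picture/matchbase.py uses them (PIL and an in-place build of the extension are assumed; the image paths are placeholders):

from PIL import Image
from _block import getblocks2, avgdiff

THRESHOLD = 75                               # same default as MatchFactory.threshold
first = Image.open('a.jpg').convert('RGB')   # placeholder paths
second = Image.open('b.jpg').convert('RGB')
blocks1 = getblocks2(first, 15)              # 15 blocks per side, as in MatchFactory
blocks2 = getblocks2(second, 15)
limit = 100 - THRESHOLD                      # past this average difference, avgdiff gives up
diff = avgdiff(blocks1, blocks2, limit, 3)
percentage = 100 - diff                      # >= THRESHOLD means the pictures match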
14
py/modules/block/setup.py
Normal file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env python
|
||||
# Created By: Virgil Dupras
|
||||
# Created On: 2009-04-23
|
||||
# $Id$
|
||||
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
|
||||
|
||||
from distutils.core import setup
|
||||
from distutils.extension import Extension
|
||||
from Cython.Distutils import build_ext
|
||||
|
||||
setup(
|
||||
cmdclass = {'build_ext': build_ext},
|
||||
ext_modules = [Extension("_block", ["block.pyx"])]
|
||||
)
|
||||
34
py/modules/cache/cache.pyx
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env python
|
||||
# Created By: Virgil Dupras
|
||||
# Created On: 2009-04-23
|
||||
# $Id$
|
||||
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
|
||||
|
||||
# ok, this is hacky and stuff, but I don't know C well enough to play with char buffers, copy
|
||||
# them around and stuff
|
||||
cdef int xchar_to_int(char c):
|
||||
if 48 <= c <= 57: # 0-9
|
||||
return c - 48
|
||||
elif 65 <= c <= 70: # A-F
|
||||
return c - 55
|
||||
elif 97 <= c <= 102: # a-f
|
||||
return c - 87
|
||||
|
||||
def string_to_colors(s):
|
||||
"""Transform the string 's' in a list of 3 sized tuples.
|
||||
"""
|
||||
result = []
|
||||
cdef int i, char_count, r, g, b
|
||||
cdef char* cs
|
||||
char_count = len(s)
|
||||
char_count = (char_count // 6) * 6
|
||||
cs = s
|
||||
for i in range(0, char_count, 6):
|
||||
r = xchar_to_int(cs[i]) << 4
|
||||
r += xchar_to_int(cs[i+1])
|
||||
g = xchar_to_int(cs[i+2]) << 4
|
||||
g += xchar_to_int(cs[i+3])
|
||||
b = xchar_to_int(cs[i+4]) << 4
|
||||
b += xchar_to_int(cs[i+5])
|
||||
result.append((r, g, b))
|
||||
return result
|
||||
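The decoding above, illustrated (it is the inverse of colors_to_string in py/picture/cache.py; an in-place build of the extension is assumed): every 6 hex characters become one (r, g, b) tuple, and an incomplete trailing group is dropped.

from _cache import string_to_colors  # assumes the extension was built in place

assert string_to_colors('0a141e28323c') == [(10, 20, 30), (40, 50, 60)]
assert string_to_colors('0a141e28') == [(10, 20, 30)]  # trailing '28' is ignored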
14
py/modules/cache/setup.py
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env python
|
||||
# Created By: Virgil Dupras
|
||||
# Created On: 2009-04-23
|
||||
# $Id$
|
||||
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
|
||||
|
||||
from distutils.core import setup
|
||||
from distutils.extension import Extension
|
||||
from Cython.Distutils import build_ext
|
||||
|
||||
setup(
|
||||
cmdclass = {'build_ext': build_ext},
|
||||
ext_modules = [Extension("_cache", ["cache.pyx"])]
|
||||
)
|
||||
0
py/picture/__init__.py
Normal file
124
py/picture/block.py
Normal file
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: hs.picture.block
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/09/01
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-26 18:12:39 +0200 (Tue, 26 May 2009) $
|
||||
$Revision: 4365 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
from _block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2
|
||||
|
||||
# Converted to Cython
|
||||
# def getblock(image):
|
||||
# """Returns a 3 sized tuple containing the mean color of 'image'.
|
||||
#
|
||||
# image: a PIL image or crop.
|
||||
# """
|
||||
# if image.size[0]:
|
||||
# pixel_count = image.size[0] * image.size[1]
|
||||
# red = green = blue = 0
|
||||
# for r,g,b in image.getdata():
|
||||
# red += r
|
||||
# green += g
|
||||
# blue += b
|
||||
# return (red // pixel_count, green // pixel_count, blue // pixel_count)
|
||||
# else:
|
||||
# return (0,0,0)
|
||||
|
||||
# This is not used anymore
|
||||
# def getblocks(image,blocksize):
|
||||
# """Returns a list of blocks (3 sized tuples).
|
||||
#
|
||||
# image: A PIL image to base the blocks on.
|
||||
# blocksize: The size of the blocks to be create. This is a single integer, defining
|
||||
# both width and height (blocks are square).
|
||||
# """
|
||||
# if min(image.size) < blocksize:
|
||||
# return ()
|
||||
# result = []
|
||||
# for i in xrange(image.size[1] // blocksize):
|
||||
# for j in xrange(image.size[0] // blocksize):
|
||||
# box = (blocksize * j, blocksize * i, blocksize * (j + 1), blocksize * (i + 1))
|
||||
# crop = image.crop(box)
|
||||
# result.append(getblock(crop))
|
||||
# return result
|
||||
|
||||
# Converted to Cython
|
||||
# def getblocks2(image,block_count_per_side):
|
||||
# """Returns a list of blocks (3 sized tuples).
|
||||
#
|
||||
# image: A PIL image to base the blocks on.
|
||||
# block_count_per_side: This integer determine the number of blocks the function will return.
|
||||
# If it is 10, for example, 100 blocks will be returns (10 width, 10 height). The blocks will not
|
||||
# necessarely cover square areas. The area covered by each block will be proportional to the image
|
||||
# itself.
|
||||
# """
|
||||
# if not image.size[0]:
|
||||
# return []
|
||||
# width,height = image.size
|
||||
# block_width = max(width // block_count_per_side,1)
|
||||
# block_height = max(height // block_count_per_side,1)
|
||||
# result = []
|
||||
# for ih in range(block_count_per_side):
|
||||
# top = min(ih * block_height, height - block_height)
|
||||
# bottom = top + block_height
|
||||
# for iw in range(block_count_per_side):
|
||||
# left = min(iw * block_width, width - block_width)
|
||||
# right = left + block_width
|
||||
# box = (left,top,right,bottom)
|
||||
# crop = image.crop(box)
|
||||
# result.append(getblock(crop))
|
||||
# return result
|
||||
|
||||
# Converted to Cython
|
||||
# def diff(first, second):
|
||||
# """Returns the difference between the first block and the second.
|
||||
#
|
||||
# It returns an absolute sum of the 3 differences (RGB).
|
||||
# """
|
||||
# r1, g1, b1 = first
|
||||
# r2, g2, b2 = second
|
||||
# return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)
|
||||
|
||||
# Converted to Cython
|
||||
# def avgdiff(first, second, limit=768, min_iterations=1):
|
||||
# """Returns the average diff between first blocks and seconds.
|
||||
#
|
||||
# If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
|
||||
# iterations have been made in the blocks.
|
||||
# """
|
||||
# if len(first) != len(second):
|
||||
# raise DifferentBlockCountError
|
||||
# if not first:
|
||||
# raise NoBlocksError
|
||||
# count = len(first)
|
||||
# sum = 0
|
||||
# zipped = izip(xrange(1, count + 1), first, second)
|
||||
# for i, first, second in zipped:
|
||||
# sum += diff(first, second)
|
||||
# if sum > limit * i and i >= min_iterations:
|
||||
# return limit + 1
|
||||
# result = sum // count
|
||||
# if (not result) and sum:
|
||||
# result = 1
|
||||
# return result
|
||||
|
||||
# This is not used anymore
|
||||
# def maxdiff(first,second,limit=768):
|
||||
# """Returns the max diff between first blocks and seconds.
|
||||
#
|
||||
# If the result surpasses limit, the first max being over limit is returned.
|
||||
# """
|
||||
# if len(first) != len(second):
|
||||
# raise DifferentBlockCountError
|
||||
# if not first:
|
||||
# raise NoBlocksError
|
||||
# result = 0
|
||||
# zipped = zip(first,second)
|
||||
# for first,second in zipped:
|
||||
# result = max(result,diff(first,second))
|
||||
# if result > limit:
|
||||
# return result
|
||||
# return result
|
||||
313
py/picture/block_test.py
Normal file
@@ -0,0 +1,313 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: tests.picture.block
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/09/01
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
# The commented out tests are tests for function that have been converted to pure C for speed
|
||||
import unittest
|
||||
|
||||
from .block import *
|
||||
|
||||
def my_avgdiff(first, second, limit=768, min_iter=3): # this is so I don't have to re-write every call
|
||||
return avgdiff(first, second, limit, min_iter)
|
||||
|
||||
BLACK = (0,0,0)
|
||||
RED = (0xff,0,0)
|
||||
GREEN = (0,0xff,0)
|
||||
BLUE = (0,0,0xff)
|
||||
|
||||
class FakeImage(object):
|
||||
def __init__(self, size, data):
|
||||
self.size = size
|
||||
self.data = data
|
||||
|
||||
def getdata(self):
|
||||
return self.data
|
||||
|
||||
def crop(self, box):
|
||||
pixels = []
|
||||
for i in range(box[1], box[3]):
|
||||
for j in range(box[0], box[2]):
|
||||
pixel = self.data[i * self.size[0] + j]
|
||||
pixels.append(pixel)
|
||||
return FakeImage((box[2] - box[0], box[3] - box[1]), pixels)
|
||||
|
||||
def empty():
|
||||
return FakeImage((0,0), [])
|
||||
|
||||
def single_pixel(): #one red pixel
|
||||
return FakeImage((1, 1), [(0xff,0,0)])
|
||||
|
||||
def four_pixels():
|
||||
pixels = [RED,(0,0x80,0xff),(0x80,0,0),(0,0x40,0x80)]
|
||||
return FakeImage((2, 2), pixels)
|
||||
|
||||
class TCgetblock(unittest.TestCase):
|
||||
def test_single_pixel(self):
|
||||
im = single_pixel()
|
||||
[b] = getblocks2(im, 1)
|
||||
self.assertEqual(RED,b)
|
||||
|
||||
def test_no_pixel(self):
|
||||
im = empty()
|
||||
self.assertEqual([], getblocks2(im, 1))
|
||||
|
||||
def test_four_pixels(self):
|
||||
im = four_pixels()
|
||||
[b] = getblocks2(im, 1)
|
||||
meanred = (0xff + 0x80) // 4
|
||||
meangreen = (0x80 + 0x40) // 4
|
||||
meanblue = (0xff + 0x80) // 4
|
||||
self.assertEqual((meanred,meangreen,meanblue),b)
|
||||
|
||||
|
||||
# class TCdiff(unittest.TestCase):
|
||||
# def test_diff(self):
|
||||
# b1 = (10, 20, 30)
|
||||
# b2 = (1, 2, 3)
|
||||
# self.assertEqual(9 + 18 + 27,diff(b1,b2))
|
||||
#
|
||||
# def test_diff_negative(self):
|
||||
# b1 = (10, 20, 30)
|
||||
# b2 = (1, 2, 3)
|
||||
# self.assertEqual(9 + 18 + 27,diff(b2,b1))
|
||||
#
|
||||
# def test_diff_mixed_positive_and_negative(self):
|
||||
# b1 = (1, 5, 10)
|
||||
# b2 = (10, 1, 15)
|
||||
# self.assertEqual(9 + 4 + 5,diff(b1,b2))
|
||||
#
|
||||
|
||||
# class TCgetblocks(unittest.TestCase):
|
||||
# def test_empty_image(self):
|
||||
# im = empty()
|
||||
# blocks = getblocks(im,1)
|
||||
# self.assertEqual(0,len(blocks))
|
||||
#
|
||||
# def test_one_block_image(self):
|
||||
# im = four_pixels()
|
||||
# blocks = getblocks2(im, 1)
|
||||
# self.assertEqual(1,len(blocks))
|
||||
# block = blocks[0]
|
||||
# meanred = (0xff + 0x80) // 4
|
||||
# meangreen = (0x80 + 0x40) // 4
|
||||
# meanblue = (0xff + 0x80) // 4
|
||||
# self.assertEqual((meanred,meangreen,meanblue),block)
|
||||
#
|
||||
# def test_not_enough_height_to_fit_a_block(self):
|
||||
# im = FakeImage((2,1), [BLACK, BLACK])
|
||||
# blocks = getblocks(im,2)
|
||||
# self.assertEqual(0,len(blocks))
|
||||
#
|
||||
# def xtest_dont_include_leftovers(self):
|
||||
# # this test is disabled because getblocks is not used and getblock in cdeffed
|
||||
# pixels = [
|
||||
# RED,(0,0x80,0xff),BLACK,
|
||||
# (0x80,0,0),(0,0x40,0x80),BLACK,
|
||||
# BLACK,BLACK,BLACK
|
||||
# ]
|
||||
# im = FakeImage((3,3), pixels)
|
||||
# blocks = getblocks(im,2)
|
||||
# block = blocks[0]
|
||||
# #Because the block is smaller than the image, only blocksize must be considered.
|
||||
# meanred = (0xff + 0x80) // 4
|
||||
# meangreen = (0x80 + 0x40) // 4
|
||||
# meanblue = (0xff + 0x80) // 4
|
||||
# self.assertEqual((meanred,meangreen,meanblue),block)
|
||||
#
|
||||
# def xtest_two_blocks(self):
|
||||
# # this test is disabled because getblocks is not used and getblock in cdeffed
|
||||
# pixels = [BLACK for i in xrange(4 * 2)]
|
||||
# pixels[0] = RED
|
||||
# pixels[1] = (0,0x80,0xff)
|
||||
# pixels[4] = (0x80,0,0)
|
||||
# pixels[5] = (0,0x40,0x80)
|
||||
# im = FakeImage((4, 2), pixels)
|
||||
# blocks = getblocks(im,2)
|
||||
# self.assertEqual(2,len(blocks))
|
||||
# block = blocks[0]
|
||||
# #Because the block is smaller than the image, only blocksize must be considered.
|
||||
# meanred = (0xff + 0x80) // 4
|
||||
# meangreen = (0x80 + 0x40) // 4
|
||||
# meanblue = (0xff + 0x80) // 4
|
||||
# self.assertEqual((meanred,meangreen,meanblue),block)
|
||||
# self.assertEqual(BLACK,blocks[1])
|
||||
#
|
||||
# def test_four_blocks(self):
|
||||
# pixels = [BLACK for i in xrange(4 * 4)]
|
||||
# pixels[0] = RED
|
||||
# pixels[1] = (0,0x80,0xff)
|
||||
# pixels[4] = (0x80,0,0)
|
||||
# pixels[5] = (0,0x40,0x80)
|
||||
# im = FakeImage((4, 4), pixels)
|
||||
# blocks = getblocks2(im, 2)
|
||||
# self.assertEqual(4,len(blocks))
|
||||
# block = blocks[0]
|
||||
# #Because the block is smaller than the image, only blocksize must be considered.
|
||||
# meanred = (0xff + 0x80) // 4
|
||||
# meangreen = (0x80 + 0x40) // 4
|
||||
# meanblue = (0xff + 0x80) // 4
|
||||
# self.assertEqual((meanred,meangreen,meanblue),block)
|
||||
# self.assertEqual(BLACK,blocks[1])
|
||||
# self.assertEqual(BLACK,blocks[2])
|
||||
# self.assertEqual(BLACK,blocks[3])
|
||||
#
|
||||
|
||||
class TCgetblocks2(unittest.TestCase):
|
||||
def test_empty_image(self):
|
||||
im = empty()
|
||||
blocks = getblocks2(im,1)
|
||||
self.assertEqual(0,len(blocks))
|
||||
|
||||
def test_one_block_image(self):
|
||||
im = four_pixels()
|
||||
blocks = getblocks2(im,1)
|
||||
self.assertEqual(1,len(blocks))
|
||||
block = blocks[0]
|
||||
meanred = (0xff + 0x80) // 4
|
||||
meangreen = (0x80 + 0x40) // 4
|
||||
meanblue = (0xff + 0x80) // 4
|
||||
self.assertEqual((meanred,meangreen,meanblue),block)
|
||||
|
||||
def test_four_blocks_all_black(self):
|
||||
im = FakeImage((2, 2), [BLACK, BLACK, BLACK, BLACK])
|
||||
blocks = getblocks2(im,2)
|
||||
self.assertEqual(4,len(blocks))
|
||||
for block in blocks:
|
||||
self.assertEqual(BLACK,block)
|
||||
|
||||
def test_two_pixels_image_horizontal(self):
|
||||
pixels = [RED,BLUE]
|
||||
im = FakeImage((2, 1), pixels)
|
||||
blocks = getblocks2(im,2)
|
||||
self.assertEqual(4,len(blocks))
|
||||
self.assertEqual(RED,blocks[0])
|
||||
self.assertEqual(BLUE,blocks[1])
|
||||
self.assertEqual(RED,blocks[2])
|
||||
self.assertEqual(BLUE,blocks[3])
|
||||
|
||||
def test_two_pixels_image_vertical(self):
|
||||
pixels = [RED,BLUE]
|
||||
im = FakeImage((1, 2), pixels)
|
||||
blocks = getblocks2(im,2)
|
||||
self.assertEqual(4,len(blocks))
|
||||
self.assertEqual(RED,blocks[0])
|
||||
self.assertEqual(RED,blocks[1])
|
||||
self.assertEqual(BLUE,blocks[2])
|
||||
self.assertEqual(BLUE,blocks[3])
|
||||
|
||||
|
||||
class TCavgdiff(unittest.TestCase):
|
||||
def test_empty(self):
|
||||
self.assertRaises(NoBlocksError, my_avgdiff, [], [])
|
||||
|
||||
def test_two_blocks(self):
|
||||
im = empty()
|
||||
b1 = (5,10,15)
|
||||
b2 = (255,250,245)
|
||||
b3 = (0,0,0)
|
||||
b4 = (255,0,255)
|
||||
blocks1 = [b1,b2]
|
||||
blocks2 = [b3,b4]
|
||||
expected1 = 5 + 10 + 15
|
||||
expected2 = 0 + 250 + 10
|
||||
expected = (expected1 + expected2) // 2
|
||||
self.assertEqual(expected, my_avgdiff(blocks1, blocks2))
|
||||
|
||||
def test_blocks_not_the_same_size(self):
|
||||
b = (0,0,0)
|
||||
self.assertRaises(DifferentBlockCountError,my_avgdiff,[b,b],[b])
|
||||
|
||||
def test_first_arg_is_empty_but_not_second(self):
|
||||
#Don't return 0 (as when the 2 lists are empty), raise!
|
||||
b = (0,0,0)
|
||||
self.assertRaises(DifferentBlockCountError,my_avgdiff,[],[b])
|
||||
|
||||
def test_limit(self):
|
||||
ref = (0,0,0)
|
||||
b1 = (10,10,10) #avg 30
|
||||
b2 = (20,20,20) #avg 45
|
||||
b3 = (30,30,30) #avg 60
|
||||
blocks1 = [ref,ref,ref]
|
||||
blocks2 = [b1,b2,b3]
|
||||
self.assertEqual(45,my_avgdiff(blocks1,blocks2,44))
|
||||
|
||||
def test_min_iterations(self):
|
||||
ref = (0,0,0)
|
||||
b1 = (10,10,10) #avg 30
|
||||
b2 = (20,20,20) #avg 45
|
||||
b3 = (10,10,10) #avg 40
|
||||
blocks1 = [ref,ref,ref]
|
||||
blocks2 = [b1,b2,b3]
|
||||
self.assertEqual(40,my_avgdiff(blocks1,blocks2,45 - 1,3))
|
||||
|
||||
# Bah, I don't know why this test fails, but I don't think it matters very much
|
||||
# def test_just_over_the_limit(self):
|
||||
# #A score just over the limit might return exactly the limit due to truncating. We should
|
||||
# #ceil() the result in this case.
|
||||
# ref = (0,0,0)
|
||||
# b1 = (10,0,0)
|
||||
# b2 = (11,0,0)
|
||||
# blocks1 = [ref,ref]
|
||||
# blocks2 = [b1,b2]
|
||||
# self.assertEqual(11,my_avgdiff(blocks1,blocks2,10))
|
||||
#
|
||||
def test_return_at_least_1_at_the_slightest_difference(self):
|
||||
ref = (0,0,0)
|
||||
b1 = (1,0,0)
|
||||
blocks1 = [ref for i in xrange(250)]
|
||||
blocks2 = [ref for i in xrange(250)]
|
||||
blocks2[0] = b1
|
||||
self.assertEqual(1,my_avgdiff(blocks1,blocks2))
|
||||
|
||||
def test_return_0_if_there_is_no_difference(self):
|
||||
ref = (0,0,0)
|
||||
blocks1 = [ref,ref]
|
||||
blocks2 = [ref,ref]
|
||||
self.assertEqual(0,my_avgdiff(blocks1,blocks2))
|
||||
|
||||
|
||||
# class TCmaxdiff(unittest.TestCase):
|
||||
# def test_empty(self):
|
||||
# self.assertRaises(NoBlocksError,maxdiff,[],[])
|
||||
#
|
||||
# def test_two_blocks(self):
|
||||
# b1 = (5,10,15)
|
||||
# b2 = (255,250,245)
|
||||
# b3 = (0,0,0)
|
||||
# b4 = (255,0,255)
|
||||
# blocks1 = [b1,b2]
|
||||
# blocks2 = [b3,b4]
|
||||
# expected1 = 5 + 10 + 15
|
||||
# expected2 = 0 + 250 + 10
|
||||
# expected = max(expected1,expected2)
|
||||
# self.assertEqual(expected,maxdiff(blocks1,blocks2))
|
||||
#
|
||||
# def test_blocks_not_the_same_size(self):
|
||||
# b = (0,0,0)
|
||||
# self.assertRaises(DifferentBlockCountError,maxdiff,[b,b],[b])
|
||||
#
|
||||
# def test_first_arg_is_empty_but_not_second(self):
|
||||
# #Don't return 0 (as when the 2 lists are empty), raise!
|
||||
# b = (0,0,0)
|
||||
# self.assertRaises(DifferentBlockCountError,maxdiff,[],[b])
|
||||
#
|
||||
# def test_limit(self):
|
||||
# b1 = (5,10,15)
|
||||
# b2 = (255,250,245)
|
||||
# b3 = (0,0,0)
|
||||
# b4 = (255,0,255)
|
||||
# blocks1 = [b1,b2]
|
||||
# blocks2 = [b3,b4]
|
||||
# expected1 = 5 + 10 + 15
|
||||
# expected2 = 0 + 250 + 10
|
||||
# self.assertEqual(expected1,maxdiff(blocks1,blocks2,expected1 - 1))
|
||||
#
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
134
py/picture/cache.py
Normal file
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: hs.picture.cache
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/09/14
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4392 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
import sqlite3 as sqlite
|
||||
|
||||
import hsutil.sqlite
|
||||
|
||||
from _cache import string_to_colors
|
||||
|
||||
def colors_to_string(colors):
|
||||
"""Transform the 3 sized tuples 'colors' into a hex string.
|
||||
|
||||
[(0,100,255)] --> 0064ff
|
||||
[(1,2,3),(4,5,6)] --> 010203040506
|
||||
"""
|
||||
return ''.join(['%02x%02x%02x' % (r,g,b) for r,g,b in colors])
|
||||
|
||||
# This function is an important bottleneck of dupeGuru PE. It has been converted to Cython.
|
||||
# def string_to_colors(s):
|
||||
# """Transform the string 's' in a list of 3 sized tuples.
|
||||
# """
|
||||
# result = []
|
||||
# for i in xrange(0, len(s), 6):
|
||||
# number = int(s[i:i+6], 16)
|
||||
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
|
||||
# return result
|
||||
|
||||
class Cache(object):
|
||||
"""A class to cache picture blocks.
|
||||
"""
|
||||
def __init__(self, db=':memory:', threaded=True):
|
||||
def create_tables():
|
||||
sql = "create table pictures(path TEXT, blocks TEXT)"
|
||||
self.con.execute(sql)
|
||||
sql = "create index idx_path on pictures (path)"
|
||||
self.con.execute(sql)
|
||||
|
||||
self.dbname = db
|
||||
if threaded:
|
||||
self.con = hsutil.sqlite.ThreadedConn(db, True)
|
||||
else:
|
||||
self.con = sqlite.connect(db, isolation_level=None)
|
||||
try:
|
||||
self.con.execute("select * from pictures where 1=2")
|
||||
except sqlite.OperationalError: # new db
|
||||
create_tables()
|
||||
except sqlite.DatabaseError, e: # corrupted db
|
||||
logging.warning('Could not create picture cache because of an error: %s', str(e))
|
||||
self.con.close()
|
||||
os.remove(db)
|
||||
if threaded:
|
||||
self.con = hsutil.sqlite.ThreadedConn(db, True)
|
||||
else:
|
||||
self.con = sqlite.connect(db, isolation_level=None)
|
||||
create_tables()
|
||||
|
||||
def __contains__(self, key):
|
||||
sql = "select count(*) from pictures where path = ?"
|
||||
result = self.con.execute(sql, [key]).fetchall()
|
||||
return result[0][0] > 0
|
||||
|
||||
def __delitem__(self, key):
|
||||
if key not in self:
|
||||
raise KeyError(key)
|
||||
sql = "delete from pictures where path = ?"
|
||||
self.con.execute(sql, [key])
|
||||
|
||||
# Optimized
|
||||
def __getitem__(self, key):
|
||||
if isinstance(key, int):
|
||||
sql = "select blocks from pictures where rowid = ?"
|
||||
else:
|
||||
sql = "select blocks from pictures where path = ?"
|
||||
result = self.con.execute(sql, [key]).fetchone()
|
||||
if result:
|
||||
result = string_to_colors(result[0])
|
||||
return result
|
||||
else:
|
||||
raise KeyError(key)
|
||||
|
||||
def __iter__(self):
|
||||
sql = "select path from pictures"
|
||||
result = self.con.execute(sql)
|
||||
return (row[0] for row in result)
|
||||
|
||||
def __len__(self):
|
||||
sql = "select count(*) from pictures"
|
||||
result = self.con.execute(sql).fetchall()
|
||||
return result[0][0]
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
value = colors_to_string(value)
|
||||
if key in self:
|
||||
sql = "update pictures set blocks = ? where path = ?"
|
||||
else:
|
||||
sql = "insert into pictures(blocks,path) values(?,?)"
|
||||
try:
|
||||
self.con.execute(sql, [value, key])
|
||||
except sqlite.OperationalError:
|
||||
logging.warning('Picture cache could not set %r for key %r', value, key)
|
||||
except sqlite.DatabaseError, e:
|
||||
logging.warning('DatabaseError while setting %r for key %r: %s', value, key, str(e))
|
||||
|
||||
def clear(self):
|
||||
sql = "delete from pictures"
|
||||
self.con.execute(sql)
|
||||
|
||||
def filter(self, func):
|
||||
to_delete = [key for key in self if not func(key)]
|
||||
for key in to_delete:
|
||||
del self[key]
|
||||
|
||||
def get_id(self, path):
|
||||
sql = "select rowid from pictures where path = ?"
|
||||
result = self.con.execute(sql, [path]).fetchone()
|
||||
if result:
|
||||
return result[0]
|
||||
else:
|
||||
raise ValueError(path)
|
||||
|
||||
def get_multiple(self, rowids):
|
||||
sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids))
|
||||
cur = self.con.execute(sql)
|
||||
return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
|
||||
|
||||
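A short sketch of the Cache API above, using the default in-memory database (the import path is assumed; the path key is a placeholder):

from cache import Cache  # import path assumed

c = Cache()                                      # ':memory:' by default
c['/pics/a.jpg'] = [(10, 20, 30), (40, 50, 60)]
rowid = c.get_id('/pics/a.jpg')
assert c[rowid] == [(10, 20, 30), (40, 50, 60)]  # lookup works by rowid or by path
for rowid, blocks in c.get_multiple([rowid]):
    assert blocks == [(10, 20, 30), (40, 50, 60)]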
159
py/picture/cache_test.py
Normal file
@@ -0,0 +1,159 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: tests.picture.cache
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/09/14
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import unittest
|
||||
from StringIO import StringIO
|
||||
import os.path as op
|
||||
import os
|
||||
import threading
|
||||
|
||||
from hsutil.testcase import TestCase
|
||||
from .cache import *
|
||||
|
||||
class TCcolors_to_string(unittest.TestCase):
|
||||
def test_no_color(self):
|
||||
self.assertEqual('',colors_to_string([]))
|
||||
|
||||
def test_single_color(self):
|
||||
self.assertEqual('000000',colors_to_string([(0,0,0)]))
|
||||
self.assertEqual('010101',colors_to_string([(1,1,1)]))
|
||||
self.assertEqual('0a141e',colors_to_string([(10,20,30)]))
|
||||
|
||||
def test_two_colors(self):
|
||||
self.assertEqual('000102030405',colors_to_string([(0,1,2),(3,4,5)]))
|
||||
|
||||
|
||||
class TCstring_to_colors(unittest.TestCase):
|
||||
def test_empty(self):
|
||||
self.assertEqual([],string_to_colors(''))
|
||||
|
||||
def test_single_color(self):
|
||||
self.assertEqual([(0,0,0)],string_to_colors('000000'))
|
||||
self.assertEqual([(2,3,4)],string_to_colors('020304'))
|
||||
self.assertEqual([(10,20,30)],string_to_colors('0a141e'))
|
||||
|
||||
def test_two_colors(self):
|
||||
self.assertEqual([(10,20,30),(40,50,60)],string_to_colors('0a141e28323c'))
|
||||
|
||||
def test_incomplete_color(self):
|
||||
# don't return anything if it's not a complete color
|
||||
self.assertEqual([],string_to_colors('102'))
|
||||
|
||||
|
||||
class TCCache(TestCase):
|
||||
def test_empty(self):
|
||||
c = Cache()
|
||||
self.assertEqual(0,len(c))
|
||||
self.assertRaises(KeyError,c.__getitem__,'foo')
|
||||
|
||||
def test_set_then_retrieve_blocks(self):
|
||||
c = Cache()
|
||||
b = [(0,0,0),(1,2,3)]
|
||||
c['foo'] = b
|
||||
self.assertEqual(b,c['foo'])
|
||||
|
||||
def test_delitem(self):
|
||||
c = Cache()
|
||||
c['foo'] = ''
|
||||
del c['foo']
|
||||
self.assert_('foo' not in c)
|
||||
self.assertRaises(KeyError,c.__delitem__,'foo')
|
||||
|
||||
def test_persistance(self):
|
||||
DBNAME = op.join(self.tmpdir(), 'hstest.db')
|
||||
c = Cache(DBNAME)
|
||||
c['foo'] = [(1,2,3)]
|
||||
del c
|
||||
c = Cache(DBNAME)
|
||||
self.assertEqual([(1,2,3)],c['foo'])
|
||||
del c
|
||||
os.remove(DBNAME)
|
||||
|
||||
def test_filter(self):
|
||||
c = Cache()
|
||||
c['foo'] = ''
|
||||
c['bar'] = ''
|
||||
c['baz'] = ''
|
||||
c.filter(lambda p:p != 'bar') #only 'bar' is removed
|
||||
self.assertEqual(2,len(c))
|
||||
self.assert_('foo' in c)
|
||||
self.assert_('baz' in c)
|
||||
self.assert_('bar' not in c)
|
||||
|
||||
def test_clear(self):
|
||||
c = Cache()
|
||||
c['foo'] = ''
|
||||
c['bar'] = ''
|
||||
c['baz'] = ''
|
||||
c.clear()
|
||||
self.assertEqual(0,len(c))
|
||||
self.assert_('foo' not in c)
|
||||
self.assert_('baz' not in c)
|
||||
self.assert_('bar' not in c)
|
||||
|
||||
def test_corrupted_db(self):
|
||||
dbname = op.join(self.tmpdir(), 'foo.db')
|
||||
fp = open(dbname, 'w')
|
||||
fp.write('invalid sqlite content')
|
||||
fp.close()
|
||||
c = Cache(dbname) # should not raise a DatabaseError
|
||||
c['foo'] = [(1, 2, 3)]
|
||||
del c
|
||||
c = Cache(dbname)
|
||||
self.assertEqual(c['foo'], [(1, 2, 3)])
|
||||
|
||||
def test_by_id(self):
|
||||
# it's possible to use the cache by referring to the files by their row_id
|
||||
c = Cache()
|
||||
b = [(0,0,0),(1,2,3)]
|
||||
c['foo'] = b
|
||||
foo_id = c.get_id('foo')
|
||||
self.assertEqual(c[foo_id], b)
|
||||
|
||||
|
||||
class TCCacheSQLEscape(unittest.TestCase):
|
||||
def test_contains(self):
|
||||
c = Cache()
|
||||
self.assert_("foo'bar" not in c)
|
||||
|
||||
def test_getitem(self):
|
||||
c = Cache()
|
||||
self.assertRaises(KeyError, c.__getitem__, "foo'bar")
|
||||
|
||||
def test_setitem(self):
|
||||
c = Cache()
|
||||
c["foo'bar"] = []
|
||||
|
||||
def test_delitem(self):
|
||||
c = Cache()
|
||||
c["foo'bar"] = []
|
||||
try:
|
||||
del c["foo'bar"]
|
||||
except KeyError:
|
||||
self.fail()
|
||||
|
||||
|
||||
class TCCacheThreaded(unittest.TestCase):
|
||||
def test_access_cache(self):
|
||||
def thread_run():
|
||||
try:
|
||||
c['foo'] = [(1,2,3)]
|
||||
except sqlite.ProgrammingError:
|
||||
self.fail()
|
||||
|
||||
c = Cache()
|
||||
t = threading.Thread(target=thread_run)
|
||||
t.start()
|
||||
t.join()
|
||||
self.assertEqual([(1,2,3)], c['foo'])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
136
py/picture/matchbase.py
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: hs.picture._match
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2007/02/25
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4388 $
|
||||
Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import logging
|
||||
import multiprocessing
|
||||
from Queue import Empty
|
||||
from collections import defaultdict
|
||||
|
||||
from hsutil import job
|
||||
from hs.utils.misc import dedupe
|
||||
|
||||
from dupeguru.engine import Match
|
||||
from block import avgdiff, DifferentBlockCountError, NoBlocksError
|
||||
from cache import Cache
|
||||
|
||||
MIN_ITERATIONS = 3
|
||||
|
||||
def get_match(first,second,percentage):
|
||||
if percentage < 0:
|
||||
percentage = 0
|
||||
return Match(first,second,percentage)
|
||||
|
||||
class MatchFactory(object):
|
||||
cached_blocks = None
|
||||
block_count_per_side = 15
|
||||
threshold = 75
|
||||
match_scaled = False
|
||||
|
||||
def _do_getmatches(self, files, j):
|
||||
raise NotImplementedError()
|
||||
|
||||
def getmatches(self, files, j=job.nulljob):
|
||||
# The MemoryError handlers below log without first checking whether enough memory is left to
# do so, because the MemoryError is assumed to happen while reading an image file, and that
# image has been freed from memory by the time the MemoryError is raised.
|
||||
j = j.start_subjob([2, 8])
|
||||
logging.info('Preparing %d files' % len(files))
|
||||
prepared = self.prepare_files(files, j)
|
||||
logging.info('Finished preparing %d files' % len(prepared))
|
||||
return self._do_getmatches(prepared, j)
|
||||
|
||||
def prepare_files(self, files, j=job.nulljob):
|
||||
prepared = [] # only files for which there was no error getting blocks
|
||||
try:
|
||||
for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
|
||||
picture.dimensions
|
||||
picture.unicode_path = unicode(picture.path)
|
||||
try:
|
||||
if picture.unicode_path not in self.cached_blocks:
|
||||
blocks = picture.get_blocks(self.block_count_per_side)
|
||||
self.cached_blocks[picture.unicode_path] = blocks
|
||||
prepared.append(picture)
|
||||
except IOError as e:
|
||||
logging.warning(unicode(e))
|
||||
except MemoryError:
|
||||
logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
|
||||
if picture.size < 10 * 1024 * 1024: # We're really running out of memory
|
||||
raise
|
||||
except MemoryError:
|
||||
logging.warning('Ran out of memory while preparing files')
|
||||
return prepared
|
||||
|
||||
|
||||
def async_compare(ref_id, other_ids, dbname, threshold):
|
||||
cache = Cache(dbname, threaded=False)
|
||||
limit = 100 - threshold
|
||||
ref_blocks = cache[ref_id]
|
||||
pairs = cache.get_multiple(other_ids)
|
||||
results = []
|
||||
for other_id, other_blocks in pairs:
|
||||
try:
|
||||
diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
|
||||
percentage = 100 - diff
|
||||
except (DifferentBlockCountError, NoBlocksError):
|
||||
percentage = 0
|
||||
if percentage >= threshold:
|
||||
results.append((ref_id, other_id, percentage))
|
||||
cache.con.close()
|
||||
return results
|
||||
|
||||
class AsyncMatchFactory(MatchFactory):
|
||||
def _do_getmatches(self, pictures, j):
|
||||
def empty_out_queue(queue, into):
|
||||
try:
|
||||
while True:
|
||||
into.append(queue.get(block=False))
|
||||
except Empty:
|
||||
pass
|
||||
|
||||
j = j.start_subjob([1, 8, 1], 'Preparing for matching')
|
||||
cache = self.cached_blocks
|
||||
id2picture = {}
|
||||
dimensions2pictures = defaultdict(set)
|
||||
for picture in pictures[:]:
|
||||
try:
|
||||
picture.cache_id = cache.get_id(picture.unicode_path)
|
||||
id2picture[picture.cache_id] = picture
|
||||
except ValueError:
|
||||
pictures.remove(picture)
|
||||
if not self.match_scaled:
|
||||
dimensions2pictures[picture.dimensions].add(picture)
|
||||
pool = multiprocessing.Pool()
|
||||
async_results = []
|
||||
pictures_copy = set(pictures)
|
||||
for ref in j.iter_with_progress(pictures):
|
||||
others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
|
||||
others.remove(ref)
|
||||
if others:
|
||||
cache_ids = [f.cache_id for f in others]
|
||||
args = (ref.cache_id, cache_ids, self.cached_blocks.dbname, self.threshold)
|
||||
async_results.append(pool.apply_async(async_compare, args))
|
||||
|
||||
matches = []
|
||||
for result in j.iter_with_progress(async_results, 'Matched %d/%d pictures'):
|
||||
matches.extend(result.get())
|
||||
|
||||
result = []
|
||||
for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
|
||||
ref = id2picture[ref_id]
|
||||
other = id2picture[other_id]
|
||||
if percentage == 100 and ref.md5 != other.md5:
|
||||
percentage = 99
|
||||
if percentage >= self.threshold:
|
||||
result.append(get_match(ref, other, percentage))
|
||||
return result
|
||||
|
||||
|
||||
multiprocessing.freeze_support()
|
||||
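A small numeric sketch (made-up values) of the scoring used above: avgdiff is capped at limit = 100 - threshold, the percentage is 100 minus the diff, and pairs below the threshold are discarded.

threshold = 75
limit = 100 - threshold        # 25: avgdiff() returns limit + 1 past this point
raw_diff = 12                  # assumed avgdiff() result for two pictures
percentage = 100 - raw_diff    # 88, kept because it meets the threshold
assert percentage >= threshold

too_different = limit + 1      # what avgdiff() returns when it gives up
assert 100 - too_different < threshold   # such a pair is discarded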
359
py/results.py
Normal file
@@ -0,0 +1,359 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: dupeguru.results
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/02/23
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4392 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import re
|
||||
from xml.sax import handler, make_parser, SAXException
|
||||
from xml.sax.saxutils import XMLGenerator
|
||||
from xml.sax.xmlreader import AttributesImpl
|
||||
|
||||
from . import engine
|
||||
from hsutil.job import nulljob
|
||||
from hsutil.markable import Markable
|
||||
from hsutil.misc import flatten, cond, nonone
|
||||
from hsutil.str import format_size
|
||||
from hsutil.files import open_if_filename
|
||||
|
||||
class Results(Markable):
|
||||
#---Override
|
||||
def __init__(self, data_module):
|
||||
super(Results, self).__init__()
|
||||
self.__groups = []
|
||||
self.__group_of_duplicate = {}
|
||||
self.__groups_sort_descriptor = None # This is a tuple (key, asc)
|
||||
self.__dupes = None
|
||||
self.__dupes_sort_descriptor = None # This is a tuple (key, asc, delta)
|
||||
self.__filters = None
|
||||
self.__filtered_dupes = None
|
||||
self.__filtered_groups = None
|
||||
self.__recalculate_stats()
|
||||
self.__marked_size = 0
|
||||
self.data = data_module
|
||||
|
||||
def _did_mark(self, dupe):
|
||||
self.__marked_size += dupe.size
|
||||
|
||||
def _did_unmark(self, dupe):
|
||||
self.__marked_size -= dupe.size
|
||||
|
||||
def _get_markable_count(self):
|
||||
return self.__total_count
|
||||
|
||||
def _is_markable(self, dupe):
|
||||
if dupe.is_ref:
|
||||
return False
|
||||
g = self.get_group_of_duplicate(dupe)
|
||||
if not g:
|
||||
return False
|
||||
if dupe is g.ref:
|
||||
return False
|
||||
if self.__filtered_dupes and dupe not in self.__filtered_dupes:
|
||||
return False
|
||||
return True
|
||||
|
||||
#---Private
|
||||
def __get_dupe_list(self):
|
||||
if self.__dupes is None:
|
||||
self.__dupes = flatten(group.dupes for group in self.groups)
|
||||
if self.__filtered_dupes:
|
||||
self.__dupes = [dupe for dupe in self.__dupes if dupe in self.__filtered_dupes]
|
||||
sd = self.__dupes_sort_descriptor
|
||||
if sd:
|
||||
self.sort_dupes(sd[0], sd[1], sd[2])
|
||||
return self.__dupes
|
||||
|
||||
def __get_groups(self):
|
||||
if self.__filtered_groups is None:
|
||||
return self.__groups
|
||||
else:
|
||||
return self.__filtered_groups
|
||||
|
||||
def __get_stat_line(self):
|
||||
if self.__filtered_dupes is None:
|
||||
mark_count = self.mark_count
|
||||
marked_size = self.__marked_size
|
||||
total_count = self.__total_count
|
||||
total_size = self.__total_size
|
||||
else:
|
||||
mark_count = len([dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)])
|
||||
marked_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe))
|
||||
total_count = len([dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)])
|
||||
total_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe))
|
||||
if self.mark_inverted:
|
||||
marked_size = self.__total_size - marked_size
|
||||
result = '%d / %d (%s / %s) duplicates marked.' % (
|
||||
mark_count,
|
||||
total_count,
|
||||
format_size(marked_size, 2),
|
||||
format_size(total_size, 2),
|
||||
)
|
||||
if self.__filters:
|
||||
result += ' filter: %s' % ' --> '.join(self.__filters)
|
||||
return result
|
||||
|
||||
def __recalculate_stats(self):
|
||||
self.__total_size = 0
|
||||
self.__total_count = 0
|
||||
for group in self.groups:
|
||||
markable = [dupe for dupe in group.dupes if self._is_markable(dupe)]
|
||||
self.__total_count += len(markable)
|
||||
self.__total_size += sum(dupe.size for dupe in markable)
|
||||
|
||||
def __set_groups(self, new_groups):
|
||||
self.mark_none()
|
||||
self.__groups = new_groups
|
||||
self.__group_of_duplicate = {}
|
||||
for g in self.__groups:
|
||||
for dupe in g:
|
||||
self.__group_of_duplicate[dupe] = g
|
||||
if not hasattr(dupe, 'is_ref'):
|
||||
dupe.is_ref = False
|
||||
old_filters = nonone(self.__filters, [])
|
||||
self.apply_filter(None)
|
||||
for filter_str in old_filters:
|
||||
self.apply_filter(filter_str)
|
||||
|
||||
#---Public
|
||||
def apply_filter(self, filter_str):
|
||||
''' Applies a filter 'filter_str' to self.groups
|
||||
|
||||
When you apply the filter, only dupes whose filename matches 'filter_str' will be kept
|
||||
in the results. To cancel the filter, call apply_filter with None as 'filter_str',
|
||||
and the results will go back to normal.
|
||||
|
||||
If you call apply_filter on already filtered results, the new filter will be applied
|
||||
*on the filtered results*.
|
||||
|
||||
'filter_str' is a string containing a regexp to filter dupes with.
|
||||
'''
|
||||
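# Illustrative usage (a sketch; assumes `results` is a populated Results instance):
#     results.apply_filter('foo')   # keep only dupes whose name matches 'foo'
#     results.apply_filter('bar')   # narrows the already filtered results further
#     results.apply_filter(None)    # cancels all filters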
if not filter_str:
|
||||
self.__filtered_dupes = None
|
||||
self.__filtered_groups = None
|
||||
self.__filters = None
|
||||
else:
|
||||
if not self.__filters:
|
||||
self.__filters = []
|
||||
self.__filters.append(filter_str)
|
||||
filter_re = re.compile(filter_str, re.IGNORECASE)
|
||||
if self.__filtered_dupes is None:
|
||||
self.__filtered_dupes = flatten(g[:] for g in self.groups)
|
||||
self.__filtered_dupes = set(dupe for dupe in self.__filtered_dupes if filter_re.search(dupe.name))
|
||||
filtered_groups = set()
|
||||
for dupe in self.__filtered_dupes:
|
||||
filtered_groups.add(self.get_group_of_duplicate(dupe))
|
||||
self.__filtered_groups = list(filtered_groups)
|
||||
self.__recalculate_stats()
|
||||
sd = self.__groups_sort_descriptor
|
||||
if sd:
|
||||
self.sort_groups(sd[0], sd[1])
|
||||
self.__dupes = None
|
||||
|
||||
def get_group_of_duplicate(self, dupe):
|
||||
try:
|
||||
return self.__group_of_duplicate[dupe]
|
||||
except (TypeError, KeyError):
|
||||
return None
|
||||
|
||||
is_markable = _is_markable
|
||||
|
||||
def load_from_xml(self, infile, get_file, j=nulljob):
|
||||
self.apply_filter(None)
|
||||
handler = _ResultsHandler(get_file)
|
||||
parser = make_parser()
|
||||
parser.setContentHandler(handler)
|
||||
try:
|
||||
infile, must_close = open_if_filename(infile)
|
||||
except IOError:
|
||||
return
|
||||
BUFSIZE = 1024 * 1024 # 1mb buffer
|
||||
infile.seek(0, 2)
|
||||
j.start_job(infile.tell() // BUFSIZE)
|
||||
infile.seek(0, 0)
|
||||
try:
|
||||
while True:
|
||||
data = infile.read(BUFSIZE)
|
||||
if not data:
|
||||
break
|
||||
parser.feed(data)
|
||||
j.add_progress()
|
||||
except SAXException:
|
||||
return
|
||||
self.groups = handler.groups
|
||||
for dupe_file in handler.marked:
|
||||
self.mark(dupe_file)
|
||||
|
||||
def make_ref(self, dupe):
|
||||
g = self.get_group_of_duplicate(dupe)
|
||||
r = g.ref
|
||||
self._remove_mark_flag(dupe)
|
||||
g.switch_ref(dupe)
|
||||
if not r.is_ref:
|
||||
self.__total_count += 1
|
||||
self.__total_size += r.size
|
||||
if not dupe.is_ref:
|
||||
self.__total_count -= 1
|
||||
self.__total_size -= dupe.size
|
||||
self.__dupes = None
|
||||
|
||||
def perform_on_marked(self, func, remove_from_results):
|
||||
problems = []
|
||||
for d in self.dupes:
|
||||
if self.is_marked(d) and (not func(d)):
|
||||
problems.append(d)
|
||||
if remove_from_results:
|
||||
to_remove = [d for d in self.dupes if self.is_marked(d) and (d not in problems)]
|
||||
self.remove_duplicates(to_remove)
|
||||
self.mark_none()
|
||||
for d in problems:
|
||||
self.mark(d)
|
||||
return len(problems)
|
||||
|
||||
def remove_duplicates(self, dupes):
|
||||
'''Remove 'dupes' from their respective group, and remove the group if it ends up empty.
|
||||
'''
|
||||
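# Illustrative (assumes a populated Results instance `results`): removing every
# remaining dupe of a group also drops the group itself from the results.
#     results.remove_duplicates(list(results.groups[0].dupes))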
affected_groups = set()
|
||||
for dupe in dupes:
|
||||
group = self.get_group_of_duplicate(dupe)
|
||||
if dupe not in group.dupes:
|
||||
return
|
||||
group.remove_dupe(dupe, False)
|
||||
self._remove_mark_flag(dupe)
|
||||
self.__total_count -= 1
|
||||
self.__total_size -= dupe.size
|
||||
if not group:
|
||||
self.__groups.remove(group)
|
||||
if self.__filtered_groups:
|
||||
self.__filtered_groups.remove(group)
|
||||
else:
|
||||
affected_groups.add(group)
|
||||
for group in affected_groups:
|
||||
group.clean_matches()
|
||||
self.__dupes = None
|
||||
|
||||
def save_to_xml(self, outfile, with_data=False):
|
||||
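# Rough sketch of the XML this writer emits (attribute values are illustrative):
#   <results>
#     <group>
#       <file path="..." is_ref="y|n" words="foo,bar" marked="y|n">
#         <data value="..."/>   <!-- one per column, only when with_data is True -->
#       </file>
#       <match first="0" second="1" percentage="50"/>
#     </group>
#   </results>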
self.apply_filter(None)
|
||||
outfile, must_close = open_if_filename(outfile, 'wb')
|
||||
writer = XMLGenerator(outfile, 'utf-8')
|
||||
writer.startDocument()
|
||||
empty_attrs = AttributesImpl({})
|
||||
writer.startElement('results', empty_attrs)
|
||||
for g in self.groups:
|
||||
writer.startElement('group', empty_attrs)
|
||||
dupe2index = {}
|
||||
for index, d in enumerate(g):
|
||||
dupe2index[d] = index
|
||||
try:
|
||||
words = engine.unpack_fields(d.words)
|
||||
except AttributeError:
|
||||
words = ()
|
||||
attrs = AttributesImpl({
|
||||
'path': unicode(d.path),
|
||||
'is_ref': cond(d.is_ref, 'y', 'n'),
|
||||
'words': ','.join(words),
|
||||
'marked': cond(self.is_marked(d), 'y', 'n')
|
||||
})
|
||||
writer.startElement('file', attrs)
|
||||
if with_data:
|
||||
data_list = self.data.GetDisplayInfo(d, g)
|
||||
for data in data_list:
|
||||
attrs = AttributesImpl({
|
||||
'value': data,
|
||||
})
|
||||
writer.startElement('data', attrs)
|
||||
writer.endElement('data')
|
||||
writer.endElement('file')
|
||||
for match in g.matches:
|
||||
attrs = AttributesImpl({
|
||||
'first': str(dupe2index[match.first]),
|
||||
'second': str(dupe2index[match.second]),
|
||||
'percentage': str(int(match.percentage)),
|
||||
})
|
||||
writer.startElement('match', attrs)
|
||||
writer.endElement('match')
|
||||
writer.endElement('group')
|
||||
writer.endElement('results')
|
||||
writer.endDocument()
|
||||
if must_close:
|
||||
outfile.close()
|
||||
|
||||
def sort_dupes(self, key, asc=True, delta=False):
|
||||
if not self.__dupes:
|
||||
self.__get_dupe_list()
|
||||
self.__dupes.sort(key=lambda d: self.data.GetDupeSortKey(d, lambda: self.get_group_of_duplicate(d), key, delta))
|
||||
if not asc:
|
||||
self.__dupes.reverse()
|
||||
self.__dupes_sort_descriptor = (key,asc,delta)
|
||||
|
||||
def sort_groups(self,key,asc=True):
|
||||
self.groups.sort(key=lambda g: self.data.GetGroupSortKey(g, key))
|
||||
if not asc:
|
||||
self.groups.reverse()
|
||||
self.__groups_sort_descriptor = (key,asc)
|
||||
|
||||
#---Properties
|
||||
dupes = property(__get_dupe_list)
|
||||
groups = property(__get_groups, __set_groups)
|
||||
stat_line = property(__get_stat_line)
|
||||
|
||||
class _ResultsHandler(handler.ContentHandler):
|
||||
def __init__(self, get_file):
|
||||
self.group = None
|
||||
self.dupes = None
|
||||
self.marked = set()
|
||||
self.groups = []
|
||||
self.get_file = get_file
|
||||
|
||||
def startElement(self, name, attrs):
|
||||
if name == 'group':
|
||||
self.group = engine.Group()
|
||||
self.dupes = []
|
||||
return
|
||||
if (name == 'file') and (self.group is not None):
|
||||
if not (('path' in attrs) and ('words' in attrs)):
|
||||
return
|
||||
path = attrs['path']
|
||||
file = self.get_file(path)
|
||||
if file is None:
|
||||
return
|
||||
file.words = attrs['words'].split(',')
|
||||
file.is_ref = attrs.get('is_ref') == 'y'
|
||||
self.dupes.append(file)
|
||||
if attrs.get('marked') == 'y':
|
||||
self.marked.add(file)
|
||||
if (name == 'match') and (self.group is not None):
|
||||
try:
|
||||
first_file = self.dupes[int(attrs['first'])]
|
||||
second_file = self.dupes[int(attrs['second'])]
|
||||
percentage = int(attrs['percentage'])
|
||||
self.group.add_match(engine.Match(first_file, second_file, percentage))
|
||||
except (IndexError, KeyError, ValueError): # Covers missing attr, non-int values and indexes out of bounds
|
||||
pass
|
||||
|
||||
def endElement(self, name):
|
||||
def do_match(ref_file, other_files, group):
|
||||
if not other_files:
|
||||
return
|
||||
for other_file in other_files:
|
||||
group.add_match(engine.get_match(ref_file, other_file))
|
||||
do_match(other_files[0], other_files[1:], group)
|
||||
|
||||
if name == 'group':
|
||||
group = self.group
|
||||
self.group = None
|
||||
dupes = self.dupes
|
||||
self.dupes = []
|
||||
if group is None:
|
||||
return
|
||||
if len(dupes) < 2:
|
||||
return
|
||||
if not group.matches: # <match> elements not present, do it manually, without %
|
||||
do_match(dupes[0], dupes[1:], group)
|
||||
group.prioritize(lambda x: dupes.index(x))
|
||||
self.groups.append(group)
|
||||
|
||||
742
py/results_test.py
Normal file
742
py/results_test.py
Normal file
@@ -0,0 +1,742 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: dupeguru.tests.results
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/02/23
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import unittest
|
||||
import StringIO
|
||||
import xml.dom.minidom
|
||||
import os.path as op
|
||||
|
||||
from hsutil.path import Path
|
||||
from hsutil.testcase import TestCase
|
||||
from hsutil.misc import first
|
||||
|
||||
from . import engine_test
|
||||
from . import data
|
||||
from . import engine
|
||||
from .results import *
|
||||
|
||||
class NamedObject(engine_test.NamedObject):
|
||||
size = 1
|
||||
path = property(lambda x:Path('basepath') + x.name)
|
||||
is_ref = False
|
||||
|
||||
def __nonzero__(self):
|
||||
return False #Make sure that operations are made correctly when the bool value of files is false.
|
||||
|
||||
# Returns a group set that looks like that:
|
||||
# "foo bar" (1)
|
||||
# "bar bleh" (1024)
|
||||
# "foo bleh" (1)
|
||||
# "ibabtu" (1)
|
||||
# "ibabtu" (1)
|
||||
def GetTestGroups():
|
||||
objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
|
||||
objects[1].size = 1024
|
||||
matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
|
||||
groups = engine.get_groups(matches) #We should have 2 groups
|
||||
for g in groups:
|
||||
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
|
||||
groups.sort(key=len, reverse=True) # We want the group with 3 members to be first.
|
||||
return (objects,matches,groups)
|
||||
|
||||
class TCResultsEmpty(TestCase):
|
||||
def setUp(self):
|
||||
self.results = Results(data)
|
||||
|
||||
def test_stat_line(self):
|
||||
self.assertEqual("0 / 0 (0.00 B / 0.00 B) duplicates marked.",self.results.stat_line)
|
||||
|
||||
def test_groups(self):
|
||||
self.assertEqual(0,len(self.results.groups))
|
||||
|
||||
def test_get_group_of_duplicate(self):
|
||||
self.assert_(self.results.get_group_of_duplicate('foo') is None)
|
||||
|
||||
def test_save_to_xml(self):
|
||||
f = StringIO.StringIO()
|
||||
self.results.save_to_xml(f)
|
||||
f.seek(0)
|
||||
doc = xml.dom.minidom.parse(f)
|
||||
root = doc.documentElement
|
||||
self.assertEqual('results',root.nodeName)
|
||||
|
||||
|
||||
class TCResultsWithSomeGroups(TestCase):
|
||||
def setUp(self):
|
||||
self.results = Results(data)
|
||||
self.objects,self.matches,self.groups = GetTestGroups()
|
||||
self.results.groups = self.groups
|
||||
|
||||
def test_stat_line(self):
|
||||
self.assertEqual("0 / 3 (0.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
|
||||
|
||||
def test_groups(self):
|
||||
self.assertEqual(2,len(self.results.groups))
|
||||
|
||||
def test_get_group_of_duplicate(self):
|
||||
for o in self.objects:
|
||||
g = self.results.get_group_of_duplicate(o)
|
||||
self.assert_(isinstance(g, engine.Group))
|
||||
self.assert_(o in g)
|
||||
self.assert_(self.results.get_group_of_duplicate(self.groups[0]) is None)
|
||||
|
||||
def test_remove_duplicates(self):
|
||||
g1,g2 = self.results.groups
|
||||
self.results.remove_duplicates([g1.dupes[0]])
|
||||
self.assertEqual(2,len(g1))
|
||||
self.assert_(g1 in self.results.groups)
|
||||
self.results.remove_duplicates([g1.ref])
|
||||
self.assertEqual(2,len(g1))
|
||||
self.assert_(g1 in self.results.groups)
|
||||
self.results.remove_duplicates([g1.dupes[0]])
|
||||
self.assertEqual(0,len(g1))
|
||||
self.assert_(g1 not in self.results.groups)
|
||||
self.results.remove_duplicates([g2.dupes[0]])
|
||||
self.assertEqual(0,len(g2))
|
||||
self.assert_(g2 not in self.results.groups)
|
||||
self.assertEqual(0,len(self.results.groups))
|
||||
|
||||
def test_remove_duplicates_with_ref_files(self):
|
||||
g1,g2 = self.results.groups
|
||||
self.objects[0].is_ref = True
|
||||
self.objects[1].is_ref = True
|
||||
self.results.remove_duplicates([self.objects[2]])
|
||||
self.assertEqual(0,len(g1))
|
||||
self.assert_(g1 not in self.results.groups)
|
||||
|
||||
def test_make_ref(self):
|
||||
g = self.results.groups[0]
|
||||
d = g.dupes[0]
|
||||
self.results.make_ref(d)
|
||||
self.assert_(d is g.ref)
|
||||
|
||||
def test_sort_groups(self):
|
||||
self.results.make_ref(self.objects[1]) #We want to make the 1024 sized object to go ref.
|
||||
g1,g2 = self.groups
|
||||
self.results.sort_groups(2) #2 is the key for size
|
||||
self.assert_(self.results.groups[0] is g2)
|
||||
self.assert_(self.results.groups[1] is g1)
|
||||
self.results.sort_groups(2,False)
|
||||
self.assert_(self.results.groups[0] is g1)
|
||||
self.assert_(self.results.groups[1] is g2)
|
||||
|
||||
def test_set_groups_when_sorted(self):
|
||||
self.results.make_ref(self.objects[1]) #We want to make the 1024 sized object to go ref.
|
||||
self.results.sort_groups(2)
|
||||
objects,matches,groups = GetTestGroups()
|
||||
g1,g2 = groups
|
||||
g1.switch_ref(objects[1])
|
||||
self.results.groups = groups
|
||||
self.assert_(self.results.groups[0] is g2)
|
||||
self.assert_(self.results.groups[1] is g1)
|
||||
|
||||
def test_get_dupe_list(self):
|
||||
self.assertEqual([self.objects[1],self.objects[2],self.objects[4]],self.results.dupes)
|
||||
|
||||
def test_dupe_list_is_cached(self):
|
||||
self.assert_(self.results.dupes is self.results.dupes)
|
||||
|
||||
def test_dupe_list_cache_is_invalidated_when_needed(self):
|
||||
o1,o2,o3,o4,o5 = self.objects
|
||||
self.assertEqual([o2,o3,o5],self.results.dupes)
|
||||
self.results.make_ref(o2)
|
||||
self.assertEqual([o1,o3,o5],self.results.dupes)
|
||||
objects,matches,groups = GetTestGroups()
|
||||
o1,o2,o3,o4,o5 = objects
|
||||
self.results.groups = groups
|
||||
self.assertEqual([o2,o3,o5],self.results.dupes)
|
||||
|
||||
def test_dupe_list_sort(self):
|
||||
o1,o2,o3,o4,o5 = self.objects
|
||||
o1.size = 5
|
||||
o2.size = 4
|
||||
o3.size = 3
|
||||
o4.size = 2
|
||||
o5.size = 1
|
||||
self.results.sort_dupes(2)
|
||||
self.assertEqual([o5,o3,o2],self.results.dupes)
|
||||
self.results.sort_dupes(2,False)
|
||||
self.assertEqual([o2,o3,o5],self.results.dupes)
|
||||
|
||||
def test_dupe_list_remember_sort(self):
|
||||
o1,o2,o3,o4,o5 = self.objects
|
||||
o1.size = 5
|
||||
o2.size = 4
|
||||
o3.size = 3
|
||||
o4.size = 2
|
||||
o5.size = 1
|
||||
self.results.sort_dupes(2)
|
||||
self.results.make_ref(o2)
|
||||
self.assertEqual([o5,o3,o1],self.results.dupes)
|
||||
|
||||
def test_dupe_list_sort_delta_values(self):
|
||||
o1,o2,o3,o4,o5 = self.objects
|
||||
o1.size = 10
|
||||
o2.size = 2 #-8
|
||||
o3.size = 3 #-7
|
||||
o4.size = 20
|
||||
o5.size = 1 #-19
|
||||
self.results.sort_dupes(2,delta=True)
|
||||
self.assertEqual([o5,o2,o3],self.results.dupes)
|
||||
|
||||
def test_sort_empty_list(self):
|
||||
#There was an infinite loop when sorting an empty list.
|
||||
r = Results(data)
|
||||
r.sort_dupes(0)
|
||||
self.assertEqual([],r.dupes)
|
||||
|
||||
def test_dupe_list_update_on_remove_duplicates(self):
|
||||
o1,o2,o3,o4,o5 = self.objects
|
||||
self.assertEqual(3,len(self.results.dupes))
|
||||
self.results.remove_duplicates([o2])
|
||||
self.assertEqual(2,len(self.results.dupes))
|
||||
|
||||
|
||||
class TCResultsMarkings(TestCase):
|
||||
def setUp(self):
|
||||
self.results = Results(data)
|
||||
self.objects,self.matches,self.groups = GetTestGroups()
|
||||
self.results.groups = self.groups
|
||||
|
||||
def test_stat_line(self):
|
||||
self.assertEqual("0 / 3 (0.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
|
||||
self.results.mark(self.objects[1])
|
||||
self.assertEqual("1 / 3 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
|
||||
self.results.mark_invert()
|
||||
self.assertEqual("2 / 3 (2.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
|
||||
self.results.mark_invert()
|
||||
self.results.unmark(self.objects[1])
|
||||
self.results.mark(self.objects[2])
|
||||
self.results.mark(self.objects[4])
|
||||
self.assertEqual("2 / 3 (2.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
|
||||
self.results.mark(self.objects[0]) #this is a ref, it can't be counted
|
||||
self.assertEqual("2 / 3 (2.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
|
||||
self.results.groups = self.groups
|
||||
self.assertEqual("0 / 3 (0.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
|
||||
|
||||
def test_with_ref_duplicate(self):
|
||||
self.objects[1].is_ref = True
|
||||
self.results.groups = self.groups
|
||||
self.assert_(not self.results.mark(self.objects[1]))
|
||||
self.results.mark(self.objects[2])
|
||||
self.assertEqual("1 / 2 (1.00 B / 2.00 B) duplicates marked.",self.results.stat_line)
|
||||
|
||||
def test_perform_on_marked(self):
|
||||
def log_object(o):
|
||||
log.append(o)
|
||||
return True
|
||||
|
||||
log = []
|
||||
self.results.mark_all()
|
||||
self.results.perform_on_marked(log_object,False)
|
||||
self.assert_(self.objects[1] in log)
|
||||
self.assert_(self.objects[2] in log)
|
||||
self.assert_(self.objects[4] in log)
|
||||
self.assertEqual(3,len(log))
|
||||
log = []
|
||||
self.results.mark_none()
|
||||
self.results.mark(self.objects[4])
|
||||
self.results.perform_on_marked(log_object,True)
|
||||
self.assertEqual(1,len(log))
|
||||
self.assert_(self.objects[4] in log)
|
||||
self.assertEqual(1,len(self.results.groups))
|
||||
|
||||
def test_perform_on_marked_with_problems(self):
|
||||
def log_object(o):
|
||||
log.append(o)
|
||||
return o is not self.objects[1]
|
||||
|
||||
log = []
|
||||
self.results.mark_all()
|
||||
self.assert_(self.results.is_marked(self.objects[1]))
|
||||
self.assertEqual(1,self.results.perform_on_marked(log_object, True))
|
||||
self.assertEqual(3,len(log))
|
||||
self.assertEqual(1,len(self.results.groups))
|
||||
self.assertEqual(2,len(self.results.groups[0]))
|
||||
self.assert_(self.objects[1] in self.results.groups[0])
|
||||
self.assert_(not self.results.is_marked(self.objects[2]))
|
||||
self.assert_(self.results.is_marked(self.objects[1]))
|
||||
|
||||
def test_perform_on_marked_with_ref(self):
|
||||
def log_object(o):
|
||||
log.append(o)
|
||||
return True
|
||||
|
||||
log = []
|
||||
self.objects[0].is_ref = True
|
||||
self.objects[1].is_ref = True
|
||||
self.results.mark_all()
|
||||
self.results.perform_on_marked(log_object,True)
|
||||
self.assert_(self.objects[1] not in log)
|
||||
self.assert_(self.objects[2] in log)
|
||||
self.assert_(self.objects[4] in log)
|
||||
self.assertEqual(2,len(log))
|
||||
self.assertEqual(0,len(self.results.groups))
|
||||
|
||||
def test_perform_on_marked_remove_objects_only_at_the_end(self):
|
||||
def check_groups(o):
|
||||
self.assertEqual(3,len(g1))
|
||||
self.assertEqual(2,len(g2))
|
||||
return True
|
||||
|
||||
g1,g2 = self.results.groups
|
||||
self.results.mark_all()
|
||||
self.results.perform_on_marked(check_groups,True)
|
||||
self.assertEqual(0,len(g1))
|
||||
self.assertEqual(0,len(g2))
|
||||
self.assertEqual(0,len(self.results.groups))
|
||||
|
||||
def test_remove_duplicates(self):
|
||||
g1 = self.results.groups[0]
|
||||
g2 = self.results.groups[1]
|
||||
self.results.mark(g1.dupes[0])
|
||||
self.assertEqual("1 / 3 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
|
||||
self.results.remove_duplicates([g1.dupes[1]])
|
||||
self.assertEqual("1 / 2 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
|
||||
self.results.remove_duplicates([g1.dupes[0]])
|
||||
self.assertEqual("0 / 1 (0.00 B / 1.00 B) duplicates marked.",self.results.stat_line)
|
||||
|
||||
def test_make_ref(self):
|
||||
g = self.results.groups[0]
|
||||
d = g.dupes[0]
|
||||
self.results.mark(d)
|
||||
self.assertEqual("1 / 3 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
|
||||
self.results.make_ref(d)
|
||||
self.assertEqual("0 / 3 (0.00 B / 3.00 B) duplicates marked.",self.results.stat_line)
|
||||
self.results.make_ref(d)
|
||||
self.assertEqual("0 / 3 (0.00 B / 3.00 B) duplicates marked.",self.results.stat_line)
|
||||
|
||||
def test_SaveXML(self):
|
||||
self.results.mark(self.objects[1])
|
||||
self.results.mark_invert()
|
||||
f = StringIO.StringIO()
|
||||
self.results.save_to_xml(f)
|
||||
f.seek(0)
|
||||
doc = xml.dom.minidom.parse(f)
|
||||
root = doc.documentElement
|
||||
g1,g2 = root.getElementsByTagName('group')
|
||||
d1,d2,d3 = g1.getElementsByTagName('file')
|
||||
self.assertEqual('n',d1.getAttributeNode('marked').nodeValue)
|
||||
self.assertEqual('n',d2.getAttributeNode('marked').nodeValue)
|
||||
self.assertEqual('y',d3.getAttributeNode('marked').nodeValue)
|
||||
d1,d2 = g2.getElementsByTagName('file')
|
||||
self.assertEqual('n',d1.getAttributeNode('marked').nodeValue)
|
||||
self.assertEqual('y',d2.getAttributeNode('marked').nodeValue)
|
||||
|
||||
def test_LoadXML(self):
|
||||
def get_file(path):
|
||||
return [f for f in self.objects if str(f.path) == path][0]
|
||||
|
||||
self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
|
||||
self.results.mark(self.objects[1])
|
||||
self.results.mark_invert()
|
||||
f = StringIO.StringIO()
|
||||
self.results.save_to_xml(f)
|
||||
f.seek(0)
|
||||
r = Results(data)
|
||||
r.load_from_xml(f,get_file)
|
||||
self.assert_(not r.is_marked(self.objects[0]))
|
||||
self.assert_(not r.is_marked(self.objects[1]))
|
||||
self.assert_(r.is_marked(self.objects[2]))
|
||||
self.assert_(not r.is_marked(self.objects[3]))
|
||||
self.assert_(r.is_marked(self.objects[4]))
|
||||
|
||||
|
||||
class TCResultsXML(TestCase):
|
||||
def setUp(self):
|
||||
self.results = Results(data)
|
||||
self.objects, self.matches, self.groups = GetTestGroups()
|
||||
self.results.groups = self.groups
|
||||
|
||||
def get_file(self, path): # use this as a callback for load_from_xml
|
||||
return [o for o in self.objects if o.path == path][0]
|
||||
|
||||
def test_save_to_xml(self):
|
||||
self.objects[0].is_ref = True
|
||||
self.objects[0].words = [['foo','bar']]
|
||||
f = StringIO.StringIO()
|
||||
self.results.save_to_xml(f)
|
||||
f.seek(0)
|
||||
doc = xml.dom.minidom.parse(f)
|
||||
root = doc.documentElement
|
||||
self.assertEqual('results',root.nodeName)
|
||||
children = [c for c in root.childNodes if c.localName]
|
||||
self.assertEqual(2,len(children))
|
||||
self.assertEqual(2,len([c for c in children if c.nodeName == 'group']))
|
||||
g1,g2 = children
|
||||
children = [c for c in g1.childNodes if c.localName]
|
||||
self.assertEqual(6,len(children))
|
||||
self.assertEqual(3,len([c for c in children if c.nodeName == 'file']))
|
||||
self.assertEqual(3,len([c for c in children if c.nodeName == 'match']))
|
||||
d1,d2,d3 = [c for c in children if c.nodeName == 'file']
|
||||
self.assertEqual(op.join('basepath','foo bar'),d1.getAttributeNode('path').nodeValue)
|
||||
self.assertEqual(op.join('basepath','bar bleh'),d2.getAttributeNode('path').nodeValue)
|
||||
self.assertEqual(op.join('basepath','foo bleh'),d3.getAttributeNode('path').nodeValue)
|
||||
self.assertEqual('y',d1.getAttributeNode('is_ref').nodeValue)
|
||||
self.assertEqual('n',d2.getAttributeNode('is_ref').nodeValue)
|
||||
self.assertEqual('n',d3.getAttributeNode('is_ref').nodeValue)
|
||||
self.assertEqual('foo,bar',d1.getAttributeNode('words').nodeValue)
|
||||
self.assertEqual('bar,bleh',d2.getAttributeNode('words').nodeValue)
|
||||
self.assertEqual('foo,bleh',d3.getAttributeNode('words').nodeValue)
|
||||
children = [c for c in g2.childNodes if c.localName]
|
||||
self.assertEqual(3,len(children))
|
||||
self.assertEqual(2,len([c for c in children if c.nodeName == 'file']))
|
||||
self.assertEqual(1,len([c for c in children if c.nodeName == 'match']))
|
||||
d1,d2 = [c for c in children if c.nodeName == 'file']
|
||||
self.assertEqual(op.join('basepath','ibabtu'),d1.getAttributeNode('path').nodeValue)
|
||||
self.assertEqual(op.join('basepath','ibabtu'),d2.getAttributeNode('path').nodeValue)
|
||||
self.assertEqual('n',d1.getAttributeNode('is_ref').nodeValue)
|
||||
self.assertEqual('n',d2.getAttributeNode('is_ref').nodeValue)
|
||||
self.assertEqual('ibabtu',d1.getAttributeNode('words').nodeValue)
|
||||
self.assertEqual('ibabtu',d2.getAttributeNode('words').nodeValue)
|
||||
|
||||
def test_save_to_xml_with_columns(self):
|
||||
class FakeDataModule:
|
||||
def GetDisplayInfo(self,dupe,group):
|
||||
return [str(dupe.size),dupe.foo.upper()]
|
||||
|
||||
for i,object in enumerate(self.objects):
|
||||
object.size = i
|
||||
object.foo = u'bar\u00e9'
|
||||
f = StringIO.StringIO()
|
||||
self.results.data = FakeDataModule()
|
||||
self.results.save_to_xml(f,True)
|
||||
f.seek(0)
|
||||
doc = xml.dom.minidom.parse(f)
|
||||
root = doc.documentElement
|
||||
g1,g2 = root.getElementsByTagName('group')
|
||||
d1,d2,d3 = g1.getElementsByTagName('file')
|
||||
d4,d5 = g2.getElementsByTagName('file')
|
||||
self.assertEqual('0',d1.getElementsByTagName('data')[0].getAttribute('value'))
|
||||
self.assertEqual(u'BAR\u00c9',d1.getElementsByTagName('data')[1].getAttribute('value')) #\u00c9 is upper of \u00e9
|
||||
self.assertEqual('1',d2.getElementsByTagName('data')[0].getAttribute('value'))
|
||||
self.assertEqual('2',d3.getElementsByTagName('data')[0].getAttribute('value'))
|
||||
self.assertEqual('3',d4.getElementsByTagName('data')[0].getAttribute('value'))
|
||||
self.assertEqual('4',d5.getElementsByTagName('data')[0].getAttribute('value'))
|
||||
|
||||
def test_LoadXML(self):
|
||||
def get_file(path):
|
||||
return [f for f in self.objects if str(f.path) == path][0]
|
||||
|
||||
self.objects[0].is_ref = True
|
||||
self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
|
||||
f = StringIO.StringIO()
|
||||
self.results.save_to_xml(f)
|
||||
f.seek(0)
|
||||
r = Results(data)
|
||||
r.load_from_xml(f,get_file)
|
||||
self.assertEqual(2,len(r.groups))
|
||||
g1,g2 = r.groups
|
||||
self.assertEqual(3,len(g1))
|
||||
self.assert_(g1[0].is_ref)
|
||||
self.assert_(not g1[1].is_ref)
|
||||
self.assert_(not g1[2].is_ref)
|
||||
self.assert_(g1[0] is self.objects[0])
|
||||
self.assert_(g1[1] is self.objects[1])
|
||||
self.assert_(g1[2] is self.objects[2])
|
||||
self.assertEqual(['foo','bar'],g1[0].words)
|
||||
self.assertEqual(['bar','bleh'],g1[1].words)
|
||||
self.assertEqual(['foo','bleh'],g1[2].words)
|
||||
self.assertEqual(2,len(g2))
|
||||
self.assert_(not g2[0].is_ref)
|
||||
self.assert_(not g2[1].is_ref)
|
||||
self.assert_(g2[0] is self.objects[3])
|
||||
self.assert_(g2[1] is self.objects[4])
|
||||
self.assertEqual(['ibabtu'],g2[0].words)
|
||||
self.assertEqual(['ibabtu'],g2[1].words)
|
||||
|
||||
def test_LoadXML_with_filename(self):
|
||||
def get_file(path):
|
||||
return [f for f in self.objects if str(f.path) == path][0]
|
||||
|
||||
filename = op.join(self.tmpdir(), 'dupeguru_results.xml')
|
||||
self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
|
||||
self.results.save_to_xml(filename)
|
||||
r = Results(data)
|
||||
r.load_from_xml(filename,get_file)
|
||||
self.assertEqual(2,len(r.groups))
|
||||
|
||||
def test_LoadXML_with_some_files_that_dont_exist_anymore(self):
|
||||
def get_file(path):
|
||||
if path.endswith('ibabtu 2'):
|
||||
return None
|
||||
return [f for f in self.objects if str(f.path) == path][0]
|
||||
|
||||
self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
|
||||
f = StringIO.StringIO()
|
||||
self.results.save_to_xml(f)
|
||||
f.seek(0)
|
||||
r = Results(data)
|
||||
r.load_from_xml(f,get_file)
|
||||
self.assertEqual(1,len(r.groups))
|
||||
self.assertEqual(3,len(r.groups[0]))
|
||||
|
||||
def test_LoadXML_missing_attributes_and_bogus_elements(self):
|
||||
def get_file(path):
|
||||
return [f for f in self.objects if str(f.path) == path][0]
|
||||
|
||||
doc = xml.dom.minidom.Document()
|
||||
root = doc.appendChild(doc.createElement('foobar')) #The root element shouldn't matter, really.
|
||||
group_node = root.appendChild(doc.createElement('group'))
|
||||
dupe_node = group_node.appendChild(doc.createElement('file')) #Perfectly correct file
|
||||
dupe_node.setAttribute('path',op.join('basepath','foo bar'))
|
||||
dupe_node.setAttribute('is_ref','y')
|
||||
dupe_node.setAttribute('words','foo,bar')
|
||||
dupe_node = group_node.appendChild(doc.createElement('file')) #is_ref missing, default to 'n'
|
||||
dupe_node.setAttribute('path',op.join('basepath','foo bleh'))
|
||||
dupe_node.setAttribute('words','foo,bleh')
|
||||
dupe_node = group_node.appendChild(doc.createElement('file')) #words are missing, invalid.
|
||||
dupe_node.setAttribute('path',op.join('basepath','bar bleh'))
|
||||
dupe_node = group_node.appendChild(doc.createElement('file')) #path is missing, invalid.
|
||||
dupe_node.setAttribute('words','foo,bleh')
|
||||
dupe_node = group_node.appendChild(doc.createElement('foobar')) #Invalid element name
|
||||
dupe_node.setAttribute('path',op.join('basepath','bar bleh'))
|
||||
dupe_node.setAttribute('is_ref','y')
|
||||
dupe_node.setAttribute('words','bar,bleh')
|
||||
match_node = group_node.appendChild(doc.createElement('match')) # match pointing to a bad index
|
||||
match_node.setAttribute('first', '42')
|
||||
match_node.setAttribute('second', '45')
|
||||
match_node = group_node.appendChild(doc.createElement('match')) # match with missing attrs
|
||||
match_node = group_node.appendChild(doc.createElement('match')) # match with non-int values
|
||||
match_node.setAttribute('first', 'foo')
|
||||
match_node.setAttribute('second', 'bar')
|
||||
match_node.setAttribute('percentage', 'baz')
|
||||
group_node = root.appendChild(doc.createElement('foobar')) #invalid group
|
||||
group_node = root.appendChild(doc.createElement('group')) #empty group
|
||||
f = StringIO.StringIO()
|
||||
doc.writexml(f,'\t','\t','\n',encoding='utf-8')
|
||||
f.seek(0)
|
||||
r = Results(data)
|
||||
r.load_from_xml(f,get_file)
|
||||
self.assertEqual(1,len(r.groups))
|
||||
self.assertEqual(2,len(r.groups[0]))
|
||||
|
||||
def test_xml_non_ascii(self):
|
||||
def get_file(path):
|
||||
if path == op.join('basepath',u'\xe9foo bar'):
|
||||
return objects[0]
|
||||
if path == op.join('basepath',u'bar bleh'):
|
||||
return objects[1]
|
||||
|
||||
objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
|
||||
matches = engine.MatchFactory().getmatches(objects) #we should have 1 match
|
||||
groups = engine.get_groups(matches) #We should have 1 group
|
||||
for g in groups:
|
||||
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
|
||||
results = Results(data)
|
||||
results.groups = groups
|
||||
f = StringIO.StringIO()
|
||||
results.save_to_xml(f)
|
||||
f.seek(0)
|
||||
r = Results(data)
|
||||
r.load_from_xml(f,get_file)
|
||||
g = r.groups[0]
|
||||
self.assertEqual(u"\xe9foo bar",g[0].name)
|
||||
self.assertEqual(['efoo','bar'],g[0].words)
|
||||
|
||||
def test_load_invalid_xml(self):
|
||||
f = StringIO.StringIO()
|
||||
f.write('<this is invalid')
|
||||
f.seek(0)
|
||||
r = Results(data)
|
||||
r.load_from_xml(f,None)
|
||||
self.assertEqual(0,len(r.groups))
|
||||
|
||||
def test_load_non_existant_xml(self):
|
||||
r = Results(data)
|
||||
try:
|
||||
r.load_from_xml('does_not_exist.xml', None)
|
||||
except IOError:
|
||||
self.fail()
|
||||
self.assertEqual(0,len(r.groups))
|
||||
|
||||
def test_remember_match_percentage(self):
|
||||
group = self.groups[0]
|
||||
d1, d2, d3 = group
|
||||
fake_matches = set()
|
||||
fake_matches.add(engine.Match(d1, d2, 42))
|
||||
fake_matches.add(engine.Match(d1, d3, 43))
|
||||
fake_matches.add(engine.Match(d2, d3, 46))
|
||||
group.matches = fake_matches
|
||||
f = StringIO.StringIO()
|
||||
results = self.results
|
||||
results.save_to_xml(f)
|
||||
f.seek(0)
|
||||
results = Results(data)
|
||||
results.load_from_xml(f, self.get_file)
|
||||
group = results.groups[0]
|
||||
d1, d2, d3 = group
|
||||
match = group.get_match_of(d2) #d1 - d2
|
||||
self.assertEqual(42, match[2])
|
||||
match = group.get_match_of(d3) #d1 - d3
|
||||
self.assertEqual(43, match[2])
|
||||
group.switch_ref(d2)
|
||||
match = group.get_match_of(d3) #d2 - d3
|
||||
self.assertEqual(46, match[2])
|
||||
|
||||
def test_save_and_load(self):
|
||||
# previously, when reloading matches, they wouldn't be reloaded as namedtuples
|
||||
f = StringIO.StringIO()
|
||||
self.results.save_to_xml(f)
|
||||
f.seek(0)
|
||||
self.results.load_from_xml(f, self.get_file)
|
||||
first(self.results.groups[0].matches).percentage
|
||||
|
||||
|
||||
class TCResultsFilter(TestCase):
|
||||
def setUp(self):
|
||||
self.results = Results(data)
|
||||
self.objects, self.matches, self.groups = GetTestGroups()
|
||||
self.results.groups = self.groups
|
||||
self.results.apply_filter(r'foo')
|
||||
|
||||
def test_groups(self):
|
||||
self.assertEqual(1, len(self.results.groups))
|
||||
self.assert_(self.results.groups[0] is self.groups[0])
|
||||
|
||||
def test_dupes(self):
|
||||
# There are 2 objects matching. The first one is ref. Only the 3rd one is supposed to be in dupes.
|
||||
self.assertEqual(1, len(self.results.dupes))
|
||||
self.assert_(self.results.dupes[0] is self.objects[2])
|
||||
|
||||
def test_cancel_filter(self):
|
||||
self.results.apply_filter(None)
|
||||
self.assertEqual(3, len(self.results.dupes))
|
||||
self.assertEqual(2, len(self.results.groups))
|
||||
|
||||
def test_dupes_reconstructed_filtered(self):
|
||||
# make_ref resets self.__dupes to None. When it's reconstructed, we want it filtered
|
||||
dupe = self.results.dupes[0] #3rd object
|
||||
self.results.make_ref(dupe)
|
||||
self.assertEqual(1, len(self.results.dupes))
|
||||
self.assert_(self.results.dupes[0] is self.objects[0])
|
||||
|
||||
def test_include_ref_dupes_in_filter(self):
|
||||
# When only the ref of a group matches the filter, the group is still included in the results
|
||||
self.results.apply_filter(None)
|
||||
self.results.apply_filter(r'foo bar')
|
||||
self.assertEqual(1, len(self.results.groups))
|
||||
self.assertEqual(0, len(self.results.dupes))
|
||||
|
||||
def test_filters_build_on_one_another(self):
|
||||
self.results.apply_filter(r'bar')
|
||||
self.assertEqual(1, len(self.results.groups))
|
||||
self.assertEqual(0, len(self.results.dupes))
|
||||
|
||||
def test_stat_line(self):
|
||||
expected = '0 / 1 (0.00 B / 1.00 B) duplicates marked. filter: foo'
|
||||
self.assertEqual(expected, self.results.stat_line)
|
||||
self.results.apply_filter(r'bar')
|
||||
expected = '0 / 0 (0.00 B / 0.00 B) duplicates marked. filter: foo --> bar'
|
||||
self.assertEqual(expected, self.results.stat_line)
|
||||
self.results.apply_filter(None)
|
||||
expected = '0 / 3 (0.00 B / 1.01 KB) duplicates marked.'
|
||||
self.assertEqual(expected, self.results.stat_line)
|
||||
|
||||
def test_mark_count_is_filtered_as_well(self):
|
||||
self.results.apply_filter(None)
|
||||
# We don't want to perform mark_all() because we want the mark list to contain objects
|
||||
for dupe in self.results.dupes:
|
||||
self.results.mark(dupe)
|
||||
self.results.apply_filter(r'foo')
|
||||
expected = '1 / 1 (1.00 B / 1.00 B) duplicates marked. filter: foo'
|
||||
self.assertEqual(expected, self.results.stat_line)
|
||||
|
||||
def test_sort_groups(self):
|
||||
self.results.apply_filter(None)
|
||||
self.results.make_ref(self.objects[1]) # to have the 1024 b object as ref
|
||||
g1,g2 = self.groups
|
||||
self.results.apply_filter('a') # Matches both groups
|
||||
self.results.sort_groups(2) #2 is the key for size
|
||||
self.assert_(self.results.groups[0] is g2)
|
||||
self.assert_(self.results.groups[1] is g1)
|
||||
self.results.apply_filter(None)
|
||||
self.assert_(self.results.groups[0] is g2)
|
||||
self.assert_(self.results.groups[1] is g1)
|
||||
self.results.sort_groups(2, False)
|
||||
self.results.apply_filter('a')
|
||||
self.assert_(self.results.groups[1] is g2)
|
||||
self.assert_(self.results.groups[0] is g1)
|
||||
|
||||
def test_set_group(self):
|
||||
#We want the new group to be filtered
|
||||
self.objects, self.matches, self.groups = GetTestGroups()
|
||||
self.results.groups = self.groups
|
||||
self.assertEqual(1, len(self.results.groups))
|
||||
self.assert_(self.results.groups[0] is self.groups[0])
|
||||
|
||||
def test_load_cancels_filter(self):
|
||||
def get_file(path):
|
||||
return [f for f in self.objects if str(f.path) == path][0]
|
||||
|
||||
filename = op.join(self.tmpdir(), 'dupeguru_results.xml')
|
||||
self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
|
||||
self.results.save_to_xml(filename)
|
||||
r = Results(data)
|
||||
r.apply_filter('foo')
|
||||
r.load_from_xml(filename,get_file)
|
||||
self.assertEqual(2,len(r.groups))
|
||||
|
||||
def test_remove_dupe(self):
|
||||
self.results.remove_duplicates([self.results.dupes[0]])
|
||||
self.results.apply_filter(None)
|
||||
self.assertEqual(2,len(self.results.groups))
|
||||
self.assertEqual(2,len(self.results.dupes))
|
||||
self.results.apply_filter('ibabtu')
|
||||
self.results.remove_duplicates([self.results.dupes[0]])
|
||||
self.results.apply_filter(None)
|
||||
self.assertEqual(1,len(self.results.groups))
|
||||
self.assertEqual(1,len(self.results.dupes))
|
||||
|
||||
def test_filter_is_case_insensitive(self):
|
||||
self.results.apply_filter(None)
|
||||
self.results.apply_filter('FOO')
|
||||
self.assertEqual(1, len(self.results.dupes))
|
||||
|
||||
def test_make_ref_on_filtered_out_doesnt_mess_stats(self):
|
||||
# When filtered, a group containing filtered out dupes will display them as being reference.
|
||||
# When calling make_ref on such a dupe, the total size and dupe count stats get messed up
|
||||
# because they are *not* counted in the stats in the first place.
|
||||
g1, g2 = self.groups
|
||||
bar_bleh = g1[1] # The "bar bleh" dupe is filtered out
|
||||
self.results.make_ref(bar_bleh)
|
||||
# Now the stats should display *2* markable dupes (instead of 1)
|
||||
expected = '0 / 2 (0.00 B / 2.00 B) duplicates marked. filter: foo'
|
||||
self.assertEqual(expected, self.results.stat_line)
|
||||
self.results.apply_filter(None) # Now let's make sure our unfiltered results aren't messed up
|
||||
expected = '0 / 3 (0.00 B / 3.00 B) duplicates marked.'
|
||||
self.assertEqual(expected, self.results.stat_line)
|
||||
|
||||
|
||||
class TCResultsRefFile(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.results = Results(data)
|
||||
self.objects, self.matches, self.groups = GetTestGroups()
|
||||
self.objects[0].is_ref = True
|
||||
self.objects[1].is_ref = True
|
||||
self.results.groups = self.groups
|
||||
|
||||
def test_stat_line(self):
|
||||
expected = '0 / 2 (0.00 B / 2.00 B) duplicates marked.'
|
||||
self.assertEqual(expected, self.results.stat_line)
|
||||
|
||||
def test_make_ref(self):
|
||||
d = self.results.groups[0].dupes[1] #non-ref
|
||||
r = self.results.groups[0].ref
|
||||
self.results.make_ref(d)
|
||||
expected = '0 / 1 (0.00 B / 1.00 B) duplicates marked.'
|
||||
self.assertEqual(expected, self.results.stat_line)
|
||||
self.results.make_ref(r)
|
||||
expected = '0 / 2 (0.00 B / 2.00 B) duplicates marked.'
|
||||
self.assertEqual(expected, self.results.stat_line)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
131
py/scanner.py
Normal file
131
py/scanner.py
Normal file
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: dupeguru.scanner
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/03/03
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import logging
|
||||
|
||||
from ignore import IgnoreList
|
||||
|
||||
from hsutil import job
|
||||
from hsutil.misc import dedupe
|
||||
from hsutil.str import get_file_ext, rem_file_ext
|
||||
|
||||
from . import engine
|
||||
|
||||
(SCAN_TYPE_FILENAME,
|
||||
SCAN_TYPE_FIELDS,
|
||||
SCAN_TYPE_FIELDS_NO_ORDER,
|
||||
SCAN_TYPE_TAG,
|
||||
SCAN_TYPE_TAG_WITH_ALBUM, # Obsolete
|
||||
SCAN_TYPE_CONTENT,
|
||||
SCAN_TYPE_CONTENT_AUDIO) = range(7)
|
||||
|
||||
SCANNABLE_TAGS = ['track', 'artist', 'album', 'title', 'genre', 'year']
|
||||
|
||||
class Scanner(object):
|
||||
def __init__(self):
|
||||
self.ignore_list = IgnoreList()
|
||||
self.discarded_file_count = 0
|
||||
|
||||
def _getmatches(self, files, j):
|
||||
j = j.start_subjob(2)
|
||||
mf = engine.MatchFactory()
|
||||
if self.scan_type != SCAN_TYPE_CONTENT:
|
||||
mf.match_similar_words = self.match_similar_words
|
||||
mf.weight_words = self.word_weighting
|
||||
mf.min_match_percentage = self.min_match_percentage
|
||||
if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
|
||||
self.scan_type = SCAN_TYPE_FIELDS
|
||||
mf.no_field_order = True
|
||||
if self.scan_type == SCAN_TYPE_TAG_WITH_ALBUM:
|
||||
self.scan_type = SCAN_TYPE_TAG
|
||||
self.scanned_tags = set(['artist', 'album', 'title'])
|
||||
func = {
|
||||
SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
|
||||
SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
|
||||
SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
|
||||
SCAN_TYPE_CONTENT: lambda f: [str(f.size)],
|
||||
SCAN_TYPE_CONTENT_AUDIO: lambda f: [str(f.audiosize)]
|
||||
}[self.scan_type]
|
||||
for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
|
||||
f.words = func(f)
|
||||
return mf.getmatches(files, j)
|
||||
|
||||
@staticmethod
|
||||
def _key_func(dupe):
|
||||
return (not dupe.is_ref, -dupe.size)
|
||||
|
||||
@staticmethod
|
||||
def _tie_breaker(ref, dupe):
|
||||
refname = rem_file_ext(ref.name).lower()
|
||||
dupename = rem_file_ext(dupe.name).lower()
|
||||
if 'copy' in refname and 'copy' not in dupename:
|
||||
return True
|
||||
if refname.startswith(dupename) and (refname[len(dupename):].strip().isdigit()):
|
||||
return True
|
||||
return len(dupe.path) > len(ref.path)
|
||||
|
||||
def GetDupeGroups(self, files, j=job.nulljob):
|
||||
j = j.start_subjob([8, 2])
|
||||
for f in [f for f in files if not hasattr(f, 'is_ref')]:
|
||||
f.is_ref = False
|
||||
if self.size_threshold:
|
||||
files = [f for f in files if f.size >= self.size_threshold]
|
||||
logging.info('Getting matches')
|
||||
if self.match_factory is None:
|
||||
matches = self._getmatches(files, j)
|
||||
else:
|
||||
matches = self.match_factory.getmatches(files, j)
|
||||
logging.info('Found %d matches' % len(matches))
|
||||
if not self.mix_file_kind:
|
||||
j.set_progress(100, 'Removing false matches')
|
||||
matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
|
||||
if self.ignore_list:
|
||||
j = j.start_subjob(2)
|
||||
iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
|
||||
matches = [m for m in iter_matches
|
||||
if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
|
||||
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
|
||||
if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
|
||||
md5attrname = 'md5partial' if self.scan_type == SCAN_TYPE_CONTENT_AUDIO else 'md5'
|
||||
md5 = lambda f: getattr(f, md5attrname)
|
||||
j = j.start_subjob(2)
|
||||
for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
|
||||
md5(matched_file)
|
||||
j.set_progress(100, 'Removing false matches')
|
||||
matches = [m for m in matches if md5(m.first) == md5(m.second)]
|
||||
words_for_content = ['--'] # We compared md5. No words were involved.
|
||||
for m in matches:
|
||||
m.first.words = words_for_content
|
||||
m.second.words = words_for_content
|
||||
logging.info('Grouping matches')
|
||||
groups = engine.get_groups(matches, j)
|
||||
groups = [g for g in groups if any(not f.is_ref for f in g)]
|
||||
logging.info('Created %d groups' % len(groups))
|
||||
j.set_progress(100, 'Doing group prioritization')
|
||||
for g in groups:
|
||||
g.prioritize(self._key_func, self._tie_breaker)
|
||||
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
|
||||
self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
|
||||
return groups
|
||||
|
||||
match_factory = None
|
||||
match_similar_words = False
|
||||
min_match_percentage = 80
|
||||
mix_file_kind = True
|
||||
scan_type = SCAN_TYPE_FILENAME
|
||||
scanned_tags = set(['artist', 'title'])
|
||||
size_threshold = 0
|
||||
word_weighting = False
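# Illustrative usage (a sketch; `files` stands for any list of scannable file objects):
#     s = Scanner()
#     s.scan_type = SCAN_TYPE_FILENAME
#     s.min_match_percentage = 50
#     groups = s.GetDupeGroups(files)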
|
||||
|
||||
class ScannerME(Scanner): # Scanner for Music Edition
|
||||
@staticmethod
|
||||
def _key_func(dupe):
|
||||
return (not dupe.is_ref, -dupe.bitrate, -dupe.size)
|
||||
|
||||
468
py/scanner_test.py
Normal file
468
py/scanner_test.py
Normal file
@@ -0,0 +1,468 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: dupeguru.tests.scanner
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/03/03
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import unittest
|
||||
|
||||
from hsutil import job
|
||||
from hsutil.path import Path
|
||||
from hsutil.testcase import TestCase
|
||||
|
||||
from .engine import getwords, Match
|
||||
from .ignore import IgnoreList
|
||||
from .scanner import *
|
||||
|
||||
class NamedObject(object):
|
||||
def __init__(self, name="foobar", size=1):
|
||||
self.name = name
|
||||
self.size = size
|
||||
self.path = Path('')
|
||||
self.words = getwords(name)
|
||||
|
||||
|
||||
no = NamedObject
|
||||
|
||||
class TCScanner(TestCase):
|
||||
def test_empty(self):
|
||||
s = Scanner()
|
||||
r = s.GetDupeGroups([])
|
||||
self.assertEqual([],r)
|
||||
|
||||
def test_default_settings(self):
|
||||
s = Scanner()
|
||||
self.assertEqual(80,s.min_match_percentage)
|
||||
self.assertEqual(SCAN_TYPE_FILENAME,s.scan_type)
|
||||
self.assertEqual(True,s.mix_file_kind)
|
||||
self.assertEqual(False,s.word_weighting)
|
||||
self.assertEqual(False,s.match_similar_words)
|
||||
self.assert_(isinstance(s.ignore_list,IgnoreList))
|
||||
|
||||
def test_simple_with_default_settings(self):
|
||||
s = Scanner()
|
||||
f = [no('foo bar'),no('foo bar'),no('foo bleh')]
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(1,len(r))
|
||||
g = r[0]
|
||||
#'foo bleh' cannot be in the group because the default min match % is 80
|
||||
self.assertEqual(2,len(g))
|
||||
self.assert_(g.ref in f[:2])
|
||||
self.assert_(g.dupes[0] in f[:2])
|
||||
|
||||
def test_simple_with_lower_min_match(self):
|
||||
s = Scanner()
|
||||
s.min_match_percentage = 50
|
||||
f = [no('foo bar'),no('foo bar'),no('foo bleh')]
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(1,len(r))
|
||||
g = r[0]
|
||||
self.assertEqual(3,len(g))
|
||||
|
||||
def test_trim_all_ref_groups(self):
|
||||
s = Scanner()
|
||||
f = [no('foo'),no('foo'),no('bar'),no('bar')]
|
||||
f[2].is_ref = True
|
||||
f[3].is_ref = True
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(1,len(r))
|
||||
|
||||
def test_priorize(self):
|
||||
s = Scanner()
|
||||
f = [no('foo'),no('foo'),no('bar'),no('bar')]
|
||||
f[1].size = 2
|
||||
f[2].size = 3
|
||||
f[3].is_ref = True
|
||||
r = s.GetDupeGroups(f)
|
||||
g1,g2 = r
|
||||
self.assert_(f[1] in (g1.ref,g2.ref))
|
||||
self.assert_(f[0] in (g1.dupes[0],g2.dupes[0]))
|
||||
self.assert_(f[3] in (g1.ref,g2.ref))
|
||||
self.assert_(f[2] in (g1.dupes[0],g2.dupes[0]))
|
||||
|
||||
def test_content_scan(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_CONTENT
|
||||
f = [no('foo'), no('bar'), no('bleh')]
|
||||
f[0].md5 = 'foobar'
|
||||
f[1].md5 = 'foobar'
|
||||
f[2].md5 = 'bleh'
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(len(r), 1)
|
||||
self.assertEqual(len(r[0]), 2)
|
||||
self.assertEqual(s.discarded_file_count, 0) # don't count the different md5 as discarded!
|
||||
|
||||
def test_content_scan_compare_sizes_first(self):
|
||||
class MyFile(no):
|
||||
def get_md5(file):
|
||||
self.fail()
|
||||
md5 = property(get_md5)
|
||||
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_CONTENT
|
||||
f = [MyFile('foo',1),MyFile('bar',2)]
|
||||
self.assertEqual(0,len(s.GetDupeGroups(f)))
|
||||
|
||||
def test_min_match_perc_doesnt_matter_for_content_scan(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_CONTENT
|
||||
f = [no('foo'),no('bar'),no('bleh')]
|
||||
f[0].md5 = 'foobar'
|
||||
f[1].md5 = 'foobar'
|
||||
f[2].md5 = 'bleh'
|
||||
s.min_match_percentage = 101
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(1,len(r))
|
||||
self.assertEqual(2,len(r[0]))
|
||||
s.min_match_percentage = 0
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(1,len(r))
|
||||
self.assertEqual(2,len(r[0]))
|
||||
|
||||
def test_content_scan_puts_md5_in_words_at_the_end(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_CONTENT
|
||||
f = [no('foo'),no('bar')]
|
||||
f[0].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
|
||||
f[1].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
|
||||
r = s.GetDupeGroups(f)
|
||||
g = r[0]
|
||||
self.assertEqual(['--'],g.ref.words)
|
||||
self.assertEqual(['--'],g.dupes[0].words)
|
||||
|
||||
def test_extension_is_not_counted_in_filename_scan(self):
|
||||
s = Scanner()
|
||||
s.min_match_percentage = 100
|
||||
f = [no('foo.bar'),no('foo.bleh')]
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(1,len(r))
|
||||
self.assertEqual(2,len(r[0]))
|
||||
|
||||
def test_job(self):
|
||||
def do_progress(progress,desc=''):
|
||||
log.append(progress)
|
||||
return True
|
||||
s = Scanner()
|
||||
log = []
|
||||
f = [no('foo bar'),no('foo bar'),no('foo bleh')]
|
||||
r = s.GetDupeGroups(f, job.Job(1,do_progress))
|
||||
self.assertEqual(0,log[0])
|
||||
self.assertEqual(100,log[-1])
|
||||
|
||||
def test_mix_file_kind(self):
|
||||
s = Scanner()
|
||||
s.mix_file_kind = False
|
||||
f = [no('foo.1'),no('foo.2')]
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(0,len(r))
|
||||
|
||||
def test_word_weighting(self):
|
||||
s = Scanner()
|
||||
s.min_match_percentage = 75
|
||||
s.word_weighting = True
|
||||
f = [no('foo bar'),no('foo bar bleh')]
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(1,len(r))
|
||||
g = r[0]
|
||||
m = g.get_match_of(g.dupes[0])
|
||||
self.assertEqual(75,m.percentage) # 16 letters, 12 matching
|
||||
|
||||
def test_similar_words(self):
|
||||
s = Scanner()
|
||||
s.match_similar_words = True
|
||||
f = [no('The White Stripes'),no('The Whites Stripe'),no('Limp Bizkit'),no('Limp Bizkitt')]
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(2,len(r))
|
||||
|
||||
def test_fields(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_FIELDS
|
||||
f = [no('The White Stripes - Little Ghost'),no('The White Stripes - Little Acorn')]
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(0,len(r))
|
||||
|
||||
def test_fields_no_order(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
|
||||
f = [no('The White Stripes - Little Ghost'),no('Little Ghost - The White Stripes')]
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(1,len(r))
|
||||
|
||||
def test_tag_scan(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_TAG
|
||||
o1 = no('foo')
|
||||
o2 = no('bar')
|
||||
o1.artist = 'The White Stripes'
|
||||
o1.title = 'The Air Near My Fingers'
|
||||
o2.artist = 'The White Stripes'
|
||||
o2.title = 'The Air Near My Fingers'
|
||||
r = s.GetDupeGroups([o1,o2])
|
||||
self.assertEqual(1,len(r))
|
||||
|
||||
def test_tag_with_album_scan(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_TAG_WITH_ALBUM
|
||||
o1 = no('foo')
|
||||
o2 = no('bar')
|
||||
o3 = no('bleh')
|
||||
o1.artist = 'The White Stripes'
|
||||
o1.title = 'The Air Near My Fingers'
|
||||
o1.album = 'Elephant'
|
||||
o2.artist = 'The White Stripes'
|
||||
o2.title = 'The Air Near My Fingers'
|
||||
o2.album = 'Elephant'
|
||||
o3.artist = 'The White Stripes'
|
||||
o3.title = 'The Air Near My Fingers'
|
||||
o3.album = 'foobar'
|
||||
r = s.GetDupeGroups([o1,o2,o3])
|
||||
self.assertEqual(1,len(r))
|
||||
|
||||
def test_that_dash_in_tags_dont_create_new_fields(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_TAG_WITH_ALBUM
|
||||
s.min_match_percentage = 50
|
||||
o1 = no('foo')
|
||||
o2 = no('bar')
|
||||
o1.artist = 'The White Stripes - a'
|
||||
o1.title = 'The Air Near My Fingers - a'
|
||||
o1.album = 'Elephant - a'
|
||||
o2.artist = 'The White Stripes - b'
|
||||
o2.title = 'The Air Near My Fingers - b'
|
||||
o2.album = 'Elephant - b'
|
||||
r = s.GetDupeGroups([o1,o2])
|
||||
self.assertEqual(1,len(r))
|
||||
|
||||
def test_tag_scan_with_different_scanned(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_TAG
|
||||
s.scanned_tags = set(['track', 'year'])
|
||||
o1 = no('foo')
|
||||
o2 = no('bar')
|
||||
o1.artist = 'The White Stripes'
|
||||
o1.title = 'some title'
|
||||
o1.track = 'foo'
|
||||
o1.year = 'bar'
|
||||
o2.artist = 'The White Stripes'
|
||||
o2.title = 'another title'
|
||||
o2.track = 'foo'
|
||||
o2.year = 'bar'
|
||||
r = s.GetDupeGroups([o1, o2])
|
||||
self.assertEqual(1, len(r))
|
||||
|
||||
def test_tag_scan_only_scans_existing_tags(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_TAG
|
||||
s.scanned_tags = set(['artist', 'foo'])
|
||||
o1 = no('foo')
|
||||
o2 = no('bar')
|
||||
o1.artist = 'The White Stripes'
|
||||
o1.foo = 'foo'
|
||||
o2.artist = 'The White Stripes'
|
||||
o2.foo = 'bar'
|
||||
r = s.GetDupeGroups([o1, o2])
|
||||
self.assertEqual(1, len(r)) # Because 'foo' is not scanned, they match
|
||||
|
||||
def test_tag_scan_converts_to_str(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_TAG
|
||||
s.scanned_tags = set(['track'])
|
||||
o1 = no('foo')
|
||||
o2 = no('bar')
|
||||
o1.track = 42
|
||||
o2.track = 42
|
||||
try:
|
||||
r = s.GetDupeGroups([o1, o2])
|
||||
except TypeError:
|
||||
self.fail()
|
||||
self.assertEqual(1, len(r))
|
||||
|
||||
def test_tag_scan_non_ascii(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_TAG
|
||||
s.scanned_tags = set(['title'])
|
||||
o1 = no('foo')
|
||||
o2 = no('bar')
|
||||
o1.title = u'foobar\u00e9'
|
||||
o2.title = u'foobar\u00e9'
|
||||
try:
|
||||
r = s.GetDupeGroups([o1, o2])
|
||||
except UnicodeEncodeError:
|
||||
self.fail()
|
||||
self.assertEqual(1, len(r))
|
||||
|
||||
def test_audio_content_scan(self):
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_CONTENT_AUDIO
|
||||
f = [no('foo'),no('bar'),no('bleh')]
|
||||
f[0].md5 = 'foo'
|
||||
f[1].md5 = 'bar'
|
||||
f[2].md5 = 'bleh'
|
||||
f[0].md5partial = 'foo'
|
||||
f[1].md5partial = 'foo'
|
||||
f[2].md5partial = 'bleh'
|
||||
f[0].audiosize = 1
|
||||
f[1].audiosize = 1
|
||||
f[2].audiosize = 1
|
||||
r = s.GetDupeGroups(f)
|
||||
self.assertEqual(1,len(r))
|
||||
self.assertEqual(2,len(r[0]))
|
||||
|
||||
def test_audio_content_scan_compare_sizes_first(self):
|
||||
class MyFile(no):
|
||||
def get_md5(file):
|
||||
self.fail()
|
||||
md5partial = property(get_md5)
|
||||
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_CONTENT_AUDIO
|
||||
f = [MyFile('foo'),MyFile('bar')]
|
||||
f[0].audiosize = 1
|
||||
f[1].audiosize = 2
|
||||
self.assertEqual(0,len(s.GetDupeGroups(f)))
|
||||
|
||||
    def test_ignore_list(self):
        s = Scanner()
        f1 = no('foobar')
        f2 = no('foobar')
        f3 = no('foobar')
        f1.path = Path('dir1/foobar')
        f2.path = Path('dir2/foobar')
        f3.path = Path('dir3/foobar')
        s.ignore_list.Ignore(str(f1.path),str(f2.path))
        s.ignore_list.Ignore(str(f1.path),str(f3.path))
        r = s.GetDupeGroups([f1,f2,f3])
        self.assertEqual(1,len(r))
        g = r[0]
        self.assertEqual(1,len(g.dupes))
        self.assert_(f1 not in g)
        self.assert_(f2 in g)
        self.assert_(f3 in g)
        # Ignored matches are not counted as discarded
        self.assertEqual(s.discarded_file_count, 0)

    def test_ignore_list_checks_for_unicode(self):
        # The scanner used to call path_str() for ignore list checks. Since the Path changes,
        # it must use unicode(path) instead.
        s = Scanner()
        f1 = no('foobar')
        f2 = no('foobar')
        f3 = no('foobar')
        f1.path = Path(u'foo1\u00e9')
        f2.path = Path(u'foo2\u00e9')
        f3.path = Path(u'foo3\u00e9')
        s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
        s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
        r = s.GetDupeGroups([f1,f2,f3])
        self.assertEqual(1,len(r))
        g = r[0]
        self.assertEqual(1,len(g.dupes))
        self.assert_(f1 not in g)
        self.assert_(f2 in g)
        self.assert_(f3 in g)

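    # The scanner delegates match generation to match_factory; this stub factory returns a single fixed match with a score of 420.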
    def test_custom_match_factory(self):
        class MatchFactory(object):
            def getmatches(self,objects,j=None):
                return [Match(objects[0], objects[1], 420)]

        s = Scanner()
        s.match_factory = MatchFactory()
        o1,o2 = no('foo'),no('bar')
        groups = s.GetDupeGroups([o1,o2])
        self.assertEqual(1,len(groups))
        g = groups[0]
        self.assertEqual(2,len(g))
        g.switch_ref(o1)
        m = g.get_match_of(o2)
        self.assertEqual((o1,o2,420),m)

    def test_file_evaluates_to_false(self):
        # A very wrong way to use any() was added at some point, causing the resulting group
        # list to be empty.
        class FalseNamedObject(NamedObject):
            def __nonzero__(self):
                return False

        s = Scanner()
        f1 = FalseNamedObject('foobar')
        f2 = FalseNamedObject('foobar')
        r = s.GetDupeGroups([f1,f2])
        self.assertEqual(1,len(r))

    def test_size_threshold(self):
        # Only files whose size is equal to or higher than size_threshold are scanned
        s = Scanner()
        f1 = no('foo', 1)
        f2 = no('foo', 2)
        f3 = no('foo', 3)
        s.size_threshold = 2
        groups = s.GetDupeGroups([f1,f2,f3])
        self.assertEqual(len(groups), 1)
        [group] = groups
        self.assertEqual(len(group), 2)
        self.assertTrue(f1 not in group)
        self.assertTrue(f2 in group)
        self.assertTrue(f3 in group)

    def test_tie_breaker_path_deepness(self):
        # If there is a tie in prioritization, path deepness is used as a tie breaker
        s = Scanner()
        o1, o2 = no('foo'), no('foo')
        o1.path = Path('foo')
        o2.path = Path('foo/bar')
        [group] = s.GetDupeGroups([o1, o2])
        self.assertTrue(group.ref is o2)

    def test_tie_breaker_copy(self):
        # If 'copy' is among the words used (even if the file has a deeper path), it becomes a dupe
        s = Scanner()
        o1, o2 = no('foo bar Copy'), no('foo bar')
        o1.path = Path('deeper/path')
        o2.path = Path('foo')
        [group] = s.GetDupeGroups([o1, o2])
        self.assertTrue(group.ref is o2)

    def test_tie_breaker_same_name_plus_digit(self):
        # If ref has the same words as dupe, plus just one extra word which is a digit, it
        # becomes a dupe
        s = Scanner()
        o1, o2 = no('foo bar 42'), no('foo bar')
        o1.path = Path('deeper/path')
        o2.path = Path('foo')
        [group] = s.GetDupeGroups([o1, o2])
        self.assertTrue(group.ref is o2)

    def test_partial_group_match(self):
        # Count the number of discarded matches (when a file doesn't match all other dupes of the
        # group) in Scanner.discarded_file_count
        s = Scanner()
        o1, o2, o3 = no('a b'), no('a'), no('b')
        s.min_match_percentage = 50
        [group] = s.GetDupeGroups([o1, o2, o3])
        self.assertEqual(len(group), 2)
        self.assertTrue(o1 in group)
        self.assertTrue(o2 in group)
        self.assertTrue(o3 not in group)
        self.assertEqual(s.discarded_file_count, 1)


class TCScannerME(TestCase):
    def test_priorize(self):
        # In ScannerME, bitrate goes first (right after is_ref) in prioritization
        s = ScannerME()
        o1, o2 = no('foo'), no('foo')
        o1.bitrate = 1
        o2.bitrate = 2
        [group] = s.GetDupeGroups([o1, o2])
        self.assertTrue(group.ref is o2)


if __name__ == "__main__":
    unittest.main()