
Initial commit.

--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402
hsoft committed 2009-06-01 09:55:11 +00:00
parent 4f197ffd5a
commit e9a97afdf8
354 changed files with 38083 additions and 0 deletions

py/__init__.py Normal file

@@ -0,0 +1 @@

py/app.py Normal file

@@ -0,0 +1,229 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.app
Created By: Virgil Dupras
Created On: 2006/11/11
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
$Revision: 4388 $
Copyright 2006 Hardcoded Software (http://www.hardcoded.net)
"""
import os
import os.path as op
import logging
from hsfs import IT_ATTRS, IT_EXTRA
from hsutil import job, io, files
from hsutil.path import Path
from hsutil.reg import RegistrableApplication, RegistrationRequired
from hsutil.misc import flatten, first
from hsutil.str import escape
import directories
import results
import scanner
JOB_SCAN = 'job_scan'
JOB_LOAD = 'job_load'
JOB_MOVE = 'job_move'
JOB_COPY = 'job_copy'
JOB_DELETE = 'job_delete'
class NoScannableFileError(Exception):
pass
class AllFilesAreRefError(Exception):
pass
class DupeGuru(RegistrableApplication):
def __init__(self, data_module, appdata, appid):
RegistrableApplication.__init__(self, appid)
self.appdata = appdata
if not op.exists(self.appdata):
os.makedirs(self.appdata)
self.data = data_module
self.directories = directories.Directories()
self.results = results.Results(data_module)
self.scanner = scanner.Scanner()
self.action_count = 0
self.last_op_error_count = 0
self.options = {
'escape_filter_regexp': True,
'clean_empty_dirs': False,
}
def _demo_check(self):
if self.registered:
return
count = self.results.mark_count
if count + self.action_count > 10:
raise RegistrationRequired()
else:
self.action_count += count
def _do_delete(self, j):
def op(dupe):
j.add_progress()
return self._do_delete_dupe(dupe)
j.start_job(self.results.mark_count)
self.last_op_error_count = self.results.perform_on_marked(op, True)
def _do_delete_dupe(self, dupe):
if not io.exists(dupe.path):
dupe.parent = None
return True
self._recycle_dupe(dupe)
self.clean_empty_dirs(dupe.path[:-1])
if not io.exists(dupe.path):
dupe.parent = None
return True
logging.warning(u"Could not send {0} to trash.".format(unicode(dupe.path)))
return False
def _do_load(self, j):
self.directories.LoadFromFile(op.join(self.appdata, 'last_directories.xml'))
j = j.start_subjob([1, 9])
self.results.load_from_xml(op.join(self.appdata, 'last_results.xml'), self._get_file, j)
files = flatten(g[:] for g in self.results.groups)
for file in j.iter_with_progress(files, 'Reading metadata %d/%d'):
file._read_all_info(sections=[IT_ATTRS, IT_EXTRA])
def _get_file(self, str_path):
p = Path(str_path)
for d in self.directories:
if p not in d.path:
continue
result = d.find_path(p[d.path:])
if result is not None:
return result
@staticmethod
def _recycle_dupe(dupe):
raise NotImplementedError()
def _start_job(self, jobid, func):
        # func will be called with a single argument: the job (func(j))
raise NotImplementedError()
def AddDirectory(self, d):
try:
self.directories.add_path(Path(d))
return 0
except directories.AlreadyThereError:
return 1
except directories.InvalidPathError:
return 2
def AddToIgnoreList(self, dupe):
g = self.results.get_group_of_duplicate(dupe)
for other in g:
if other is not dupe:
self.scanner.ignore_list.Ignore(unicode(other.path), unicode(dupe.path))
def ApplyFilter(self, filter):
self.results.apply_filter(None)
if self.options['escape_filter_regexp']:
filter = escape(filter, '()[]\\.|+?^')
filter = escape(filter, '*', '.')
self.results.apply_filter(filter)
def clean_empty_dirs(self, path):
if self.options['clean_empty_dirs']:
while files.delete_if_empty(path, ['.DS_Store']):
path = path[:-1]
def CopyOrMove(self, dupe, copy, destination, dest_type):
"""
copy: True = Copy False = Move
destination: string.
dest_type: 0 = right in destination.
1 = relative re-creation.
2 = absolute re-creation.
"""
source_path = dupe.path
location_path = dupe.root.path
dest_path = Path(destination)
if dest_type == 2:
dest_path = dest_path + source_path[1:-1] #Remove drive letter and filename
elif dest_type == 1:
dest_path = dest_path + source_path[location_path:-1]
if not io.exists(dest_path):
io.makedirs(dest_path)
try:
if copy:
files.copy(source_path, dest_path)
else:
files.move(source_path, dest_path)
self.clean_empty_dirs(source_path[:-1])
except (IOError, OSError) as e:
operation = 'Copy' if copy else 'Move'
logging.warning('%s operation failed on %s. Error: %s' % (operation, unicode(dupe.path), unicode(e)))
return False
return True
def copy_or_move_marked(self, copy, destination, recreate_path):
def do(j):
def op(dupe):
j.add_progress()
return self.CopyOrMove(dupe, copy, destination, recreate_path)
j.start_job(self.results.mark_count)
self.last_op_error_count = self.results.perform_on_marked(op, not copy)
self._demo_check()
jobid = JOB_COPY if copy else JOB_MOVE
self._start_job(jobid, do)
def delete_marked(self):
self._demo_check()
self._start_job(JOB_DELETE, self._do_delete)
def load(self):
self._start_job(JOB_LOAD, self._do_load)
self.LoadIgnoreList()
def LoadIgnoreList(self):
p = op.join(self.appdata, 'ignore_list.xml')
self.scanner.ignore_list.load_from_xml(p)
def make_reference(self, duplicates):
changed_groups = set()
for dupe in duplicates:
g = self.results.get_group_of_duplicate(dupe)
if g not in changed_groups:
self.results.make_ref(dupe)
changed_groups.add(g)
def Save(self):
self.directories.SaveToFile(op.join(self.appdata, 'last_directories.xml'))
self.results.save_to_xml(op.join(self.appdata, 'last_results.xml'))
def SaveIgnoreList(self):
p = op.join(self.appdata, 'ignore_list.xml')
self.scanner.ignore_list.save_to_xml(p)
def start_scanning(self):
def do(j):
j.set_progress(0, 'Collecting files to scan')
files = list(self.directories.get_files())
logging.info('Scanning %d files' % len(files))
self.results.groups = self.scanner.GetDupeGroups(files, j)
files = self.directories.get_files()
first_file = first(files)
if first_file is None:
raise NoScannableFileError()
if first_file.is_ref and all(f.is_ref for f in files):
raise AllFilesAreRefError()
self.results.groups = []
self._start_job(JOB_SCAN, do)
#--- Properties
@property
def stat_line(self):
result = self.results.stat_line
if self.scanner.discarded_file_count:
result = '%s (%d discarded)' % (result, self.scanner.discarded_file_count)
return result

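The class above is deliberately abstract: _recycle_dupe and _start_job raise NotImplementedError and are supplied by platform subclasses (app_cocoa.py below, or the synchronous harness in app_test.py). A minimal headless sketch of that contract, using the same nulljob trick as the tests (the class name and the trash stub are made up):

from hsutil.job import nulljob
import app, data

class HeadlessDupeGuru(app.DupeGuru):  # hypothetical name; mirrors app_test.py
    def __init__(self):
        app.DupeGuru.__init__(self, data, '/tmp', appid=4)
    def _start_job(self, jobid, func):
        func(nulljob)  # run the job synchronously, with no progress UI
    @staticmethod
    def _recycle_dupe(dupe):
        return False  # stub: a real subclass sends dupe.path to the trash
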
py/app_cocoa.py Normal file

@@ -0,0 +1,304 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.app_cocoa
Created By: Virgil Dupras
Created On: 2006/11/11
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2006 Hardcoded Software (http://www.hardcoded.net)
"""
from AppKit import *
import logging
import os.path as op
import hsfs as fs
from hsfs.phys.bundle import Bundle
from hsutil.cocoa import install_exception_hook
from hsutil.str import get_file_ext
from hsutil import io, cocoa, job
from hsutil.reg import RegistrationRequired
import export, app, data
JOBID2TITLE = {
app.JOB_SCAN: "Scanning for duplicates",
app.JOB_LOAD: "Loading",
app.JOB_MOVE: "Moving",
app.JOB_COPY: "Copying",
app.JOB_DELETE: "Sending to Trash",
}
class DGDirectory(fs.phys.Directory):
def _create_sub_dir(self,name,with_parent = True):
ext = get_file_ext(name)
if ext == 'app':
if with_parent:
parent = self
else:
parent = None
return Bundle(parent,name)
else:
return super(DGDirectory,self)._create_sub_dir(name,with_parent)
def demo_method(method):
def wrapper(self, *args, **kwargs):
try:
return method(self, *args, **kwargs)
except RegistrationRequired:
NSNotificationCenter.defaultCenter().postNotificationName_object_('RegistrationRequired', self)
return wrapper
class DupeGuru(app.DupeGuru):
def __init__(self, data_module, appdata_subdir, appid):
LOGGING_LEVEL = logging.DEBUG if NSUserDefaults.standardUserDefaults().boolForKey_('debug') else logging.WARNING
logging.basicConfig(level=LOGGING_LEVEL, format='%(levelname)s %(message)s')
logging.debug('started in debug mode')
install_exception_hook()
if data_module is None:
data_module = data
appdata = op.expanduser(op.join('~', '.hsoftdata', appdata_subdir))
app.DupeGuru.__init__(self, data_module, appdata, appid)
self.progress = cocoa.ThreadedJobPerformer()
self.directories.dirclass = DGDirectory
self.display_delta_values = False
self.selected_dupes = []
self.RefreshDetailsTable(None,None)
#--- Override
@staticmethod
def _recycle_dupe(dupe):
if not io.exists(dupe.path):
dupe.parent = None
return True
directory = unicode(dupe.parent.path)
filename = dupe.name
result, tag = NSWorkspace.sharedWorkspace().performFileOperation_source_destination_files_tag_(
NSWorkspaceRecycleOperation, directory, '', [filename])
if not io.exists(dupe.path):
dupe.parent = None
return True
logging.warning('Could not send %s to trash. tag: %d' % (unicode(dupe.path), tag))
return False
def _start_job(self, jobid, func):
try:
j = self.progress.create_job()
self.progress.run_threaded(func, args=(j, ))
except job.JobInProgressError:
NSNotificationCenter.defaultCenter().postNotificationName_object_('JobInProgress', self)
else:
ud = {'desc': JOBID2TITLE[jobid], 'jobid':jobid}
NSNotificationCenter.defaultCenter().postNotificationName_object_userInfo_('JobStarted', self, ud)
#---Helpers
def GetObjects(self,node_path):
        #Returns a (group, dupe) tuple
try:
g = self.results.groups[node_path[0]]
if len(node_path) == 2:
return (g,self.results.groups[node_path[0]].dupes[node_path[1]])
else:
return (g,None)
except IndexError:
return (None,None)
def GetDirectory(self,node_path,curr_dir=None):
if not node_path:
return curr_dir
if curr_dir is not None:
l = curr_dir.dirs
else:
l = self.directories
d = l[node_path[0]]
return self.GetDirectory(node_path[1:],d)
def RefreshDetailsTable(self,dupe,group):
l1 = self.data.GetDisplayInfo(dupe,group,False)
if group is not None:
l2 = self.data.GetDisplayInfo(group.ref,group,False)
else:
l2 = l1 #To have a list of empty '---' values
names = [c['display'] for c in self.data.COLUMNS]
self.details_table = zip(names,l1,l2)
#---Public
def AddSelectedToIgnoreList(self):
for dupe in self.selected_dupes:
self.AddToIgnoreList(dupe)
copy_or_move_marked = demo_method(app.DupeGuru.copy_or_move_marked)
delete_marked = demo_method(app.DupeGuru.delete_marked)
def ExportToXHTML(self,column_ids,xslt_path,css_path):
columns = []
for index,column in enumerate(self.data.COLUMNS):
display = column['display']
enabled = str(index) in column_ids
columns.append((display,enabled))
xml_path = op.join(self.appdata,'results_export.xml')
self.results.save_to_xml(xml_path,self.data.GetDisplayInfo)
return export.export_to_xhtml(xml_path,xslt_path,css_path,columns)
def MakeSelectedReference(self):
self.make_reference(self.selected_dupes)
def OpenSelected(self):
if self.selected_dupes:
path = unicode(self.selected_dupes[0].path)
NSWorkspace.sharedWorkspace().openFile_(path)
def PurgeIgnoreList(self):
self.scanner.ignore_list.Filter(lambda f,s:op.exists(f) and op.exists(s))
def RefreshDetailsWithSelected(self):
if self.selected_dupes:
self.RefreshDetailsTable(
self.selected_dupes[0],
self.results.get_group_of_duplicate(self.selected_dupes[0])
)
else:
self.RefreshDetailsTable(None,None)
def RemoveDirectory(self,index):
try:
del self.directories[index]
except IndexError:
pass
def RemoveSelected(self):
self.results.remove_duplicates(self.selected_dupes)
def RenameSelected(self,newname):
try:
d = self.selected_dupes[0]
d = d.move(d.parent,newname)
return True
        except (IndexError, fs.FSError) as e:
logging.warning("dupeGuru Warning: %s" % str(e))
return False
def RevealSelected(self):
if self.selected_dupes:
path = unicode(self.selected_dupes[0].path)
NSWorkspace.sharedWorkspace().selectFile_inFileViewerRootedAtPath_(path,'')
def start_scanning(self):
self.RefreshDetailsTable(None, None)
try:
app.DupeGuru.start_scanning(self)
return 0
except app.NoScannableFileError:
return 3
except app.AllFilesAreRefError:
return 1
def SelectResultNodePaths(self,node_paths):
def extract_dupe(t):
g,d = t
if d is not None:
return d
else:
if g is not None:
return g.ref
selected = [extract_dupe(self.GetObjects(p)) for p in node_paths]
self.selected_dupes = [dupe for dupe in selected if dupe is not None]
def SelectPowerMarkerNodePaths(self,node_paths):
rows = [p[0] for p in node_paths]
self.selected_dupes = [
self.results.dupes[row] for row in rows if row in xrange(len(self.results.dupes))
]
def SetDirectoryState(self,node_path,state):
d = self.GetDirectory(node_path)
self.directories.SetState(d.path,state)
def sort_dupes(self,key,asc):
self.results.sort_dupes(key,asc,self.display_delta_values)
def sort_groups(self,key,asc):
self.results.sort_groups(key,asc)
def ToggleSelectedMarkState(self):
for dupe in self.selected_dupes:
self.results.mark_toggle(dupe)
#---Data
def GetOutlineViewMaxLevel(self, tag):
if tag == 0:
return 2
elif tag == 1:
return 0
elif tag == 2:
return 1
def GetOutlineViewChildCounts(self, tag, node_path):
if self.progress._job_running:
return []
if tag == 0: #Normal results
assert not node_path # no other value is possible
return [len(g.dupes) for g in self.results.groups]
elif tag == 1: #Directories
dirs = self.GetDirectory(node_path).dirs if node_path else self.directories
return [d.dircount for d in dirs]
else: #Power Marker
assert not node_path # no other value is possible
return [0 for d in self.results.dupes]
def GetOutlineViewValues(self, tag, node_path):
if self.progress._job_running:
return
if not node_path:
return
if tag in (0,2): #Normal results / Power Marker
if tag == 0:
g, d = self.GetObjects(node_path)
if d is None:
d = g.ref
else:
d = self.results.dupes[node_path[0]]
g = self.results.get_group_of_duplicate(d)
result = self.data.GetDisplayInfo(d, g, self.display_delta_values)
return result
elif tag == 1: #Directories
d = self.GetDirectory(node_path)
return [
d.name,
self.directories.GetState(d.path)
]
def GetOutlineViewMarked(self, tag, node_path):
# 0=unmarked 1=marked 2=unmarkable
if self.progress._job_running:
return
if not node_path:
return 2
if tag == 1: #Directories
return 2
if tag == 0: #Normal results
g, d = self.GetObjects(node_path)
else: #Power Marker
d = self.results.dupes[node_path[0]]
if (d is None) or (not self.results.is_markable(d)):
return 2
elif self.results.is_marked(d):
return 1
else:
return 0
def GetTableViewCount(self, tag):
if self.progress._job_running:
return 0
return len(self.details_table)
def GetTableViewMarkedIndexes(self,tag):
return []
def GetTableViewValues(self,tag,row):
return self.details_table[row]

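All the outline-view methods above share one addressing scheme: a node path is a list of indexes from the root. In the results outline (tag 0), [g] designates group g's reference row and [g, d] its d-th dupe; the Power Marker view (tag 2) is flat, one row per entry of results.dupes; the directories outline (tag 1) recurses through GetDirectory. A short sketch of the convention (dg is a hypothetical DupeGuru instance with results loaded):

g, d = dg.GetObjects([0])       # first group; d is None (the reference row)
g, d = dg.GetObjects([0, 1])    # second dupe of the first group
dg.SelectPowerMarkerNodePaths([[2]])  # flat view: row 2 of results.dupes
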
py/app_cocoa_test.py Normal file

@@ -0,0 +1,320 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.tests.app_cocoa
Created By: Virgil Dupras
Created On: 2006/11/11
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-29 17:51:41 +0200 (Fri, 29 May 2009) $
$Revision: 4409 $
Copyright 2006 Hardcoded Software (http://www.hardcoded.net)
"""
import tempfile
import shutil
import logging
from hsutil.path import Path
from hsutil.testcase import TestCase
from hsutil.decorators import log_calls
from hsutil.job import nulljob
import hsfs.phys
import os.path as op
from . import engine, data
try:
from .app_cocoa import DupeGuru as DupeGuruBase, DGDirectory
except ImportError:
from nose.plugins.skip import SkipTest
raise SkipTest("These tests can only be run on OS X")
from .results_test import GetTestGroups
class DupeGuru(DupeGuruBase):
def __init__(self):
DupeGuruBase.__init__(self, data, '/tmp', appid=4)
def _start_job(self, jobid, func):
func(nulljob)
def r2np(rows):
#Transforms a list of rows [1,2,3] into a list of node paths [[1],[2],[3]]
return [[i] for i in rows]
class TCDupeGuru(TestCase):
def setUp(self):
self.app = DupeGuru()
self.objects,self.matches,self.groups = GetTestGroups()
self.app.results.groups = self.groups
def test_GetObjects(self):
app = self.app
objects = self.objects
groups = self.groups
g,d = app.GetObjects([0])
self.assert_(g is groups[0])
self.assert_(d is None)
g,d = app.GetObjects([0,0])
self.assert_(g is groups[0])
self.assert_(d is objects[1])
g,d = app.GetObjects([1,0])
self.assert_(g is groups[1])
self.assert_(d is objects[4])
def test_GetObjects_after_sort(self):
app = self.app
objects = self.objects
groups = self.groups[:] #To keep the old order in memory
app.sort_groups(0,False) #0 = Filename
#Now, the group order is supposed to be reversed
g,d = app.GetObjects([0,0])
self.assert_(g is groups[1])
self.assert_(d is objects[4])
def test_GetObjects_out_of_range(self):
app = self.app
self.assertEqual((None,None),app.GetObjects([2]))
self.assertEqual((None,None),app.GetObjects([]))
self.assertEqual((None,None),app.GetObjects([1,2]))
def test_selectResultNodePaths(self):
app = self.app
objects = self.objects
app.SelectResultNodePaths([[0,0],[0,1]])
self.assertEqual(2,len(app.selected_dupes))
self.assert_(app.selected_dupes[0] is objects[1])
self.assert_(app.selected_dupes[1] is objects[2])
def test_selectResultNodePaths_with_ref(self):
app = self.app
objects = self.objects
app.SelectResultNodePaths([[0,0],[0,1],[1]])
self.assertEqual(3,len(app.selected_dupes))
self.assert_(app.selected_dupes[0] is objects[1])
self.assert_(app.selected_dupes[1] is objects[2])
self.assert_(app.selected_dupes[2] is self.groups[1].ref)
def test_selectResultNodePaths_empty(self):
self.app.SelectResultNodePaths([])
self.assertEqual(0,len(self.app.selected_dupes))
def test_selectResultNodePaths_after_sort(self):
app = self.app
objects = self.objects
groups = self.groups[:] #To keep the old order in memory
app.sort_groups(0,False) #0 = Filename
#Now, the group order is supposed to be reversed
app.SelectResultNodePaths([[0,0],[1],[1,0]])
self.assertEqual(3,len(app.selected_dupes))
self.assert_(app.selected_dupes[0] is objects[4])
self.assert_(app.selected_dupes[1] is groups[0].ref)
self.assert_(app.selected_dupes[2] is objects[1])
def test_selectResultNodePaths_out_of_range(self):
app = self.app
app.SelectResultNodePaths([[0,0],[0,1],[1],[1,1],[2]])
self.assertEqual(3,len(app.selected_dupes))
def test_selectPowerMarkerRows(self):
app = self.app
objects = self.objects
app.SelectPowerMarkerNodePaths(r2np([0,1,2]))
self.assertEqual(3,len(app.selected_dupes))
self.assert_(app.selected_dupes[0] is objects[1])
self.assert_(app.selected_dupes[1] is objects[2])
self.assert_(app.selected_dupes[2] is objects[4])
def test_selectPowerMarkerRows_empty(self):
self.app.SelectPowerMarkerNodePaths([])
self.assertEqual(0,len(self.app.selected_dupes))
def test_selectPowerMarkerRows_after_sort(self):
app = self.app
objects = self.objects
app.sort_dupes(0,False) #0 = Filename
app.SelectPowerMarkerNodePaths(r2np([0,1,2]))
self.assertEqual(3,len(app.selected_dupes))
self.assert_(app.selected_dupes[0] is objects[4])
self.assert_(app.selected_dupes[1] is objects[2])
self.assert_(app.selected_dupes[2] is objects[1])
def test_selectPowerMarkerRows_out_of_range(self):
app = self.app
app.SelectPowerMarkerNodePaths(r2np([0,1,2,3]))
self.assertEqual(3,len(app.selected_dupes))
def test_toggleSelectedMark(self):
app = self.app
objects = self.objects
app.ToggleSelectedMarkState()
self.assertEqual(0,app.results.mark_count)
app.SelectPowerMarkerNodePaths(r2np([0,2]))
app.ToggleSelectedMarkState()
self.assertEqual(2,app.results.mark_count)
self.assert_(not app.results.is_marked(objects[0]))
self.assert_(app.results.is_marked(objects[1]))
self.assert_(not app.results.is_marked(objects[2]))
self.assert_(not app.results.is_marked(objects[3]))
self.assert_(app.results.is_marked(objects[4]))
def test_refreshDetailsWithSelected(self):
def mock_refresh(dupe,group):
self.called = True
if self.app.selected_dupes:
self.assert_(dupe is self.app.selected_dupes[0])
self.assert_(group is self.app.results.get_group_of_duplicate(dupe))
else:
self.assert_(dupe is None)
self.assert_(group is None)
self.app.RefreshDetailsTable = mock_refresh
self.called = False
self.app.SelectPowerMarkerNodePaths(r2np([0,2]))
self.app.RefreshDetailsWithSelected()
self.assert_(self.called)
self.called = False
self.app.SelectPowerMarkerNodePaths([])
self.app.RefreshDetailsWithSelected()
self.assert_(self.called)
def test_makeSelectedReference(self):
app = self.app
objects = self.objects
groups = self.groups
app.SelectPowerMarkerNodePaths(r2np([0,2]))
app.MakeSelectedReference()
self.assert_(groups[0].ref is objects[1])
self.assert_(groups[1].ref is objects[4])
def test_makeSelectedReference_by_selecting_two_dupes_in_the_same_group(self):
app = self.app
objects = self.objects
groups = self.groups
app.SelectPowerMarkerNodePaths(r2np([0,1,2]))
        #Only rows 0 and 2 must become refs; row 1 is skipped because it belongs to the same group as row 0
app.MakeSelectedReference()
self.assert_(groups[0].ref is objects[1])
self.assert_(groups[1].ref is objects[4])
def test_removeSelected(self):
app = self.app
app.SelectPowerMarkerNodePaths(r2np([0,2]))
app.RemoveSelected()
self.assertEqual(1,len(app.results.dupes))
app.RemoveSelected()
self.assertEqual(1,len(app.results.dupes))
app.SelectPowerMarkerNodePaths(r2np([0,2]))
app.RemoveSelected()
self.assertEqual(0,len(app.results.dupes))
def test_addDirectory_simple(self):
app = self.app
self.assertEqual(0,app.AddDirectory(self.datadirpath()))
self.assertEqual(1,len(app.directories))
def test_addDirectory_already_there(self):
app = self.app
self.assertEqual(0,app.AddDirectory(self.datadirpath()))
self.assertEqual(1,app.AddDirectory(self.datadirpath()))
def test_addDirectory_does_not_exist(self):
app = self.app
self.assertEqual(2,app.AddDirectory('/does_not_exist'))
def test_ignore(self):
app = self.app
        app.SelectPowerMarkerNodePaths(r2np([2])) #The dupe of the second group (the 2-file group)
app.AddSelectedToIgnoreList()
self.assertEqual(1,len(app.scanner.ignore_list))
        app.SelectPowerMarkerNodePaths(r2np([0])) #First dupe of the 3-file group
app.AddSelectedToIgnoreList()
#BOTH the ref and the other dupe should have been added
self.assertEqual(3,len(app.scanner.ignore_list))
def test_purgeIgnoreList(self):
app = self.app
p1 = self.filepath('zerofile')
p2 = self.filepath('zerofill')
dne = '/does_not_exist'
app.scanner.ignore_list.Ignore(dne,p1)
app.scanner.ignore_list.Ignore(p2,dne)
app.scanner.ignore_list.Ignore(p1,p2)
app.PurgeIgnoreList()
self.assertEqual(1,len(app.scanner.ignore_list))
self.assert_(app.scanner.ignore_list.AreIgnored(p1,p2))
self.assert_(not app.scanner.ignore_list.AreIgnored(dne,p1))
def test_only_unicode_is_added_to_ignore_list(self):
def FakeIgnore(first,second):
if not isinstance(first,unicode):
self.fail()
if not isinstance(second,unicode):
self.fail()
app = self.app
app.scanner.ignore_list.Ignore = FakeIgnore
        app.SelectPowerMarkerNodePaths(r2np([2])) #The dupe of the second group (the 2-file group)
app.AddSelectedToIgnoreList()
def test_dirclass(self):
self.assert_(self.app.directories.dirclass is DGDirectory)
class TCDupeGuru_renameSelected(TestCase):
def setUp(self):
p = Path(tempfile.mkdtemp())
fp = open(str(p + 'foo bar 1'),mode='w')
fp.close()
fp = open(str(p + 'foo bar 2'),mode='w')
fp.close()
fp = open(str(p + 'foo bar 3'),mode='w')
fp.close()
refdir = hsfs.phys.Directory(None,str(p))
matches = engine.MatchFactory().getmatches(refdir.files)
groups = engine.get_groups(matches)
g = groups[0]
g.prioritize(lambda x:x.name)
app = DupeGuru()
app.results.groups = groups
self.app = app
self.groups = groups
self.p = p
self.refdir = refdir
def tearDown(self):
shutil.rmtree(str(self.p))
def test_simple(self):
app = self.app
refdir = self.refdir
g = self.groups[0]
app.SelectPowerMarkerNodePaths(r2np([0]))
self.assert_(app.RenameSelected('renamed'))
self.assert_('renamed' in refdir)
self.assert_('foo bar 2' not in refdir)
self.assert_(g.dupes[0] is refdir['renamed'])
self.assert_(g.dupes[0] in refdir)
def test_none_selected(self):
app = self.app
refdir = self.refdir
g = self.groups[0]
app.SelectPowerMarkerNodePaths([])
self.mock(logging, 'warning', log_calls(lambda msg: None))
self.assert_(not app.RenameSelected('renamed'))
msg = logging.warning.calls[0]['msg']
self.assertEqual('dupeGuru Warning: list index out of range', msg)
self.assert_('renamed' not in refdir)
self.assert_('foo bar 2' in refdir)
self.assert_(g.dupes[0] is refdir['foo bar 2'])
def test_name_already_exists(self):
app = self.app
refdir = self.refdir
g = self.groups[0]
app.SelectPowerMarkerNodePaths(r2np([0]))
self.mock(logging, 'warning', log_calls(lambda msg: None))
self.assert_(not app.RenameSelected('foo bar 1'))
msg = logging.warning.calls[0]['msg']
self.assert_(msg.startswith('dupeGuru Warning: \'foo bar 2\' already exists in'))
self.assert_('foo bar 1' in refdir)
self.assert_('foo bar 2' in refdir)
self.assert_(g.dupes[0] is refdir['foo bar 2'])

py/app_me_cocoa.py Normal file

@@ -0,0 +1,68 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.app_me_cocoa
Created By: Virgil Dupras
Created On: 2006/11/16
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2006 Hardcoded Software (http://www.hardcoded.net)
"""
import os.path as op
import logging
from appscript import app, k, CommandError
import time
from hsutil.cocoa import as_fetch
import hsfs.phys.music
import app_cocoa, data_me, scanner
JOB_REMOVE_DEAD_TRACKS = 'jobRemoveDeadTracks'
JOB_SCAN_DEAD_TRACKS = 'jobScanDeadTracks'
app_cocoa.JOBID2TITLE.update({
JOB_REMOVE_DEAD_TRACKS: "Removing dead tracks from your iTunes Library",
JOB_SCAN_DEAD_TRACKS: "Scanning the iTunes Library",
})
class DupeGuruME(app_cocoa.DupeGuru):
def __init__(self):
app_cocoa.DupeGuru.__init__(self, data_me, 'dupeguru_me', appid=1)
self.scanner = scanner.ScannerME()
self.directories.dirclass = hsfs.phys.music.Directory
self.dead_tracks = []
def remove_dead_tracks(self):
def do(j):
a = app('iTunes')
for index, track in enumerate(j.iter_with_progress(self.dead_tracks)):
if index % 100 == 0:
time.sleep(.1)
try:
track.delete()
except CommandError as e:
logging.warning('Error while trying to remove a track from iTunes: %s' % unicode(e))
self._start_job(JOB_REMOVE_DEAD_TRACKS, do)
def scan_dead_tracks(self):
def do(j):
a = app('iTunes')
try:
[source] = [s for s in a.sources() if s.kind() == k.library]
[library] = source.library_playlists()
except ValueError:
logging.warning('Some unexpected iTunes configuration encountered')
return
self.dead_tracks = []
tracks = as_fetch(library.file_tracks, k.file_track)
for index, track in enumerate(j.iter_with_progress(tracks)):
if index % 100 == 0:
time.sleep(.1)
if track.location() == k.missing_value:
self.dead_tracks.append(track)
logging.info('Found %d dead tracks' % len(self.dead_tracks))
self._start_job(JOB_SCAN_DEAD_TRACKS, do)

py/app_pe_cocoa.py Normal file

@@ -0,0 +1,212 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.app_pe_cocoa
Created By: Virgil Dupras
Created On: 2006/11/13
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2006 Hardcoded Software (http://www.hardcoded.net)
"""
import os
import os.path as op
import logging
import plistlib
import objc
from Foundation import *
from AppKit import *
from appscript import app, k
from hsutil import job, io
import hsfs as fs
from hsfs import phys
from hsutil import files
from hsutil.str import get_file_ext
from hsutil.path import Path
from hsutil.cocoa import as_fetch
import app_cocoa, data_pe, directories, picture.matchbase
from picture.cache import string_to_colors, Cache
mainBundle = NSBundle.mainBundle()
PictureBlocks = mainBundle.classNamed_('PictureBlocks')
assert PictureBlocks is not None
class Photo(phys.File):
cls_info_map = {
'size': fs.IT_ATTRS,
'ctime': fs.IT_ATTRS,
'mtime': fs.IT_ATTRS,
'md5': fs.IT_MD5,
'md5partial': fs.IT_MD5,
'dimensions': fs.IT_EXTRA,
}
def _initialize_info(self,section):
super(Photo, self)._initialize_info(section)
if section == fs.IT_EXTRA:
self._info.update({
'dimensions': (0,0),
})
def _read_info(self,section):
super(Photo, self)._read_info(section)
if section == fs.IT_EXTRA:
size = PictureBlocks.getImageSize_(unicode(self.path))
self._info['dimensions'] = (size.width, size.height)
def get_blocks(self, block_count_per_side):
try:
blocks = PictureBlocks.getBlocksFromImagePath_blockCount_scanArea_(unicode(self.path), block_count_per_side, 0)
        except Exception as e:
raise IOError('The reading of "%s" failed with "%s"' % (unicode(self.path), unicode(e)))
if not blocks:
raise IOError('The picture %s could not be read' % unicode(self.path))
return string_to_colors(blocks)
class IPhoto(Photo):
def __init__(self, parent, whole_path):
super(IPhoto, self).__init__(parent, whole_path[-1])
self.whole_path = whole_path
def _build_path(self):
return self.whole_path
@property
def display_path(self):
return super(IPhoto, self)._build_path()
class Directory(phys.Directory):
cls_file_class = Photo
cls_supported_exts = ('png', 'jpg', 'jpeg', 'gif', 'psd', 'bmp', 'tiff', 'nef', 'cr2')
def _fetch_subitems(self):
subdirs, subfiles = super(Directory,self)._fetch_subitems()
return subdirs, [name for name in subfiles if get_file_ext(name) in self.cls_supported_exts]
class IPhotoLibrary(fs.Directory):
def __init__(self, plistpath):
self.plistpath = plistpath
self.refpath = plistpath[:-1]
# the AlbumData.xml file lives right in the library path
super(IPhotoLibrary, self).__init__(None, 'iPhoto Library')
def _update_photo(self, photo_data):
if photo_data['MediaType'] != 'Image':
return
photo_path = Path(photo_data['ImagePath'])
subpath = photo_path[len(self.refpath):-1]
subdir = self
for element in subpath:
try:
subdir = subdir[element]
except KeyError:
subdir = fs.Directory(subdir, element)
IPhoto(subdir, photo_path)
def update(self):
self.clear()
s = open(unicode(self.plistpath)).read()
# There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
s = s.replace('\x10', '')
plist = plistlib.readPlistFromString(s)
for photo_data in plist['Master Image List'].values():
self._update_photo(photo_data)
def force_update(self): # Don't update
pass
class DupeGuruPE(app_cocoa.DupeGuru):
def __init__(self):
app_cocoa.DupeGuru.__init__(self, data_pe, 'dupeguru_pe', appid=5)
self.scanner.match_factory = picture.matchbase.AsyncMatchFactory()
self.directories.dirclass = Directory
self.directories.special_dirclasses[Path('iPhoto Library')] = lambda _, __: self._create_iphoto_library()
p = op.join(self.appdata, 'cached_pictures.db')
self.scanner.match_factory.cached_blocks = Cache(p)
def _create_iphoto_library(self):
ud = NSUserDefaults.standardUserDefaults()
prefs = ud.persistentDomainForName_('com.apple.iApps')
plisturl = NSURL.URLWithString_(prefs['iPhotoRecentDatabases'][0])
plistpath = Path(plisturl.path())
return IPhotoLibrary(plistpath)
def _do_delete(self, j):
def op(dupe):
j.add_progress()
return self._do_delete_dupe(dupe)
marked = [dupe for dupe in self.results.dupes if self.results.is_marked(dupe)]
self.path2iphoto = {}
if any(isinstance(dupe, IPhoto) for dupe in marked):
a = app('iPhoto')
a.select(a.photo_library_album())
photos = as_fetch(a.photo_library_album().photos, k.item)
for photo in photos:
self.path2iphoto[photo.image_path()] = photo
self.last_op_error_count = self.results.perform_on_marked(op, True)
del self.path2iphoto
def _do_delete_dupe(self, dupe):
if isinstance(dupe, IPhoto):
photo = self.path2iphoto[unicode(dupe.path)]
app('iPhoto').remove(photo)
return True
else:
return app_cocoa.DupeGuru._do_delete_dupe(self, dupe)
def _do_load(self, j):
self.directories.LoadFromFile(op.join(self.appdata, 'last_directories.xml'))
for d in self.directories:
if isinstance(d, IPhotoLibrary):
d.update()
self.results.load_from_xml(op.join(self.appdata, 'last_results.xml'), self._get_file, j)
def _get_file(self, str_path):
p = Path(str_path)
for d in self.directories:
result = None
if p in d.path:
result = d.find_path(p[d.path:])
if isinstance(d, IPhotoLibrary) and p in d.refpath:
result = d.find_path(p[d.refpath:])
if result is not None:
return result
def AddDirectory(self, d):
try:
added = self.directories.add_path(Path(d))
if d == 'iPhoto Library':
added.update()
return 0
except directories.AlreadyThereError:
return 1
def CopyOrMove(self, dupe, copy, destination, dest_type):
if isinstance(dupe, IPhoto):
copy = True
return app_cocoa.DupeGuru.CopyOrMove(self, dupe, copy, destination, dest_type)
def start_scanning(self):
for directory in self.directories:
if isinstance(directory, IPhotoLibrary):
self.directories.SetState(directory.refpath, directories.STATE_EXCLUDED)
return app_cocoa.DupeGuru.start_scanning(self)
def selected_dupe_path(self):
if not self.selected_dupes:
return None
return self.selected_dupes[0].path
def selected_dupe_ref_path(self):
if not self.selected_dupes:
return None
ref = self.results.get_group_of_duplicate(self.selected_dupes[0]).ref
return ref.path

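Both _get_file overrides (here and in app.py) lean on hsutil.path's Path arithmetic: "p in root" tests whether p lies under root, slicing with a path strips that prefix, and p[:-1] drops the last component. A sketch of those semantics as the code above uses them (the paths are made up):

from hsutil.path import Path

root = Path('/Users/me/Pictures')
p = Path('/Users/me/Pictures/2009/img.jpg')
p in root    # True: p lives under root
p[root:]     # the remainder relative to root, i.e. ('2009', 'img.jpg')
p[:-1]       # the parent path
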
py/app_se_cocoa.py Normal file

@@ -0,0 +1,13 @@
#!/usr/bin/env python
# Unit Name: app_se_cocoa
# Created By: Virgil Dupras
# Created On: 2009-05-24
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
import app_cocoa, data
class DupeGuru(app_cocoa.DupeGuru):
def __init__(self):
app_cocoa.DupeGuru.__init__(self, data, 'dupeguru', appid=4)

py/app_test.py Normal file

@@ -0,0 +1,137 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.tests.app
Created By: Virgil Dupras
Created On: 2007-06-23
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
$Revision: 4388 $
Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
"""
import unittest
import os
from hsutil.testcase import TestCase
from hsutil import io
from hsutil.path import Path
from hsutil.decorators import log_calls
import hsfs as fs
import hsfs.phys
import hsutil.files
from hsutil.job import nulljob
from . import data, app
from .app import DupeGuru as DupeGuruBase
class DupeGuru(DupeGuruBase):
def __init__(self):
DupeGuruBase.__init__(self, data, '/tmp', appid=4)
def _start_job(self, jobid, func):
func(nulljob)
class TCDupeGuru(TestCase):
cls_tested_module = app
def test_ApplyFilter_calls_results_apply_filter(self):
app = DupeGuru()
self.mock(app.results, 'apply_filter', log_calls(app.results.apply_filter))
app.ApplyFilter('foo')
self.assertEqual(2, len(app.results.apply_filter.calls))
call = app.results.apply_filter.calls[0]
self.assert_(call['filter_str'] is None)
call = app.results.apply_filter.calls[1]
self.assertEqual('foo', call['filter_str'])
def test_ApplyFilter_escapes_regexp(self):
app = DupeGuru()
self.mock(app.results, 'apply_filter', log_calls(app.results.apply_filter))
app.ApplyFilter('()[]\\.|+?^abc')
call = app.results.apply_filter.calls[1]
self.assertEqual('\\(\\)\\[\\]\\\\\\.\\|\\+\\?\\^abc', call['filter_str'])
        app.ApplyFilter('(*)') # In "simple mode", we want the * to behave as a wildcard
call = app.results.apply_filter.calls[3]
self.assertEqual('\(.*\)', call['filter_str'])
app.options['escape_filter_regexp'] = False
app.ApplyFilter('(abc)')
call = app.results.apply_filter.calls[5]
self.assertEqual('(abc)', call['filter_str'])
def test_CopyOrMove(self):
# The goal here is just to have a test for a previous blowup I had. I know my test coverage
# for this unit is pathetic. What's done is done. My approach now is to add tests for
# every change I want to make. The blowup was caused by a missing import.
dupe_parent = fs.Directory(None, 'foo')
dupe = fs.File(dupe_parent, 'bar')
dupe.copy = log_calls(lambda dest, newname: None)
self.mock(hsutil.files, 'copy', log_calls(lambda source_path, dest_path: None))
self.mock(os, 'makedirs', lambda path: None) # We don't want the test to create that fake directory
self.mock(fs.phys, 'Directory', fs.Directory) # We don't want an error because makedirs didn't work
app = DupeGuru()
app.CopyOrMove(dupe, True, 'some_destination', 0)
self.assertEqual(1, len(hsutil.files.copy.calls))
call = hsutil.files.copy.calls[0]
self.assertEqual('some_destination', call['dest_path'])
self.assertEqual(dupe.path, call['source_path'])
def test_CopyOrMove_clean_empty_dirs(self):
tmppath = Path(self.tmpdir())
sourcepath = tmppath + 'source'
io.mkdir(sourcepath)
io.open(sourcepath + 'myfile', 'w')
tmpdir = hsfs.phys.Directory(None, unicode(tmppath))
myfile = tmpdir['source']['myfile']
app = DupeGuru()
self.mock(app, 'clean_empty_dirs', log_calls(lambda path: None))
app.CopyOrMove(myfile, False, tmppath + 'dest', 0)
calls = app.clean_empty_dirs.calls
self.assertEqual(1, len(calls))
self.assertEqual(sourcepath, calls[0]['path'])
def test_Scan_with_objects_evaluating_to_false(self):
# At some point, any() was used in a wrong way that made Scan() wrongly return 1
app = DupeGuru()
f1, f2 = [fs.File(None, 'foo') for i in range(2)]
f1.is_ref, f2.is_ref = (False, False)
assert not (bool(f1) and bool(f2))
app.directories.get_files = lambda: [f1, f2]
app.directories._dirs.append('this is just so Scan() doesnt return 3')
app.start_scanning() # no exception
class TCDupeGuru_clean_empty_dirs(TestCase):
cls_tested_module = app
def setUp(self):
self.mock(hsutil.files, 'delete_if_empty', log_calls(lambda path, files_to_delete=[]: None))
self.app = DupeGuru()
def test_option_off(self):
self.app.clean_empty_dirs(Path('/foo/bar'))
self.assertEqual(0, len(hsutil.files.delete_if_empty.calls))
def test_option_on(self):
self.app.options['clean_empty_dirs'] = True
self.app.clean_empty_dirs(Path('/foo/bar'))
calls = hsutil.files.delete_if_empty.calls
self.assertEqual(1, len(calls))
self.assertEqual(Path('/foo/bar'), calls[0]['path'])
self.assertEqual(['.DS_Store'], calls[0]['files_to_delete'])
def test_recurse_up(self):
# delete_if_empty must be recursively called up in the path until it returns False
@log_calls
def mock_delete_if_empty(path, files_to_delete=[]):
return len(path) > 1
self.mock(hsutil.files, 'delete_if_empty', mock_delete_if_empty)
self.app.options['clean_empty_dirs'] = True
self.app.clean_empty_dirs(Path('not-empty/empty/empty'))
calls = hsutil.files.delete_if_empty.calls
self.assertEqual(3, len(calls))
self.assertEqual(Path('not-empty/empty/empty'), calls[0]['path'])
self.assertEqual(Path('not-empty/empty'), calls[1]['path'])
self.assertEqual(Path('not-empty'), calls[2]['path'])
if __name__ == '__main__':
unittest.main()

py/data.py Normal file

@@ -0,0 +1,105 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.data
Created By: Virgil Dupras
Created On: 2006/03/15
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
from hsutil.str import format_time, FT_DECIMAL, format_size
import time
def format_path(p):
return unicode(p[:-1])
def format_timestamp(t, delta):
if delta:
return format_time(t, FT_DECIMAL)
else:
if t > 0:
return time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(t))
else:
return '---'
def format_words(w):
def do_format(w):
if isinstance(w, list):
return '(%s)' % ', '.join(do_format(item) for item in w)
else:
return w.replace('\n', ' ')
return ', '.join(do_format(item) for item in w)
def format_perc(p):
return "%0.0f" % p
def format_dupe_count(c):
return str(c) if c else '---'
def cmp_value(value):
return value.lower() if isinstance(value, basestring) else value
COLUMNS = [
{'attr':'name','display':'Filename'},
{'attr':'path','display':'Directory'},
{'attr':'size','display':'Size (KB)'},
{'attr':'extension','display':'Kind'},
{'attr':'ctime','display':'Creation'},
{'attr':'mtime','display':'Modification'},
{'attr':'percentage','display':'Match %'},
{'attr':'words','display':'Words Used'},
{'attr':'dupe_count','display':'Dupe Count'},
]
def GetDisplayInfo(dupe, group, delta=False):
if (dupe is None) or (group is None):
return ['---'] * len(COLUMNS)
size = dupe.size
ctime = dupe.ctime
mtime = dupe.mtime
m = group.get_match_of(dupe)
if m:
percentage = m.percentage
dupe_count = 0
if delta:
r = group.ref
size -= r.size
ctime -= r.ctime
mtime -= r.mtime
else:
percentage = group.percentage
dupe_count = len(group.dupes)
return [
dupe.name,
format_path(dupe.path),
format_size(size, 0, 1, False),
dupe.extension,
format_timestamp(ctime, delta and m),
format_timestamp(mtime, delta and m),
format_perc(percentage),
format_words(dupe.words),
format_dupe_count(dupe_count)
]
def GetDupeSortKey(dupe, get_group, key, delta):
if key == 6:
m = get_group().get_match_of(dupe)
return m.percentage
if key == 8:
return 0
r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
if delta and (key in (2, 4, 5)):
r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
return r
def GetGroupSortKey(group, key):
if key == 6:
return group.percentage
if key == 8:
return len(group)
return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

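The delta flag is what display_delta_values in app_cocoa.py toggles: for a dupe row, size, creation and modification become differences against the group's reference file instead of absolute values. A sketch with made-up numbers:

row = GetDisplayInfo(dupe, group, delta=True)
# if ref.size is 1024 and dupe.size is 1000, the Size column shows the -24
# delta, and timestamps go through format_time(t, FT_DECIMAL) rather than
# being formatted as absolute dates
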
py/data_me.py Normal file

@@ -0,0 +1,100 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.data_me
Created By: Virgil Dupras
Created On: 2006/03/15
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
from hsutil.str import format_time, FT_MINUTES, format_size
from .data import (format_path, format_timestamp, format_words, format_perc,
format_dupe_count, cmp_value)
COLUMNS = [
{'attr':'name','display':'Filename'},
{'attr':'path','display':'Directory'},
{'attr':'size','display':'Size (MB)'},
{'attr':'duration','display':'Time'},
{'attr':'bitrate','display':'Bitrate'},
{'attr':'samplerate','display':'Sample Rate'},
{'attr':'extension','display':'Kind'},
{'attr':'ctime','display':'Creation'},
{'attr':'mtime','display':'Modification'},
{'attr':'title','display':'Title'},
{'attr':'artist','display':'Artist'},
{'attr':'album','display':'Album'},
{'attr':'genre','display':'Genre'},
{'attr':'year','display':'Year'},
{'attr':'track','display':'Track Number'},
{'attr':'comment','display':'Comment'},
{'attr':'percentage','display':'Match %'},
{'attr':'words','display':'Words Used'},
{'attr':'dupe_count','display':'Dupe Count'},
]
def GetDisplayInfo(dupe, group, delta=False):
if (dupe is None) or (group is None):
return ['---'] * len(COLUMNS)
size = dupe.size
duration = dupe.duration
bitrate = dupe.bitrate
samplerate = dupe.samplerate
ctime = dupe.ctime
mtime = dupe.mtime
m = group.get_match_of(dupe)
if m:
percentage = m.percentage
dupe_count = 0
if delta:
r = group.ref
size -= r.size
duration -= r.duration
bitrate -= r.bitrate
samplerate -= r.samplerate
ctime -= r.ctime
mtime -= r.mtime
else:
percentage = group.percentage
dupe_count = len(group.dupes)
return [
dupe.name,
format_path(dupe.path),
format_size(size, 2, 2, False),
format_time(duration, FT_MINUTES),
str(bitrate),
str(samplerate),
dupe.extension,
format_timestamp(ctime,delta and m),
format_timestamp(mtime,delta and m),
dupe.title,
dupe.artist,
dupe.album,
dupe.genre,
dupe.year,
str(dupe.track),
dupe.comment,
format_perc(percentage),
format_words(dupe.words),
format_dupe_count(dupe_count)
]
def GetDupeSortKey(dupe, get_group, key, delta):
if key == 16:
m = get_group().get_match_of(dupe)
return m.percentage
if key == 18:
return 0
r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
if delta and (key in (2, 3, 4, 7, 8)):
r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
return r
def GetGroupSortKey(group, key):
if key == 16:
return group.percentage
if key == 18:
return len(group)
return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

py/data_pe.py Normal file

@@ -0,0 +1,77 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.data_pe
Created By: Virgil Dupras
Created On: 2006/03/15
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
from hsutil.str import format_size
from .data import format_path, format_timestamp, format_perc, format_dupe_count, cmp_value
def format_dimensions(dimensions):
return '%d x %d' % (dimensions[0], dimensions[1])
COLUMNS = [
{'attr':'name','display':'Filename'},
{'attr':'path','display':'Directory'},
{'attr':'size','display':'Size (KB)'},
{'attr':'extension','display':'Kind'},
{'attr':'dimensions','display':'Dimensions'},
{'attr':'ctime','display':'Creation'},
{'attr':'mtime','display':'Modification'},
{'attr':'percentage','display':'Match %'},
{'attr':'dupe_count','display':'Dupe Count'},
]
def GetDisplayInfo(dupe,group,delta=False):
if (dupe is None) or (group is None):
return ['---'] * len(COLUMNS)
size = dupe.size
ctime = dupe.ctime
mtime = dupe.mtime
m = group.get_match_of(dupe)
if m:
percentage = m.percentage
dupe_count = 0
if delta:
r = group.ref
size -= r.size
ctime -= r.ctime
mtime -= r.mtime
else:
percentage = group.percentage
dupe_count = len(group.dupes)
dupe_path = getattr(dupe, 'display_path', dupe.path)
return [
dupe.name,
format_path(dupe_path),
format_size(size, 0, 1, False),
dupe.extension,
format_dimensions(dupe.dimensions),
format_timestamp(ctime, delta and m),
format_timestamp(mtime, delta and m),
format_perc(percentage),
format_dupe_count(dupe_count)
]
def GetDupeSortKey(dupe, get_group, key, delta):
if key == 7:
m = get_group().get_match_of(dupe)
return m.percentage
if key == 8:
return 0
r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
if delta and (key in (2, 5, 6)):
r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
return r
def GetGroupSortKey(group, key):
if key == 7:
return group.percentage
if key == 8:
return len(group)
return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

py/directories.py Normal file

@@ -0,0 +1,161 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.directories
Created By: Virgil Dupras
Created On: 2006/02/27
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
$Revision: 4388 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import xml.dom.minidom
from hsfs import phys
import hsfs as fs
from hsutil.files import FileOrPath
from hsutil.path import Path
(STATE_NORMAL,
STATE_REFERENCE,
STATE_EXCLUDED) = range(3)
class AlreadyThereError(Exception):
"""The path being added is already in the directory list"""
class InvalidPathError(Exception):
"""The path being added is invalid"""
class Directories(object):
#---Override
def __init__(self):
self._dirs = []
self.states = {}
self.dirclass = phys.Directory
self.special_dirclasses = {}
def __contains__(self,path):
for d in self._dirs:
if path in d.path:
return True
return False
def __delitem__(self,key):
self._dirs.__delitem__(key)
def __getitem__(self,key):
return self._dirs.__getitem__(key)
def __len__(self):
return len(self._dirs)
#---Private
def _get_files(self, from_dir, state=STATE_NORMAL):
state = self.states.get(from_dir.path, state)
result = []
for subdir in from_dir.dirs:
for file in self._get_files(subdir, state):
yield file
if state != STATE_EXCLUDED:
for file in from_dir.files:
file.is_ref = state == STATE_REFERENCE
yield file
#---Public
def add_path(self, path):
"""Adds 'path' to self, if not already there.
Raises AlreadyThereError if 'path' is already in self. If path is a directory containing
some of the directories already present in self, 'path' will be added, but all directories
under it will be removed. Can also raise InvalidPathError if 'path' does not exist.
"""
if path in self:
raise AlreadyThereError
self._dirs = [d for d in self._dirs if d.path not in path]
try:
dirclass = self.special_dirclasses.get(path, self.dirclass)
d = dirclass(None, unicode(path))
d[:] #If an InvalidPath exception has to be raised, it will be raised here
self._dirs.append(d)
return d
except fs.InvalidPath:
raise InvalidPathError
def get_files(self):
"""Returns a list of all files that are not excluded.
Returned files also have their 'is_ref' attr set.
"""
for d in self._dirs:
d.force_update()
try:
for file in self._get_files(d):
yield file
except fs.InvalidPath:
pass
def GetState(self, path):
"""Returns the state of 'path' (One of the STATE_* const.)
Raises LookupError if 'path' is not in self.
"""
if path not in self:
raise LookupError("The path '%s' is not in the directory list." % str(path))
try:
return self.states[path]
except KeyError:
if path[-1].startswith('.'): # hidden
return STATE_EXCLUDED
parent = path[:-1]
if parent in self:
return self.GetState(parent)
else:
return STATE_NORMAL
def LoadFromFile(self,infile):
try:
doc = xml.dom.minidom.parse(infile)
except:
return
root_dir_nodes = doc.getElementsByTagName('root_directory')
for rdn in root_dir_nodes:
if not rdn.getAttributeNode('path'):
continue
path = rdn.getAttributeNode('path').nodeValue
try:
self.add_path(Path(path))
except (AlreadyThereError,InvalidPathError):
pass
state_nodes = doc.getElementsByTagName('state')
for sn in state_nodes:
if not (sn.getAttributeNode('path') and sn.getAttributeNode('value')):
continue
path = sn.getAttributeNode('path').nodeValue
state = sn.getAttributeNode('value').nodeValue
self.SetState(Path(path), int(state))
def Remove(self,directory):
self._dirs.remove(directory)
def SaveToFile(self,outfile):
with FileOrPath(outfile, 'wb') as fp:
doc = xml.dom.minidom.Document()
root = doc.appendChild(doc.createElement('directories'))
for root_dir in self:
root_dir_node = root.appendChild(doc.createElement('root_directory'))
root_dir_node.setAttribute('path', unicode(root_dir.path).encode('utf-8'))
for path,state in self.states.iteritems():
state_node = root.appendChild(doc.createElement('state'))
state_node.setAttribute('path', unicode(path).encode('utf-8'))
state_node.setAttribute('value', str(state))
doc.writexml(fp,'\t','\t','\n',encoding='utf-8')
def SetState(self,path,state):
try:
if self.GetState(path) == state:
return
self.states[path] = state
if (self.GetState(path[:-1]) == state) and (not path[-1].startswith('.')):
del self.states[path]
except LookupError:
pass

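Two details above are easy to miss: GetState recurses up the parent chain when a path has no explicit entry, so states are inherited, and SetState deletes entries that merely repeat the inherited value, which keeps self.states minimal. A sketch, assuming the path exists on disk:

from hsutil.path import Path
from directories import Directories, STATE_REFERENCE

d = Directories()
d.add_path(Path('/music'))                   # assumes /music exists
d.SetState(Path('/music'), STATE_REFERENCE)
d.GetState(Path('/music/album'))             # STATE_REFERENCE, inherited
d.SetState(Path('/music/album'), STATE_REFERENCE)  # redundant; pruned from d.states
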
py/directories_test.py Normal file

@@ -0,0 +1,280 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.tests.directories
Created By: Virgil Dupras
Created On: 2006/02/27
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-29 08:51:14 +0200 (Fri, 29 May 2009) $
$Revision: 4398 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import unittest
import os.path as op
import os
import time
import shutil
from hsutil import job, io
from hsutil.path import Path
from hsutil.testcase import TestCase
import hsfs.phys
from hsfs.phys import phys_test
from directories import *
testpath = Path(TestCase.datadirpath())
class TCDirectories(TestCase):
def test_empty(self):
d = Directories()
self.assertEqual(0,len(d))
self.assert_('foobar' not in d)
def test_add_path(self):
d = Directories()
p = testpath + 'utils'
added = d.add_path(p)
self.assertEqual(1,len(d))
self.assert_(p in d)
self.assert_((p + 'foobar') in d)
self.assert_(p[:-1] not in d)
self.assertEqual(p,added.path)
self.assert_(d[0] is added)
p = self.tmppath()
d.add_path(p)
self.assertEqual(2,len(d))
self.assert_(p in d)
def test_AddPath_when_path_is_already_there(self):
d = Directories()
p = testpath + 'utils'
d.add_path(p)
self.assertRaises(AlreadyThereError, d.add_path, p)
self.assertRaises(AlreadyThereError, d.add_path, p + 'foobar')
self.assertEqual(1, len(d))
def test_AddPath_containing_paths_already_there(self):
d = Directories()
d.add_path(testpath + 'utils')
self.assertEqual(1, len(d))
added = d.add_path(testpath)
self.assertEqual(1, len(d))
self.assert_(added is d[0])
def test_AddPath_non_latin(self):
p = Path(self.tmpdir())
to_add = p + u'unicode\u201a'
os.mkdir(unicode(to_add))
d = Directories()
try:
d.add_path(to_add)
except UnicodeDecodeError:
self.fail()
def test_del(self):
d = Directories()
d.add_path(testpath + 'utils')
try:
del d[1]
self.fail()
except IndexError:
pass
d.add_path(self.tmppath())
del d[1]
self.assertEqual(1, len(d))
def test_states(self):
d = Directories()
p = testpath + 'utils'
d.add_path(p)
self.assertEqual(STATE_NORMAL,d.GetState(p))
d.SetState(p,STATE_REFERENCE)
self.assertEqual(STATE_REFERENCE,d.GetState(p))
self.assertEqual(STATE_REFERENCE,d.GetState(p + 'dir1'))
self.assertEqual(1,len(d.states))
self.assertEqual(p,d.states.keys()[0])
self.assertEqual(STATE_REFERENCE,d.states[p])
def test_GetState_with_path_not_there(self):
d = Directories()
d.add_path(testpath + 'utils')
self.assertRaises(LookupError,d.GetState,testpath)
def test_states_remain_when_larger_directory_eat_smaller_ones(self):
d = Directories()
p = testpath + 'utils'
d.add_path(p)
d.SetState(p,STATE_EXCLUDED)
d.add_path(testpath)
d.SetState(testpath,STATE_REFERENCE)
self.assertEqual(STATE_EXCLUDED,d.GetState(p))
self.assertEqual(STATE_EXCLUDED,d.GetState(p + 'dir1'))
self.assertEqual(STATE_REFERENCE,d.GetState(testpath))
def test_SetState_keep_state_dict_size_to_minimum(self):
d = Directories()
p = Path(phys_test.create_fake_fs(self.tmpdir()))
d.add_path(p)
d.SetState(p,STATE_REFERENCE)
d.SetState(p + 'dir1',STATE_REFERENCE)
self.assertEqual(1,len(d.states))
self.assertEqual(STATE_REFERENCE,d.GetState(p + 'dir1'))
d.SetState(p + 'dir1',STATE_NORMAL)
self.assertEqual(2,len(d.states))
self.assertEqual(STATE_NORMAL,d.GetState(p + 'dir1'))
d.SetState(p + 'dir1',STATE_REFERENCE)
self.assertEqual(1,len(d.states))
self.assertEqual(STATE_REFERENCE,d.GetState(p + 'dir1'))
def test_get_files(self):
d = Directories()
p = Path(phys_test.create_fake_fs(self.tmpdir()))
d.add_path(p)
d.SetState(p + 'dir1',STATE_REFERENCE)
d.SetState(p + 'dir2',STATE_EXCLUDED)
files = d.get_files()
self.assertEqual(5, len(list(files)))
for f in files:
if f.parent.path == p + 'dir1':
self.assert_(f.is_ref)
else:
self.assert_(not f.is_ref)
def test_get_files_with_inherited_exclusion(self):
d = Directories()
p = testpath + 'utils'
d.add_path(p)
d.SetState(p,STATE_EXCLUDED)
self.assertEqual([], list(d.get_files()))
def test_save_and_load(self):
d1 = Directories()
d2 = Directories()
p1 = self.tmppath()
p2 = self.tmppath()
d1.add_path(p1)
d1.add_path(p2)
d1.SetState(p1, STATE_REFERENCE)
d1.SetState(p1 + 'dir1',STATE_EXCLUDED)
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
d1.SaveToFile(tmpxml)
d2.LoadFromFile(tmpxml)
self.assertEqual(2, len(d2))
self.assertEqual(STATE_REFERENCE,d2.GetState(p1))
self.assertEqual(STATE_EXCLUDED,d2.GetState(p1 + 'dir1'))
def test_invalid_path(self):
d = Directories()
p = Path('does_not_exist')
self.assertRaises(InvalidPathError, d.add_path, p)
self.assertEqual(0, len(d))
def test_SetState_on_invalid_path(self):
d = Directories()
try:
d.SetState(Path('foobar',),STATE_NORMAL)
except LookupError:
self.fail()
def test_default_dirclass(self):
self.assert_(Directories().dirclass is hsfs.phys.Directory)
def test_dirclass(self):
class MySpecialDirclass(hsfs.phys.Directory): pass
d = Directories()
d.dirclass = MySpecialDirclass
d.add_path(testpath)
self.assert_(isinstance(d[0], MySpecialDirclass))
def test_LoadFromFile_with_invalid_path(self):
        #This test simulates a load from file where add_path raises InvalidPath.
        #The other directories must still be loaded.
d1 = Directories()
d1.add_path(testpath + 'utils')
#Will raise InvalidPath upon loading
d1.add_path(self.tmppath()).name = 'does_not_exist'
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
d1.SaveToFile(tmpxml)
d2 = Directories()
d2.LoadFromFile(tmpxml)
self.assertEqual(1, len(d2))
def test_LoadFromFile_with_same_paths(self):
        #This test simulates a load from file where add_path raises AlreadyThereError.
        #The other directories must still be loaded.
d1 = Directories()
p1 = self.tmppath()
p2 = self.tmppath()
d1.add_path(p1)
d1.add_path(p2)
#Will raise AlreadyExists upon loading
d1.add_path(self.tmppath()).name = unicode(p1)
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
d1.SaveToFile(tmpxml)
d2 = Directories()
d2.LoadFromFile(tmpxml)
self.assertEqual(2, len(d2))
def test_Remove(self):
d = Directories()
d1 = d.add_path(self.tmppath())
d2 = d.add_path(self.tmppath())
d.Remove(d1)
self.assertEqual(1, len(d))
self.assert_(d[0] is d2)
def test_unicode_save(self):
d = Directories()
p1 = self.tmppath() + u'hello\xe9'
io.mkdir(p1)
io.mkdir(p1 + u'foo\xe9')
d.add_path(p1)
d.SetState(d[0][0].path, STATE_EXCLUDED)
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
try:
d.SaveToFile(tmpxml)
except UnicodeDecodeError:
self.fail()
def test_get_files_refreshes_its_directories(self):
d = Directories()
p = Path(phys_test.create_fake_fs(self.tmpdir()))
d.add_path(p)
files = d.get_files()
self.assertEqual(6, len(list(files)))
time.sleep(1)
os.remove(str(p + ('dir1','file1.test')))
files = d.get_files()
self.assertEqual(5, len(list(files)))
def test_get_files_does_not_choke_on_non_existing_directories(self):
d = Directories()
p = Path(self.tmpdir())
d.add_path(p)
io.rmtree(p)
self.assertEqual([], list(d.get_files()))
def test_GetState_returns_excluded_by_default_for_hidden_directories(self):
d = Directories()
p = Path(self.tmpdir())
hidden_dir_path = p + '.foo'
io.mkdir(p + '.foo')
d.add_path(p)
self.assertEqual(d.GetState(hidden_dir_path), STATE_EXCLUDED)
# But it can be overridden
d.SetState(hidden_dir_path, STATE_NORMAL)
self.assertEqual(d.GetState(hidden_dir_path), STATE_NORMAL)
def test_special_dirclasses(self):
# if a path is in special_dirclasses, use this class instead
class MySpecialDirclass(hsfs.phys.Directory): pass
d = Directories()
p1 = self.tmppath()
p2 = self.tmppath()
d.special_dirclasses[p1] = MySpecialDirclass
self.assert_(isinstance(d.add_path(p2), hsfs.phys.Directory))
self.assert_(isinstance(d.add_path(p1), MySpecialDirclass))
if __name__ == "__main__":
unittest.main()

360
py/engine.py Normal file

@@ -0,0 +1,360 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.engine
Created By: Virgil Dupras
Created On: 2006/01/29
Last modified by:$Author: virgil $
Last modified on:$Date: $
$Revision: $
Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
"""
from __future__ import division
import difflib
import logging
import string
from collections import defaultdict, namedtuple
from unicodedata import normalize
from hsutil.str import multi_replace
from hsutil import job
(WEIGHT_WORDS,
MATCH_SIMILAR_WORDS,
NO_FIELD_ORDER) = range(3)
JOB_REFRESH_RATE = 100
def getwords(s):
if isinstance(s, unicode):
s = normalize('NFD', s)
s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", ' ').lower()
s = ''.join(c for c in s if c in string.ascii_letters + string.digits + string.whitespace)
return filter(None, s.split(' ')) # filter() is to remove empty elements
def getfields(s):
fields = [getwords(field) for field in s.split(' - ')]
return filter(None, fields)
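# Illustrative examples (hedged; values mirror the engine_test.py cases):
# getwords(u'FOO_BAR!') -> ['foo', 'bar'] (punctuation becomes spaces, lowercased)
# getfields('a b - c d e') -> [['a', 'b'], ['c', 'd', 'e']] (' - ' separates fields)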
def unpack_fields(fields):
result = []
for field in fields:
if isinstance(field, list):
result += field
else:
result.append(field)
return result
def compare(first, second, flags=()):
"""Returns the % of words that match between first and second
The result is a int in the range 0..100.
First and second can be either a string or a list.
"""
if not (first and second):
return 0
if any(isinstance(element, list) for element in first):
return compare_fields(first, second, flags)
second = second[:] #We must use a copy of second because we remove items from it
match_similar = MATCH_SIMILAR_WORDS in flags
weight_words = WEIGHT_WORDS in flags
joined = first + second
total_count = (sum(len(word) for word in joined) if weight_words else len(joined))
match_count = 0
in_order = True
for word in first:
if match_similar and (word not in second):
similar = difflib.get_close_matches(word, second, 1, 0.8)
if similar:
word = similar[0]
if word in second:
if second[0] != word:
in_order = False
second.remove(word)
match_count += (len(word) if weight_words else 1)
result = round(((match_count * 2) / total_count) * 100)
if (result == 100) and (not in_order):
result = 99 # We cannot consider a match exact unless the ordering is the same
return result
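# Illustrative examples (hedged; values mirror engine_test.py):
# compare(['foo', 'bar'], ['bar', 'bleh']) -> 50 ('bar' matches: (1 * 2) / 4 words = 50%)
# compare(['a', 'b', 'c', 'd'], ['d', 'b', 'c', 'a']) -> 99 (same words, different order)
# compare(['foo', 'bar'], ['bar', 'bleh'], (WEIGHT_WORDS,)) -> 46 (6 matched chars out of 13)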
def compare_fields(first, second, flags=()):
"""Returns the score for the lowest matching fields.
first and second must be lists of lists of string.
"""
if len(first) != len(second):
return 0
if NO_FIELD_ORDER in flags:
results = []
#We don't want to remove field directly in the list. We must work on a copy.
second = second[:]
for field1 in first:
max = 0
matched_field = None
for field2 in second:
r = compare(field1, field2, flags)
if r > max:
max = r
matched_field = field2
results.append(max)
if matched_field:
second.remove(matched_field)
else:
results = [compare(word1, word2, flags) for word1, word2 in zip(first, second)]
return min(results) if results else 0
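# Illustrative example (hedged): the result is the *minimum* over field pairs:
# compare_fields([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]) -> 67
# (the fields score 100 and 67; min() gives 67)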
def build_word_dict(objects, j=job.nulljob):
"""Returns a dict of objects mapped by their words.
objects must have a 'words' attribute being a list of strings or a list of lists of strings.
The result is a dict with words as keys and sets of objects as values.
"""
result = defaultdict(set)
for object in j.iter_with_progress(objects, 'Prepared %d/%d files', JOB_REFRESH_RATE):
for word in unpack_fields(object.words):
result[word].add(object)
return result
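# Illustrative example (hedged): two objects o1 and o2 with words ['foo', 'bar']
# and ['bar', 'baz'] produce {'foo': {o1}, 'bar': {o1, o2}, 'baz': {o2}}.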
def merge_similar_words(word_dict):
"""Take all keys in word_dict that are similar, and merge them together.
"""
keys = word_dict.keys()
keys.sort(key=len)# we want the shortest word to stay
while keys:
key = keys.pop(0)
similars = difflib.get_close_matches(key, keys, 100, 0.8)
if not similars:
continue
objects = word_dict[key]
for similar in similars:
objects |= word_dict[similar]
del word_dict[similar]
keys.remove(similar)
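# Illustrative example (hedged; mirrors engine_test.py):
# {'foobar': set([1]), 'foobar1': set([2]), 'foobar2': set([3])} becomes
# {'foobar': set([1, 2, 3])} -- close keys are merged into the shortest one.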
def reduce_common_words(word_dict, threshold):
"""Remove all objects from word_dict values where the object count >= threshold
The exception to this removal are the objects where all the words of the object are common.
Because if we remove them, we will miss some duplicates!
"""
uncommon_words = set(word for word, objects in word_dict.items() if len(objects) < threshold)
for word, objects in word_dict.items():
if len(objects) < threshold:
continue
reduced = set()
for o in objects:
if not any(w in uncommon_words for w in unpack_fields(o.words)):
reduced.add(o)
if reduced:
word_dict[word] = reduced
else:
del word_dict[word]
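# Illustrative example (hedged): with threshold 50, a word mapping to 50 or more
# objects is dropped, but an object whose words are *all* common stays under its
# common words so it can still be matched (see engine_test.py).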
Match = namedtuple('Match', 'first second percentage')
def get_match(first, second, flags=()):
#it is assumed here that first and second both have a "words" attribute
percentage = compare(first.words, second.words, flags)
return Match(first, second, percentage)
class MatchFactory(object):
common_word_threshold = 50
match_similar_words = False
min_match_percentage = 0
weight_words = False
no_field_order = False
limit = 5000000
def getmatches(self, objects, j=job.nulljob):
j = j.start_subjob(2)
sj = j.start_subjob(2)
for o in objects:
if not hasattr(o, 'words'):
o.words = getwords(o.name)
word_dict = build_word_dict(objects, sj)
reduce_common_words(word_dict, self.common_word_threshold)
if self.match_similar_words:
merge_similar_words(word_dict)
match_flags = []
if self.weight_words:
match_flags.append(WEIGHT_WORDS)
if self.match_similar_words:
match_flags.append(MATCH_SIMILAR_WORDS)
if self.no_field_order:
match_flags.append(NO_FIELD_ORDER)
j.start_job(len(word_dict), '0 matches found')
compared = defaultdict(set)
result = []
try:
# This whole 'popping' thing is there to avoid taking too much memory at the same time.
while word_dict:
items = word_dict.popitem()[1]
while items:
ref = items.pop()
compared_already = compared[ref]
to_compare = items - compared_already
compared_already |= to_compare
for other in to_compare:
m = get_match(ref, other, match_flags)
if m.percentage >= self.min_match_percentage:
result.append(m)
if len(result) >= self.limit:
return result
j.add_progress(desc='%d matches found' % len(result))
except MemoryError:
# This is the place where the memory usage is at its peak during the scan.
# Just continue the process with an incomplete list of matches.
del compared # This should give us enough room to call logging.
logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
return result
return result
class Group(object):
#---Override
def __init__(self):
self._clear()
def __contains__(self, item):
return item in self.unordered
def __getitem__(self, key):
return self.ordered.__getitem__(key)
def __iter__(self):
return iter(self.ordered)
def __len__(self):
return len(self.ordered)
#---Private
def _clear(self):
self._percentage = None
self._matches_for_ref = None
self.matches = set()
self.candidates = defaultdict(set)
self.ordered = []
self.unordered = set()
def _get_matches_for_ref(self):
if self._matches_for_ref is None:
ref = self.ref
self._matches_for_ref = [match for match in self.matches if ref in match]
return self._matches_for_ref
#---Public
def add_match(self, match):
def add_candidate(item, match):
matches = self.candidates[item]
matches.add(match)
if self.unordered <= matches:
self.ordered.append(item)
self.unordered.add(item)
if match in self.matches:
return
self.matches.add(match)
first, second, _ = match
if first not in self.unordered:
add_candidate(first, second)
if second not in self.unordered:
add_candidate(second, first)
self._percentage = None
self._matches_for_ref = None
def clean_matches(self):
self.matches = set(m for m in self.matches if (m.first in self.unordered) and (m.second in self.unordered))
self.candidates = defaultdict(set)
def get_match_of(self, item):
if item is self.ref:
return
for m in self._get_matches_for_ref():
if item in m:
return m
def prioritize(self, key_func, tie_breaker=None):
# tie_breaker(ref, dupe) --> True if dupe should be ref
self.ordered.sort(key=key_func)
if tie_breaker is None:
return
ref = self.ref
key_value = key_func(ref)
for dupe in self.dupes:
if key_func(dupe) != key_value:
break
if tie_breaker(ref, dupe):
ref = dupe
if ref is not self.ref:
self.switch_ref(ref)
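# Illustrative example (hedged; mirrors engine_test.py): with members named
# 'c', 'b', 'a', g.prioritize(lambda d: d.name) makes 'a' the new ref; an
# optional tie_breaker(ref, dupe) -> bool can then promote a dupe tied with it.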
def remove_dupe(self, item, clean_matches=True):
try:
self.ordered.remove(item)
self.unordered.remove(item)
self._percentage = None
self._matches_for_ref = None
if (len(self) > 1) and any(not getattr(item, 'is_ref', False) for item in self):
if clean_matches:
self.matches = set(m for m in self.matches if item not in m)
else:
self._clear()
except ValueError:
pass
def switch_ref(self, with_dupe):
try:
self.ordered.remove(with_dupe)
self.ordered.insert(0, with_dupe)
self._percentage = None
self._matches_for_ref = None
except ValueError:
pass
dupes = property(lambda self: self[1:])
@property
def percentage(self):
if self._percentage is None:
if self.dupes:
matches = self._get_matches_for_ref()
self._percentage = sum(match.percentage for match in matches) // len(matches)
else:
self._percentage = 0
return self._percentage
@property
def ref(self):
if self:
return self[0]
def get_groups(matches, j=job.nulljob):
matches.sort(key=lambda match: -match.percentage)
dupe2group = {}
groups = []
for match in j.iter_with_progress(matches, 'Grouped %d/%d matches', JOB_REFRESH_RATE):
first, second, _ = match
first_group = dupe2group.get(first)
second_group = dupe2group.get(second)
if first_group:
if second_group:
if first_group is second_group:
target_group = first_group
else:
continue
else:
target_group = first_group
dupe2group[second] = target_group
else:
if second_group:
target_group = second_group
dupe2group[first] = target_group
else:
target_group = Group()
groups.append(target_group)
dupe2group[first] = target_group
dupe2group[second] = target_group
target_group.add_match(match)
for group in groups:
group.clean_matches()
return groups
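# Hedged end-to-end sketch (mirrors engine_test.py): anything with a 'name'
# attribute can be matched; getmatches() fills in a 'words' attribute as needed.
# The File class below is a hypothetical stand-in, not part of this module.
#
# class File(object):
#     def __init__(self, name):
#         self.name = name
#
# files = [File('foo'), File('foo'), File('foo')]
# matches = MatchFactory().getmatches(files) # 3 matches, all at 100%
# groups = get_groups(matches)               # 1 group of 3 (one ref + 2 dupes)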

822
py/engine_test.py Normal file

@@ -0,0 +1,822 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.engine_test
Created By: Virgil Dupras
Created On: 2006/01/29
Last modified by:$Author: virgil $
Last modified on:$Date: $
$Revision: $
Copyright 2004-2008 Hardcoded Software (http://www.hardcoded.net)
"""
import unittest
import sys
from hsutil import job
from hsutil.decorators import log_calls
from hsutil.testcase import TestCase
from . import engine
from .engine import *
class NamedObject(object):
def __init__(self, name="foobar", with_words=False):
self.name = name
if with_words:
self.words = getwords(name)
def get_match_triangle():
o1 = NamedObject(with_words=True)
o2 = NamedObject(with_words=True)
o3 = NamedObject(with_words=True)
m1 = get_match(o1,o2)
m2 = get_match(o1,o3)
m3 = get_match(o2,o3)
return [m1, m2, m3]
def get_test_group():
m1, m2, m3 = get_match_triangle()
result = Group()
result.add_match(m1)
result.add_match(m2)
result.add_match(m3)
return result
class TCgetwords(TestCase):
def test_spaces(self):
self.assertEqual(['a', 'b', 'c', 'd'], getwords("a b c d"))
self.assertEqual(['a', 'b', 'c', 'd'], getwords(" a b c d "))
def test_splitter_chars(self):
self.assertEqual(
[chr(i) for i in xrange(ord('a'),ord('z')+1)],
getwords("a-b_c&d+e(f)g;h\\i[j]k{l}m:n.o,p<q>r/s?t~u!v@w#x$y*z")
)
def test_joiner_chars(self):
self.assertEqual(["aec"], getwords(u"a'e\u0301c"))
def test_empty(self):
self.assertEqual([], getwords(''))
def test_returns_lowercase(self):
self.assertEqual(['foo', 'bar'], getwords('FOO BAR'))
def test_decompose_unicode(self):
self.assertEqual(getwords(u'foo\xe9bar'), ['fooebar'])
class TCgetfields(TestCase):
def test_simple(self):
self.assertEqual([['a', 'b'], ['c', 'd', 'e']], getfields('a b - c d e'))
def test_empty(self):
self.assertEqual([], getfields(''))
def test_cleans_empty_fields(self):
expected = [['a', 'bc', 'def']]
actual = getfields(' - a bc def')
self.assertEqual(expected, actual)
expected = [['bc', 'def']]
class TCunpack_fields(TestCase):
def test_with_fields(self):
expected = ['a', 'b', 'c', 'd', 'e', 'f']
actual = unpack_fields([['a'], ['b', 'c'], ['d', 'e', 'f']])
self.assertEqual(expected, actual)
def test_without_fields(self):
expected = ['a', 'b', 'c', 'd', 'e', 'f']
actual = unpack_fields(['a', 'b', 'c', 'd', 'e', 'f'])
self.assertEqual(expected, actual)
def test_empty(self):
self.assertEqual([], unpack_fields([]))
class TCWordCompare(TestCase):
def test_list(self):
self.assertEqual(100, compare(['a', 'b', 'c', 'd'],['a', 'b', 'c', 'd']))
self.assertEqual(86, compare(['a', 'b', 'c', 'd'],['a', 'b', 'c']))
def test_unordered(self):
#Sometimes, users don't want too much fuzzy matching. When they set the slider
#to 100, they don't expect a filename with the same words, but not in the same order, to match.
#Thus, we want to return 99 in that case.
self.assertEqual(99, compare(['a', 'b', 'c', 'd'], ['d', 'b', 'c', 'a']))
def test_word_occurs_twice(self):
#if a word occurs twice in first, but once in second, we want the word to be only counted once
self.assertEqual(89, compare(['a', 'b', 'c', 'd', 'a'], ['d', 'b', 'c', 'a']))
def test_uses_copy_of_lists(self):
first = ['foo', 'bar']
second = ['bar', 'bleh']
compare(first, second)
self.assertEqual(['foo', 'bar'], first)
self.assertEqual(['bar', 'bleh'], second)
def test_word_weight(self):
self.assertEqual(int((6.0 / 13.0) * 100), compare(['foo', 'bar'], ['bar', 'bleh'], (WEIGHT_WORDS, )))
def test_similar_words(self):
self.assertEqual(100, compare(['the', 'white', 'stripes'],['the', 'whites', 'stripe'], (MATCH_SIMILAR_WORDS, )))
def test_empty(self):
self.assertEqual(0, compare([], []))
def test_with_fields(self):
self.assertEqual(67, compare([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]))
def test_propagate_flags_with_fields(self):
def mock_compare(first, second, flags):
self.assertEqual((0, 1, 2, 3, 5), flags)
self.mock(engine, 'compare_fields', mock_compare)
compare([['a']], [['a']], (0, 1, 2, 3, 5))
class TCWordCompareWithFields(TestCase):
def test_simple(self):
self.assertEqual(67, compare_fields([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]))
def test_empty(self):
self.assertEqual(0, compare_fields([], []))
def test_different_length(self):
self.assertEqual(0, compare_fields([['a'], ['b']], [['a'], ['b'], ['c']]))
def test_propagates_flags(self):
def mock_compare(first, second, flags):
self.assertEqual((0, 1, 2, 3, 5), flags)
self.mock(engine, 'compare', mock_compare)
compare_fields([['a']], [['a']],(0, 1, 2, 3, 5))
def test_order(self):
first = [['a', 'b'], ['c', 'd', 'e']]
second = [['c', 'd', 'f'], ['a', 'b']]
self.assertEqual(0, compare_fields(first, second))
def test_no_order(self):
first = [['a','b'],['c','d','e']]
second = [['c','d','f'],['a','b']]
self.assertEqual(67, compare_fields(first, second, (NO_FIELD_ORDER, )))
first = [['a','b'],['a','b']] #a field can only be matched once.
second = [['c','d','f'],['a','b']]
self.assertEqual(0, compare_fields(first, second, (NO_FIELD_ORDER, )))
first = [['a','b'],['a','b','c']]
second = [['c','d','f'],['a','b']]
self.assertEqual(33, compare_fields(first, second, (NO_FIELD_ORDER, )))
def test_compare_fields_without_order_doesnt_alter_fields(self):
#The NO_ORDER comp type altered the fields!
first = [['a','b'],['c','d','e']]
second = [['c','d','f'],['a','b']]
self.assertEqual(67, compare_fields(first, second, (NO_FIELD_ORDER, )))
self.assertEqual([['a','b'],['c','d','e']],first)
self.assertEqual([['c','d','f'],['a','b']],second)
class TCbuild_word_dict(TestCase):
def test_with_standard_words(self):
l = [NamedObject('foo bar',True)]
l.append(NamedObject('bar baz',True))
l.append(NamedObject('baz bleh foo',True))
d = build_word_dict(l)
self.assertEqual(4,len(d))
self.assertEqual(2,len(d['foo']))
self.assert_(l[0] in d['foo'])
self.assert_(l[2] in d['foo'])
self.assertEqual(2,len(d['bar']))
self.assert_(l[0] in d['bar'])
self.assert_(l[1] in d['bar'])
self.assertEqual(2,len(d['baz']))
self.assert_(l[1] in d['baz'])
self.assert_(l[2] in d['baz'])
self.assertEqual(1,len(d['bleh']))
self.assert_(l[2] in d['bleh'])
def test_unpack_fields(self):
o = NamedObject('')
o.words = [['foo','bar'],['baz']]
d = build_word_dict([o])
self.assertEqual(3,len(d))
self.assertEqual(1,len(d['foo']))
def test_words_are_unaltered(self):
o = NamedObject('')
o.words = [['foo','bar'],['baz']]
d = build_word_dict([o])
self.assertEqual([['foo','bar'],['baz']],o.words)
def test_object_instances_can_only_be_once_in_words_object_list(self):
o = NamedObject('foo foo',True)
d = build_word_dict([o])
self.assertEqual(1,len(d['foo']))
def test_job(self):
def do_progress(p,d=''):
self.log.append(p)
return True
j = job.Job(1,do_progress)
self.log = []
s = "foo bar"
build_word_dict([NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j)
self.assertEqual(0,self.log[0])
self.assertEqual(33,self.log[1])
self.assertEqual(66,self.log[2])
self.assertEqual(100,self.log[3])
class TCmerge_similar_words(TestCase):
def test_some_similar_words(self):
d = {
'foobar':set([1]),
'foobar1':set([2]),
'foobar2':set([3]),
}
merge_similar_words(d)
self.assertEqual(1,len(d))
self.assertEqual(3,len(d['foobar']))
class TCreduce_common_words(TestCase):
def test_typical(self):
d = {
'foo': set([NamedObject('foo bar',True) for i in range(50)]),
'bar': set([NamedObject('foo bar',True) for i in range(49)])
}
reduce_common_words(d, 50)
self.assert_('foo' not in d)
self.assertEqual(49,len(d['bar']))
def test_dont_remove_objects_with_only_common_words(self):
d = {
'common': set([NamedObject("common uncommon",True) for i in range(50)] + [NamedObject("common",True)]),
'uncommon': set([NamedObject("common uncommon",True)])
}
reduce_common_words(d, 50)
self.assertEqual(1,len(d['common']))
self.assertEqual(1,len(d['uncommon']))
def test_values_still_are_set_instances(self):
d = {
'common': set([NamedObject("common uncommon",True) for i in range(50)] + [NamedObject("common",True)]),
'uncommon': set([NamedObject("common uncommon",True)])
}
reduce_common_words(d, 50)
self.assert_(isinstance(d['common'],set))
self.assert_(isinstance(d['uncommon'],set))
def test_dont_raise_KeyError_when_a_word_has_been_removed(self):
#If a word has been removed by the reduction, an object listed under a subsequent
#common word that still contains the removed word would cause a KeyError.
d = {
'foo': set([NamedObject('foo bar baz',True) for i in range(50)]),
'bar': set([NamedObject('foo bar baz',True) for i in range(50)]),
'baz': set([NamedObject('foo bar baz',True) for i in range(49)])
}
try:
reduce_common_words(d, 50)
except KeyError:
self.fail()
def test_unpack_fields(self):
#object.words may be fields.
def create_it():
o = NamedObject('')
o.words = [['foo','bar'],['baz']]
return o
d = {
'foo': set([create_it() for i in range(50)])
}
try:
reduce_common_words(d, 50)
except TypeError:
self.fail("must support fields.")
def test_consider_a_reduced_common_word_common_even_after_reduction(self):
#There was a bug in the code that caused a word that had already been reduced not to
#be counted as a common word for subsequent words. For example, if 'foo' is processed
#as a common word, keeping a "foo bar" file in it, and 'bar' is then processed, "foo bar"
#would not stay in 'bar' because 'foo' is not a common word anymore.
only_common = NamedObject('foo bar',True)
d = {
'foo': set([NamedObject('foo bar baz',True) for i in range(49)] + [only_common]),
'bar': set([NamedObject('foo bar baz',True) for i in range(49)] + [only_common]),
'baz': set([NamedObject('foo bar baz',True) for i in range(49)])
}
reduce_common_words(d, 50)
self.assertEqual(1,len(d['foo']))
self.assertEqual(1,len(d['bar']))
self.assertEqual(49,len(d['baz']))
class TCget_match(TestCase):
def test_simple(self):
o1 = NamedObject("foo bar",True)
o2 = NamedObject("bar bleh",True)
m = get_match(o1,o2)
self.assertEqual(50,m.percentage)
self.assertEqual(['foo','bar'],m.first.words)
self.assertEqual(['bar','bleh'],m.second.words)
self.assert_(m.first is o1)
self.assert_(m.second is o2)
def test_in(self):
o1 = NamedObject("foo",True)
o2 = NamedObject("bar",True)
m = get_match(o1,o2)
self.assert_(o1 in m)
self.assert_(o2 in m)
self.assert_(object() not in m)
def test_word_weight(self):
self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)
class TCMatchFactory(TestCase):
def test_empty(self):
self.assertEqual([],MatchFactory().getmatches([]))
def test_defaults(self):
mf = MatchFactory()
self.assertEqual(50,mf.common_word_threshold)
self.assertEqual(False,mf.weight_words)
self.assertEqual(False,mf.match_similar_words)
self.assertEqual(False,mf.no_field_order)
self.assertEqual(0,mf.min_match_percentage)
def test_simple(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
r = MatchFactory().getmatches(l)
self.assertEqual(2,len(r))
seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
m = seek[0]
self.assertEqual(['foo','bar'],m.first.words)
self.assertEqual(['bar','bleh'],m.second.words)
seek = [m for m in r if m.percentage == 33] #"foo bar" and "a b c foo"
m = seek[0]
self.assertEqual(['foo','bar'],m.first.words)
self.assertEqual(['a','b','c','foo'],m.second.words)
def test_null_and_unrelated_objects(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
r = MatchFactory().getmatches(l)
self.assertEqual(1,len(r))
m = r[0]
self.assertEqual(50,m.percentage)
self.assertEqual(['foo','bar'],m.first.words)
self.assertEqual(['bar','bleh'],m.second.words)
def test_twice_the_same_word(self):
l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
r = MatchFactory().getmatches(l)
self.assertEqual(1,len(r))
def test_twice_the_same_word_when_preworded(self):
l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
r = MatchFactory().getmatches(l)
self.assertEqual(1,len(r))
def test_two_words_match(self):
l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
r = MatchFactory().getmatches(l)
self.assertEqual(1,len(r))
def test_match_files_with_only_common_words(self):
#If a word occurs 50 times or more, it is excluded from the matching process
#The problem with the common_word_threshold is that the files containing only common
#words will never be matched together. We *should* match them.
mf = MatchFactory()
mf.common_word_threshold = 50
l = [NamedObject("foo") for i in range(50)]
r = mf.getmatches(l)
self.assertEqual(1225,len(r))
def test_use_words_already_there_if_there(self):
o1 = NamedObject('foo')
o2 = NamedObject('bar')
o2.words = ['foo']
self.assertEqual(1,len(MatchFactory().getmatches([o1,o2])))
def test_job(self):
def do_progress(p,d=''):
self.log.append(p)
return True
j = job.Job(1,do_progress)
self.log = []
s = "foo bar"
MatchFactory().getmatches([NamedObject(s),NamedObject(s),NamedObject(s)],j)
self.assert_(len(self.log) > 2)
self.assertEqual(0,self.log[0])
self.assertEqual(100,self.log[-1])
def test_weight_words(self):
mf = MatchFactory()
mf.weight_words = True
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
m = mf.getmatches(l)[0]
self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)
def test_similar_word(self):
mf = MatchFactory()
mf.match_similar_words = True
l = [NamedObject("foobar"),NamedObject("foobars")]
self.assertEqual(1,len(mf.getmatches(l)))
self.assertEqual(100,mf.getmatches(l)[0].percentage)
l = [NamedObject("foobar"),NamedObject("foo")]
self.assertEqual(0,len(mf.getmatches(l))) #too far
l = [NamedObject("bizkit"),NamedObject("bizket")]
self.assertEqual(1,len(mf.getmatches(l)))
l = [NamedObject("foobar"),NamedObject("foosbar")]
self.assertEqual(1,len(mf.getmatches(l)))
def test_single_object_with_similar_words(self):
mf = MatchFactory()
mf.match_similar_words = True
l = [NamedObject("foo foos")]
self.assertEqual(0,len(mf.getmatches(l)))
def test_double_words_get_counted_only_once(self):
mf = MatchFactory()
l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
m = mf.getmatches(l)[0]
self.assertEqual(75,m.percentage)
def test_with_fields(self):
mf = MatchFactory()
o1 = NamedObject("foo bar - foo bleh")
o2 = NamedObject("foo bar - bleh bar")
o1.words = getfields(o1.name)
o2.words = getfields(o2.name)
m = mf.getmatches([o1, o2])[0]
self.assertEqual(50, m.percentage)
def test_with_fields_no_order(self):
mf = MatchFactory()
mf.no_field_order = True
o1 = NamedObject("foo bar - foo bleh")
o2 = NamedObject("bleh bang - foo bar")
o1.words = getfields(o1.name)
o2.words = getfields(o2.name)
m = mf.getmatches([o1, o2])[0]
self.assertEqual(50 ,m.percentage)
def test_only_match_similar_when_the_option_is_set(self):
mf = MatchFactory()
mf.match_similar_words = False
l = [NamedObject("foobar"),NamedObject("foobars")]
self.assertEqual(0,len(mf.getmatches(l)))
def test_dont_recurse_do_match(self):
# under nosetests, the stack is already deeper, so the limit has to be high enough not to fail falsely
sys.setrecursionlimit(100)
mf = MatchFactory()
files = [NamedObject('foo bar') for i in range(101)]
try:
mf.getmatches(files)
except RuntimeError:
self.fail()
finally:
sys.setrecursionlimit(1000)
def test_min_match_percentage(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
mf = MatchFactory()
mf.min_match_percentage = 50
r = mf.getmatches(l)
self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match
def test_limit(self):
l = [NamedObject(),NamedObject(),NamedObject()]
mf = MatchFactory()
mf.limit = 2
r = mf.getmatches(l)
self.assertEqual(2,len(r))
def test_MemoryError(self):
@log_calls
def mocked_match(first, second, flags):
if len(mocked_match.calls) > 42:
raise MemoryError()
return Match(first, second, 0)
objects = [NamedObject() for i in range(10)] # results in 45 matches
self.mock(engine, 'get_match', mocked_match)
mf = MatchFactory()
try:
r = mf.getmatches(objects)
except MemoryError:
self.fail('MemoryError must be handled')
self.assertEqual(42, len(r))
class TCGroup(TestCase):
def test_empty(self):
g = Group()
self.assertEqual(None,g.ref)
self.assertEqual([],g.dupes)
self.assertEqual(0,len(g.matches))
def test_add_match(self):
g = Group()
m = get_match(NamedObject("foo",True),NamedObject("bar",True))
g.add_match(m)
self.assert_(g.ref is m.first)
self.assertEqual([m.second],g.dupes)
self.assertEqual(1,len(g.matches))
self.assert_(m in g.matches)
def test_multiple_add_match(self):
g = Group()
o1 = NamedObject("a",True)
o2 = NamedObject("b",True)
o3 = NamedObject("c",True)
o4 = NamedObject("d",True)
g.add_match(get_match(o1,o2))
self.assert_(g.ref is o1)
self.assertEqual([o2],g.dupes)
self.assertEqual(1,len(g.matches))
g.add_match(get_match(o1,o3))
self.assertEqual([o2],g.dupes)
self.assertEqual(2,len(g.matches))
g.add_match(get_match(o2,o3))
self.assertEqual([o2,o3],g.dupes)
self.assertEqual(3,len(g.matches))
g.add_match(get_match(o1,o4))
self.assertEqual([o2,o3],g.dupes)
self.assertEqual(4,len(g.matches))
g.add_match(get_match(o2,o4))
self.assertEqual([o2,o3],g.dupes)
self.assertEqual(5,len(g.matches))
g.add_match(get_match(o3,o4))
self.assertEqual([o2,o3,o4],g.dupes)
self.assertEqual(6,len(g.matches))
def test_len(self):
g = Group()
self.assertEqual(0,len(g))
g.add_match(get_match(NamedObject("foo",True),NamedObject("bar",True)))
self.assertEqual(2,len(g))
def test_add_same_match_twice(self):
g = Group()
m = get_match(NamedObject("foo",True),NamedObject("foo",True))
g.add_match(m)
self.assertEqual(2,len(g))
self.assertEqual(1,len(g.matches))
g.add_match(m)
self.assertEqual(2,len(g))
self.assertEqual(1,len(g.matches))
def test_in(self):
g = Group()
o1 = NamedObject("foo",True)
o2 = NamedObject("bar",True)
self.assert_(o1 not in g)
g.add_match(get_match(o1,o2))
self.assert_(o1 in g)
self.assert_(o2 in g)
def test_remove(self):
g = Group()
o1 = NamedObject("foo",True)
o2 = NamedObject("bar",True)
o3 = NamedObject("bleh",True)
g.add_match(get_match(o1,o2))
g.add_match(get_match(o1,o3))
g.add_match(get_match(o2,o3))
self.assertEqual(3,len(g.matches))
self.assertEqual(3,len(g))
g.remove_dupe(o3)
self.assertEqual(1,len(g.matches))
self.assertEqual(2,len(g))
g.remove_dupe(o1)
self.assertEqual(0,len(g.matches))
self.assertEqual(0,len(g))
def test_remove_with_ref_dupes(self):
g = Group()
o1 = NamedObject("foo",True)
o2 = NamedObject("bar",True)
o3 = NamedObject("bleh",True)
g.add_match(get_match(o1,o2))
g.add_match(get_match(o1,o3))
g.add_match(get_match(o2,o3))
o1.is_ref = True
o2.is_ref = True
g.remove_dupe(o3)
self.assertEqual(0,len(g))
def test_switch_ref(self):
o1 = NamedObject(with_words=True)
o2 = NamedObject(with_words=True)
g = Group()
g.add_match(get_match(o1,o2))
self.assert_(o1 is g.ref)
g.switch_ref(o2)
self.assert_(o2 is g.ref)
self.assertEqual([o1],g.dupes)
g.switch_ref(o2)
self.assert_(o2 is g.ref)
g.switch_ref(NamedObject('',True))
self.assert_(o2 is g.ref)
def test_get_match_of(self):
g = Group()
for m in get_match_triangle():
g.add_match(m)
o = g.dupes[0]
m = g.get_match_of(o)
self.assert_(g.ref in m)
self.assert_(o in m)
self.assert_(g.get_match_of(NamedObject('',True)) is None)
self.assert_(g.get_match_of(g.ref) is None)
def test_percentage(self):
#percentage should return the avg percentage in relation to the ref
m1,m2,m3 = get_match_triangle()
m1 = Match(m1[0], m1[1], 100)
m2 = Match(m2[0], m2[1], 50)
m3 = Match(m3[0], m3[1], 33)
g = Group()
g.add_match(m1)
g.add_match(m2)
g.add_match(m3)
self.assertEqual(75,g.percentage)
g.switch_ref(g.dupes[0])
self.assertEqual(66,g.percentage)
g.remove_dupe(g.dupes[0])
self.assertEqual(33,g.percentage)
g.add_match(m1)
g.add_match(m2)
self.assertEqual(66,g.percentage)
def test_percentage_on_empty_group(self):
g = Group()
self.assertEqual(0,g.percentage)
def test_prioritize(self):
m1,m2,m3 = get_match_triangle()
o1 = m1.first
o2 = m1.second
o3 = m2.second
o1.name = 'c'
o2.name = 'b'
o3.name = 'a'
g = Group()
g.add_match(m1)
g.add_match(m2)
g.add_match(m3)
self.assert_(o1 is g.ref)
g.prioritize(lambda x:x.name)
self.assert_(o3 is g.ref)
def test_prioritize_with_tie_breaker(self):
# if the ref has the same key as one or more of the dupes, run the tie_breaker func among them
g = get_test_group()
o1, o2, o3 = g.ordered
tie_breaker = lambda ref, dupe: dupe is o3
g.prioritize(lambda x:0, tie_breaker)
self.assertTrue(g.ref is o3)
def test_prioritize_with_tie_breaker_runs_on_all_dupes(self):
# Even if a dupe is chosen to switch with ref with a tie breaker, we still run the tie breaker
# with other dupes and the newly chosen ref
g = get_test_group()
o1, o2, o3 = g.ordered
o1.foo = 1
o2.foo = 2
o3.foo = 3
tie_breaker = lambda ref, dupe: dupe.foo > ref.foo
g.prioritize(lambda x:0, tie_breaker)
self.assertTrue(g.ref is o3)
def test_prioritize_with_tie_breaker_runs_only_on_tie_dupes(self):
# The tie breaker only runs on dupes that had the same value for the key_func
g = get_test_group()
o1, o2, o3 = g.ordered
o1.foo = 2
o2.foo = 2
o3.foo = 1
o1.bar = 1
o2.bar = 2
o3.bar = 3
key_func = lambda x: -x.foo
tie_breaker = lambda ref, dupe: dupe.bar > ref.bar
g.prioritize(key_func, tie_breaker)
self.assertTrue(g.ref is o2)
def test_list_like(self):
g = Group()
o1,o2 = (NamedObject("foo",True),NamedObject("bar",True))
g.add_match(get_match(o1,o2))
self.assert_(g[0] is o1)
self.assert_(g[1] is o2)
def test_clean_matches(self):
g = Group()
o1,o2,o3 = (NamedObject("foo",True),NamedObject("bar",True),NamedObject("baz",True))
g.add_match(get_match(o1,o2))
g.add_match(get_match(o1,o3))
g.clean_matches()
self.assertEqual(1,len(g.matches))
self.assertEqual(0,len(g.candidates))
class TCget_groups(TestCase):
def test_empty(self):
r = get_groups([])
self.assertEqual([],r)
def test_simple(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
matches = MatchFactory().getmatches(l)
m = matches[0]
r = get_groups(matches)
self.assertEqual(1,len(r))
g = r[0]
self.assert_(g.ref is m.first)
self.assertEqual([m.second],g.dupes)
def test_group_with_multiple_matches(self):
#This results in 3 matches
l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
matches = MatchFactory().getmatches(l)
r = get_groups(matches)
self.assertEqual(1,len(r))
g = r[0]
self.assertEqual(3,len(g))
def test_must_choose_a_group(self):
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
#There will be 2 groups here: group "a b" and group "c d"
#"b c" can go either of them, but not both.
matches = MatchFactory().getmatches(l)
r = get_groups(matches)
self.assertEqual(2,len(r))
self.assertEqual(5,len(r[0])+len(r[1]))
def test_should_all_go_in_the_same_group(self):
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
#All four "a b" objects match each other, so they must all end up in the same group.
matches = MatchFactory().getmatches(l)
r = get_groups(matches)
self.assertEqual(1,len(r))
def test_give_priority_to_matches_with_higher_percentage(self):
o1 = NamedObject(with_words=True)
o2 = NamedObject(with_words=True)
o3 = NamedObject(with_words=True)
m1 = Match(o1, o2, 1)
m2 = Match(o2, o3, 2)
r = get_groups([m1,m2])
self.assertEqual(1,len(r))
g = r[0]
self.assertEqual(2,len(g))
self.assert_(o1 not in g)
self.assert_(o2 in g)
self.assert_(o3 in g)
def test_four_sized_group(self):
l = [NamedObject("foobar") for i in xrange(4)]
m = MatchFactory().getmatches(l)
r = get_groups(m)
self.assertEqual(1,len(r))
self.assertEqual(4,len(r[0]))
def test_referenced_by_ref2(self):
o1 = NamedObject(with_words=True)
o2 = NamedObject(with_words=True)
o3 = NamedObject(with_words=True)
m1 = get_match(o1,o2)
m2 = get_match(o3,o1)
m3 = get_match(o3,o2)
r = get_groups([m1,m2,m3])
self.assertEqual(3,len(r[0]))
def test_job(self):
def do_progress(p,d=''):
self.log.append(p)
return True
self.log = []
j = job.Job(1,do_progress)
m1,m2,m3 = get_match_triangle()
#101%: To make sure it is processed first so the job test works correctly
m4 = Match(NamedObject('a',True), NamedObject('a',True), 101)
get_groups([m1,m2,m3,m4],j)
self.assertEqual(0,self.log[0])
self.assertEqual(100,self.log[-1])
if __name__ == "__main__":
unittest.main()

67
py/export.py Normal file

@@ -0,0 +1,67 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.export
Created By: Virgil Dupras
Created On: 2006/09/16
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
from xml.dom import minidom
import tempfile
import os.path as op
import os
from StringIO import StringIO
from hsutil.files import FileOrPath
def output_column_xml(outfile, columns):
"""Creates a xml file outfile with the supplied columns.
outfile can be a filename or a file object.
columns is a list of 2 sized tuples (display,enabled)
"""
doc = minidom.Document()
root = doc.appendChild(doc.createElement('columns'))
for display,enabled in columns:
col_node = root.appendChild(doc.createElement('column'))
col_node.setAttribute('display', display)
col_node.setAttribute('enabled', {True:'y',False:'n'}[enabled])
with FileOrPath(outfile, 'wb') as fp:
doc.writexml(fp, '\t','\t','\n', encoding='utf-8')
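# Illustrative example (hedged): writes <columns> with one <column> per tuple,
# e.g. output_column_xml('columns.xml', [('Name', True), ('Size', False)])
# yields <column display="Name" enabled="y"/> and <column display="Size" enabled="n"/>.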
def merge_css_into_xhtml(xhtml, css):
with FileOrPath(xhtml, 'r+') as xhtml:
with FileOrPath(css) as css:
try:
doc = minidom.parse(xhtml)
except Exception:
return False
head = doc.getElementsByTagName('head')[0]
links = head.getElementsByTagName('link')
for link in links:
if link.getAttribute('rel') == 'stylesheet':
head.removeChild(link)
style = head.appendChild(doc.createElement('style'))
style.setAttribute('type','text/css')
style.appendChild(doc.createTextNode(css.read()))
xhtml.truncate(0)
doc.writexml(xhtml, '\t','\t','\n', encoding='utf-8')
xhtml.seek(0)
return True
def export_to_xhtml(xml, xslt, css, columns, cmd='xsltproc --path "%(folder)s" "%(xslt)s" "%(xml)s"'):
folder = op.split(xml)[0]
output_column_xml(op.join(folder,'columns.xml'),columns)
html = StringIO()
cmd = cmd % {'folder': folder, 'xslt': xslt, 'xml': xml}
html.write(os.popen(cmd).read())
html.seek(0)
merge_css_into_xhtml(html,css)
html.seek(0)
html_path = op.join(folder,'export.htm')
html_file = open(html_path,'w')
html_file.write(html.read().encode('utf-8'))
html_file.close()
return html_path

91
py/export_test.py Normal file

@@ -0,0 +1,91 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.tests.export
Created By: Virgil Dupras
Created On: 2006/09/16
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import unittest
from xml.dom import minidom
from StringIO import StringIO
from hsutil.testcase import TestCase
from .export import *
from . import export
class TCoutput_columns_xml(TestCase):
def test_empty_columns(self):
f = StringIO()
output_column_xml(f,[])
f.seek(0)
doc = minidom.parse(f)
root = doc.documentElement
self.assertEqual('columns',root.nodeName)
self.assertEqual(0,len(root.childNodes))
def test_some_columns(self):
f = StringIO()
output_column_xml(f,[('foo',True),('bar',False),('baz',True)])
f.seek(0)
doc = minidom.parse(f)
columns = doc.getElementsByTagName('column')
self.assertEqual(3,len(columns))
c1,c2,c3 = columns
self.assertEqual('foo',c1.getAttribute('display'))
self.assertEqual('bar',c2.getAttribute('display'))
self.assertEqual('baz',c3.getAttribute('display'))
self.assertEqual('y',c1.getAttribute('enabled'))
self.assertEqual('n',c2.getAttribute('enabled'))
self.assertEqual('y',c3.getAttribute('enabled'))
class TCmerge_css_into_xhtml(TestCase):
def test_main(self):
css = StringIO()
css.write('foobar')
css.seek(0)
xhtml = StringIO()
xhtml.write("""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>dupeGuru - Duplicate file scanner</title>
<link rel="SHORTCUT ICON" href="/favicon.ico" />
<link rel="stylesheet" href="../hardcoded.css" type="text/css" />
</head>
<body>
</body>
</html>""")
xhtml.seek(0)
self.assert_(merge_css_into_xhtml(xhtml,css))
xhtml.seek(0)
doc = minidom.parse(xhtml)
head = doc.getElementsByTagName('head')[0]
#A style node should have been added in head.
styles = head.getElementsByTagName('style')
self.assertEqual(1,len(styles))
style = styles[0]
self.assertEqual('text/css',style.getAttribute('type'))
self.assertEqual('foobar',style.firstChild.nodeValue.strip())
#all <link rel="stylesheet"> should be removed
self.assertEqual(1,len(head.getElementsByTagName('link')))
def test_empty(self):
self.assert_(not merge_css_into_xhtml(StringIO(),StringIO()))
def test_malformed(self):
xhtml = StringIO()
xhtml.write("""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">""")
xhtml.seek(0)
self.assert_(not merge_css_into_xhtml(xhtml,StringIO()))
if __name__ == "__main__":
unittest.main()

28
py/gen.py Normal file

@@ -0,0 +1,28 @@
#!/usr/bin/env python
# Unit Name: gen
# Created By: Virgil Dupras
# Created On: 2009-05-26
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
import os
import os.path as op
def move(src, dst):
if not op.exists(src):
return
if op.exists(dst):
os.remove(dst)
print 'Moving %s --> %s' % (src, dst)
os.rename(src, dst)
os.chdir(op.join('modules', 'block'))
os.system('python setup.py build_ext --inplace')
os.chdir(op.join('..', 'cache'))
os.system('python setup.py build_ext --inplace')
os.chdir(op.join('..', '..'))
move(op.join('modules', 'block', '_block.so'), op.join('picture', '_block.so'))
move(op.join('modules', 'block', '_block.pyd'), op.join('picture', '_block.pyd'))
move(op.join('modules', 'cache', '_cache.so'), op.join('picture', '_cache.so'))
move(op.join('modules', 'cache', '_cache.pyd'), op.join('picture', '_cache.pyd'))

117
py/ignore.py Normal file

@@ -0,0 +1,117 @@
#!/usr/bin/env python
"""
Unit Name: ignore
Created By: Virgil Dupras
Created On: 2006/05/02
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
from hsutil.files import FileOrPath
import xml.dom.minidom
class IgnoreList(object):
"""An ignore list implementation that is iterable, filterable and exportable to XML.
Call Ignore to add an ignore list entry, and AreIgnored to check if 2 items are in the list.
When iterated, 2-sized tuples are yielded, each containing 2 items that are ignored together.
"""
#---Override
def __init__(self):
self._ignored = {}
self._count = 0
def __iter__(self):
for first,seconds in self._ignored.iteritems():
for second in seconds:
yield (first,second)
def __len__(self):
return self._count
#---Public
def AreIgnored(self,first,second):
def do_check(first,second):
try:
matches = self._ignored[first]
return second in matches
except KeyError:
return False
return do_check(first,second) or do_check(second,first)
def Clear(self):
self._ignored = {}
self._count = 0
def Filter(self,func):
"""Applies a filter on all ignored items, and remove all matches where func(first,second)
doesn't return True.
"""
filtered = IgnoreList()
for first,second in self:
if func(first,second):
filtered.Ignore(first,second)
self._ignored = filtered._ignored
self._count = filtered._count
def Ignore(self,first,second):
if self.AreIgnored(first,second):
return
try:
matches = self._ignored[first]
matches.add(second)
except KeyError:
try:
matches = self._ignored[second]
matches.add(first)
except KeyError:
matches = set()
matches.add(second)
self._ignored[first] = matches
self._count += 1
def load_from_xml(self,infile):
"""Loads the ignore list from a XML created with save_to_xml.
infile can be a file object or a filename.
"""
try:
doc = xml.dom.minidom.parse(infile)
except Exception:
return
file_nodes = doc.getElementsByTagName('file')
for fn in file_nodes:
if not fn.getAttributeNode('path'):
continue
file_path = fn.getAttributeNode('path').nodeValue
subfile_nodes = fn.getElementsByTagName('file')
for sfn in subfile_nodes:
if not sfn.getAttributeNode('path'):
continue
subfile_path = sfn.getAttributeNode('path').nodeValue
self.Ignore(file_path,subfile_path)
def save_to_xml(self,outfile):
"""Create a XML file that can be used by load_from_xml.
outfile can be a file object or a filename.
"""
doc = xml.dom.minidom.Document()
root = doc.appendChild(doc.createElement('ignore_list'))
for file,subfiles in self._ignored.items():
file_node = root.appendChild(doc.createElement('file'))
if isinstance(file,unicode):
file = file.encode('utf-8')
file_node.setAttribute('path',file)
for subfile in subfiles:
subfile_node = file_node.appendChild(doc.createElement('file'))
if isinstance(subfile,unicode):
subfile = subfile.encode('utf-8')
subfile_node.setAttribute('path',subfile)
with FileOrPath(outfile, 'wb') as fp:
doc.writexml(fp,'\t','\t','\n',encoding='utf-8')

158
py/ignore_test.py Normal file

@@ -0,0 +1,158 @@
#!/usr/bin/env python
"""
Unit Name: ignore
Created By: Virgil Dupras
Created On: 2006/05/02
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import unittest
import cStringIO
import xml.dom.minidom
from .ignore import *
class TCIgnoreList(unittest.TestCase):
def test_empty(self):
il = IgnoreList()
self.assertEqual(0,len(il))
self.assert_(not il.AreIgnored('foo','bar'))
def test_simple(self):
il = IgnoreList()
il.Ignore('foo','bar')
self.assert_(il.AreIgnored('foo','bar'))
self.assert_(il.AreIgnored('bar','foo'))
self.assert_(not il.AreIgnored('foo','bleh'))
self.assert_(not il.AreIgnored('bleh','bar'))
self.assertEqual(1,len(il))
def test_multiple(self):
il = IgnoreList()
il.Ignore('foo','bar')
il.Ignore('foo','bleh')
il.Ignore('bleh','bar')
il.Ignore('aybabtu','bleh')
self.assert_(il.AreIgnored('foo','bar'))
self.assert_(il.AreIgnored('bar','foo'))
self.assert_(il.AreIgnored('foo','bleh'))
self.assert_(il.AreIgnored('bleh','bar'))
self.assert_(not il.AreIgnored('aybabtu','bar'))
self.assertEqual(4,len(il))
def test_clear(self):
il = IgnoreList()
il.Ignore('foo','bar')
il.Clear()
self.assert_(not il.AreIgnored('foo','bar'))
self.assert_(not il.AreIgnored('bar','foo'))
self.assertEqual(0,len(il))
def test_add_same_twice(self):
il = IgnoreList()
il.Ignore('foo','bar')
il.Ignore('bar','foo')
self.assertEqual(1,len(il))
def test_save_to_xml(self):
il = IgnoreList()
il.Ignore('foo','bar')
il.Ignore('foo','bleh')
il.Ignore('bleh','bar')
f = cStringIO.StringIO()
il.save_to_xml(f)
f.seek(0)
doc = xml.dom.minidom.parse(f)
root = doc.documentElement
self.assertEqual('ignore_list',root.nodeName)
children = [c for c in root.childNodes if c.localName]
self.assertEqual(2,len(children))
self.assertEqual(2,len([c for c in children if c.nodeName == 'file']))
f1,f2 = children
subchildren = [c for c in f1.childNodes if c.localName == 'file'] +\
[c for c in f2.childNodes if c.localName == 'file']
self.assertEqual(3,len(subchildren))
def test_SaveThenLoad(self):
il = IgnoreList()
il.Ignore('foo','bar')
il.Ignore('foo','bleh')
il.Ignore('bleh','bar')
il.Ignore(u'\u00e9','bar')
f = cStringIO.StringIO()
il.save_to_xml(f)
f.seek(0)
il = IgnoreList()
il.load_from_xml(f)
self.assertEqual(4,len(il))
self.assert_(il.AreIgnored(u'\u00e9','bar'))
def test_LoadXML_with_empty_file_tags(self):
f = cStringIO.StringIO()
f.write('<?xml version="1.0" encoding="utf-8"?><ignore_list><file><file/></file></ignore_list>')
f.seek(0)
il = IgnoreList()
il.load_from_xml(f)
self.assertEqual(0,len(il))
def test_AreIgnore_works_when_a_child_is_a_key_somewhere_else(self):
il = IgnoreList()
il.Ignore('foo','bar')
il.Ignore('bar','baz')
self.assert_(il.AreIgnored('bar','foo'))
def test_no_dupes_when_a_child_is_a_key_somewhere_else(self):
il = IgnoreList()
il.Ignore('foo','bar')
il.Ignore('bar','baz')
il.Ignore('bar','foo')
self.assertEqual(2,len(il))
def test_iterate(self):
#It must be possible to iterate through ignore list
il = IgnoreList()
expected = [('foo','bar'),('bar','baz'),('foo','baz')]
for i in expected:
il.Ignore(i[0],i[1])
for i in il:
expected.remove(i) #No exception should be raised
self.assert_(not expected) #expected should be empty
def test_filter(self):
il = IgnoreList()
il.Ignore('foo','bar')
il.Ignore('bar','baz')
il.Ignore('foo','baz')
il.Filter(lambda f,s: f == 'bar')
self.assertEqual(1,len(il))
self.assert_(not il.AreIgnored('foo','bar'))
self.assert_(il.AreIgnored('bar','baz'))
def test_save_with_non_ascii_non_unicode_items(self):
il = IgnoreList()
il.Ignore('\xac','\xbf')
f = cStringIO.StringIO()
try:
il.save_to_xml(f)
except Exception,e:
self.fail(str(e))
def test_len(self):
il = IgnoreList()
self.assertEqual(0,len(il))
il.Ignore('foo','bar')
self.assertEqual(1,len(il))
def test_nonzero(self):
il = IgnoreList()
self.assert_(not il)
il.Ignore('foo','bar')
self.assert_(il)
if __name__ == "__main__":
unittest.main()

93
py/modules/block/block.pyx vendored Normal file

@@ -0,0 +1,93 @@
# Created By: Virgil Dupras
# Created On: 2009-04-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
cdef extern from "stdlib.h":
int abs(int n) # required so that abs() is applied on ints, not python objects
class NoBlocksError(Exception):
"""avgdiff/maxdiff has been called with empty lists"""
class DifferentBlockCountError(Exception):
"""avgdiff/maxdiff has been called with 2 block lists of different size."""
cdef object getblock(object image):
"""Returns a 3 sized tuple containing the mean color of 'image'.
image: a PIL image or crop.
"""
cdef int pixel_count, red, green, blue, r, g, b
if image.size[0]:
pixel_count = image.size[0] * image.size[1]
red = green = blue = 0
for r, g, b in image.getdata():
red += r
green += g
blue += b
return (red // pixel_count, green // pixel_count, blue // pixel_count)
else:
return (0, 0, 0)
def getblocks2(image, int block_count_per_side):
"""Returns a list of blocks (3 sized tuples).
image: A PIL image to base the blocks on.
block_count_per_side: This integer determine the number of blocks the function will return.
If it is 10, for example, 100 blocks will be returns (10 width, 10 height). The blocks will not
necessarely cover square areas. The area covered by each block will be proportional to the image
itself.
"""
if not image.size[0]:
return []
cdef int width, height, block_width, block_height, ih, iw, top, bottom, left, right
width, height = image.size
block_width = max(width // block_count_per_side, 1)
block_height = max(height // block_count_per_side, 1)
result = []
for ih in range(block_count_per_side):
top = min(ih * block_height, height - block_height)
bottom = top + block_height
for iw in range(block_count_per_side):
left = min(iw * block_width, width - block_width)
right = left + block_width
box = (left, top, right, bottom)
crop = image.crop(box)
result.append(getblock(crop))
return result
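# Illustrative example (hedged): for a 100x100 image and block_count_per_side=10,
# this returns 100 blocks, each the mean color of a roughly 10x10 area; a
# uniformly red image would give [(255, 0, 0)] * 100.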
cdef int diff(first, second):
"""Returns the difference between the first block and the second.
It returns an absolute sum of the 3 differences (RGB).
"""
cdef int r1, g1, b1, r2, g2, b2
r1, g1, b1 = first
r2, g2, b2 = second
return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)
def avgdiff(first, second, int limit, int min_iterations):
"""Returns the average diff between first blocks and seconds.
If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
iterations have been made in the blocks.
"""
cdef int count, sum, i, iteration_count
count = len(first)
if count != len(second):
raise DifferentBlockCountError()
if not count:
raise NoBlocksError()
sum = 0
for i in range(count):
iteration_count = i + 1
item1 = first[i]
item2 = second[i]
sum += diff(item1, item2)
if sum > limit * iteration_count and iteration_count >= min_iterations:
return limit + 1
result = sum // count
if (not result) and sum:
result = 1
return result
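# Illustrative example (hedged): avgdiff([(10, 20, 30)], [(1, 2, 3)], 768, 1) -> 54
# (|10-1| + |20-2| + |30-3| = 54, averaged over one block). Mismatched list lengths
# raise DifferentBlockCountError; empty lists raise NoBlocksError.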

14
py/modules/block/setup.py Normal file

@@ -0,0 +1,14 @@
#!/usr/bin/env python
# Created By: Virgil Dupras
# Created On: 2009-04-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
setup(
cmdclass = {'build_ext': build_ext},
ext_modules = [Extension("_block", ["block.pyx"])]
)

34
py/modules/cache/cache.pyx vendored Normal file

@@ -0,0 +1,34 @@
#!/usr/bin/env python
# Created By: Virgil Dupras
# Created On: 2009-04-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
# ok, this is hacky and stuff, but I don't know C well enough to play with char buffers, copy
# them around and stuff
cdef int xchar_to_int(char c):
if 48 <= c <= 57: # 0-9
return c - 48
elif 65 <= c <= 70: # A-F
return c - 55
elif 97 <= c <= 102: # a-f
return c - 87
def string_to_colors(s):
"""Transform the string 's' in a list of 3 sized tuples.
"""
result = []
cdef int i, char_count, r, g, b
cdef char* cs
char_count = len(s)
char_count = (char_count // 6) * 6
cs = s
for i in range(0, char_count, 6):
r = xchar_to_int(cs[i]) << 4
r += xchar_to_int(cs[i+1])
g = xchar_to_int(cs[i+2]) << 4
g += xchar_to_int(cs[i+3])
b = xchar_to_int(cs[i+4]) << 4
b += xchar_to_int(cs[i+5])
result.append((r, g, b))
return result
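# Illustrative example (hedged): string_to_colors('ff0080') -> [(255, 0, 128)].
# The input length is truncated to a multiple of 6 hex chars, so a trailing
# partial color is silently dropped.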

14
py/modules/cache/setup.py vendored Normal file

@@ -0,0 +1,14 @@
#!/usr/bin/env python
# Created By: Virgil Dupras
# Created On: 2009-04-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
setup(
cmdclass = {'build_ext': build_ext},
ext_modules = [Extension("_cache", ["cache.pyx"])]
)

0
py/picture/__init__.py Normal file

124
py/picture/block.py Normal file

@@ -0,0 +1,124 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture.block
Created By: Virgil Dupras
Created On: 2006/09/01
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-26 18:12:39 +0200 (Tue, 26 May 2009) $
$Revision: 4365 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
from _block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2
# Converted to Cython
# def getblock(image):
# """Returns a 3 sized tuple containing the mean color of 'image'.
#
# image: a PIL image or crop.
# """
# if image.size[0]:
# pixel_count = image.size[0] * image.size[1]
# red = green = blue = 0
# for r,g,b in image.getdata():
# red += r
# green += g
# blue += b
# return (red // pixel_count, green // pixel_count, blue // pixel_count)
# else:
# return (0,0,0)
# This is not used anymore
# def getblocks(image,blocksize):
# """Returns a list of blocks (3 sized tuples).
#
# image: A PIL image to base the blocks on.
# blocksize: The size of the blocks to be create. This is a single integer, defining
# both width and height (blocks are square).
# """
# if min(image.size) < blocksize:
# return ()
# result = []
# for i in xrange(image.size[1] // blocksize):
# for j in xrange(image.size[0] // blocksize):
# box = (blocksize * j, blocksize * i, blocksize * (j + 1), blocksize * (i + 1))
# crop = image.crop(box)
# result.append(getblock(crop))
# return result
# Converted to Cython
# def getblocks2(image,block_count_per_side):
# """Returns a list of blocks (3 sized tuples).
#
# image: A PIL image to base the blocks on.
# block_count_per_side: This integer determine the number of blocks the function will return.
# If it is 10, for example, 100 blocks will be returns (10 width, 10 height). The blocks will not
# necessarely cover square areas. The area covered by each block will be proportional to the image
# itself.
# """
# if not image.size[0]:
# return []
# width,height = image.size
# block_width = max(width // block_count_per_side,1)
# block_height = max(height // block_count_per_side,1)
# result = []
# for ih in range(block_count_per_side):
# top = min(ih * block_height, height - block_height)
# bottom = top + block_height
# for iw in range(block_count_per_side):
# left = min(iw * block_width, width - block_width)
# right = left + block_width
# box = (left,top,right,bottom)
# crop = image.crop(box)
# result.append(getblock(crop))
# return result
# Converted to Cython
# def diff(first, second):
# """Returns the difference between the first block and the second.
#
# It returns an absolute sum of the 3 differences (RGB).
# """
# r1, g1, b1 = first
# r2, g2, b2 = second
# return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)
# Converted to Cython
# def avgdiff(first, second, limit=768, min_iterations=1):
# """Returns the average diff between first blocks and seconds.
#
# If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
# iterations have been made in the blocks.
# """
# if len(first) != len(second):
# raise DifferentBlockCountError
# if not first:
# raise NoBlocksError
# count = len(first)
# sum = 0
# zipped = izip(xrange(1, count + 1), first, second)
# for i, first, second in zipped:
# sum += diff(first, second)
# if sum > limit * i and i >= min_iterations:
# return limit + 1
# result = sum // count
# if (not result) and sum:
# result = 1
# return result
# This is not used anymore
# def maxdiff(first,second,limit=768):
# """Returns the max diff between first blocks and seconds.
#
# If the result surpasses limit, the first max being over limit is returned.
# """
# if len(first) != len(second):
# raise DifferentBlockCountError
# if not first:
# raise NoBlocksError
# result = 0
# zipped = zip(first,second)
# for first,second in zipped:
# result = max(result,diff(first,second))
# if result > limit:
# return result
# return result

313
py/picture/block_test.py Normal file

@@ -0,0 +1,313 @@
#!/usr/bin/env python
"""
Unit Name: tests.picture.block
Created By: Virgil Dupras
Created On: 2006/09/01
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
# The commented out tests are tests for functions that have been converted to pure C for speed
import unittest
from .block import *
def my_avgdiff(first, second, limit=768, min_iter=3): # this is so I don't have to re-write every call
return avgdiff(first, second, limit, min_iter)
BLACK = (0,0,0)
RED = (0xff,0,0)
GREEN = (0,0xff,0)
BLUE = (0,0,0xff)
class FakeImage(object):
def __init__(self, size, data):
self.size = size
self.data = data
def getdata(self):
return self.data
def crop(self, box):
pixels = []
for i in range(box[1], box[3]):
for j in range(box[0], box[2]):
pixel = self.data[i * self.size[0] + j]
pixels.append(pixel)
return FakeImage((box[2] - box[0], box[3] - box[1]), pixels)
def empty():
return FakeImage((0,0), [])
def single_pixel(): #one red pixel
return FakeImage((1, 1), [(0xff,0,0)])
def four_pixels():
pixels = [RED,(0,0x80,0xff),(0x80,0,0),(0,0x40,0x80)]
return FakeImage((2, 2), pixels)
class TCgetblock(unittest.TestCase):
def test_single_pixel(self):
im = single_pixel()
[b] = getblocks2(im, 1)
self.assertEqual(RED,b)
def test_no_pixel(self):
im = empty()
self.assertEqual([], getblocks2(im, 1))
def test_four_pixels(self):
im = four_pixels()
[b] = getblocks2(im, 1)
meanred = (0xff + 0x80) // 4
meangreen = (0x80 + 0x40) // 4
meanblue = (0xff + 0x80) // 4
self.assertEqual((meanred,meangreen,meanblue),b)
# class TCdiff(unittest.TestCase):
# def test_diff(self):
# b1 = (10, 20, 30)
# b2 = (1, 2, 3)
# self.assertEqual(9 + 18 + 27,diff(b1,b2))
#
# def test_diff_negative(self):
# b1 = (10, 20, 30)
# b2 = (1, 2, 3)
# self.assertEqual(9 + 18 + 27,diff(b2,b1))
#
# def test_diff_mixed_positive_and_negative(self):
# b1 = (1, 5, 10)
# b2 = (10, 1, 15)
# self.assertEqual(9 + 4 + 5,diff(b1,b2))
#
# class TCgetblocks(unittest.TestCase):
# def test_empty_image(self):
# im = empty()
# blocks = getblocks(im,1)
# self.assertEqual(0,len(blocks))
#
# def test_one_block_image(self):
# im = four_pixels()
# blocks = getblocks2(im, 1)
# self.assertEqual(1,len(blocks))
# block = blocks[0]
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# self.assertEqual((meanred,meangreen,meanblue),block)
#
# def test_not_enough_height_to_fit_a_block(self):
# im = FakeImage((2,1), [BLACK, BLACK])
# blocks = getblocks(im,2)
# self.assertEqual(0,len(blocks))
#
# def xtest_dont_include_leftovers(self):
# # this test is disabled because getblocks is not used and getblock is cdef'ed
# pixels = [
# RED,(0,0x80,0xff),BLACK,
# (0x80,0,0),(0,0x40,0x80),BLACK,
# BLACK,BLACK,BLACK
# ]
# im = FakeImage((3,3), pixels)
# blocks = getblocks(im,2)
# block = blocks[0]
# #Because the block is smaller than the image, only the block-sized area must be considered.
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# self.assertEqual((meanred,meangreen,meanblue),block)
#
# def xtest_two_blocks(self):
# # this test is disabled because getblocks is not used and getblock is cdef'ed
# pixels = [BLACK for i in xrange(4 * 2)]
# pixels[0] = RED
# pixels[1] = (0,0x80,0xff)
# pixels[4] = (0x80,0,0)
# pixels[5] = (0,0x40,0x80)
# im = FakeImage((4, 2), pixels)
# blocks = getblocks(im,2)
# self.assertEqual(2,len(blocks))
# block = blocks[0]
# #Because the block is smaller than the image, only the block-sized area must be considered.
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# self.assertEqual((meanred,meangreen,meanblue),block)
# self.assertEqual(BLACK,blocks[1])
#
# def test_four_blocks(self):
# pixels = [BLACK for i in xrange(4 * 4)]
# pixels[0] = RED
# pixels[1] = (0,0x80,0xff)
# pixels[4] = (0x80,0,0)
# pixels[5] = (0,0x40,0x80)
# im = FakeImage((4, 4), pixels)
# blocks = getblocks2(im, 2)
# self.assertEqual(4,len(blocks))
# block = blocks[0]
# #Because the block is smaller than the image, only the block-sized area must be considered.
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# self.assertEqual((meanred,meangreen,meanblue),block)
# self.assertEqual(BLACK,blocks[1])
# self.assertEqual(BLACK,blocks[2])
# self.assertEqual(BLACK,blocks[3])
#
class TCgetblocks2(unittest.TestCase):
def test_empty_image(self):
im = empty()
blocks = getblocks2(im,1)
self.assertEqual(0,len(blocks))
def test_one_block_image(self):
im = four_pixels()
blocks = getblocks2(im,1)
self.assertEqual(1,len(blocks))
block = blocks[0]
meanred = (0xff + 0x80) // 4
meangreen = (0x80 + 0x40) // 4
meanblue = (0xff + 0x80) // 4
self.assertEqual((meanred,meangreen,meanblue),block)
def test_four_blocks_all_black(self):
im = FakeImage((2, 2), [BLACK, BLACK, BLACK, BLACK])
blocks = getblocks2(im,2)
self.assertEqual(4,len(blocks))
for block in blocks:
self.assertEqual(BLACK,block)
def test_two_pixels_image_horizontal(self):
pixels = [RED,BLUE]
im = FakeImage((2, 1), pixels)
blocks = getblocks2(im,2)
self.assertEqual(4,len(blocks))
self.assertEqual(RED,blocks[0])
self.assertEqual(BLUE,blocks[1])
self.assertEqual(RED,blocks[2])
self.assertEqual(BLUE,blocks[3])
def test_two_pixels_image_vertical(self):
pixels = [RED,BLUE]
im = FakeImage((1, 2), pixels)
blocks = getblocks2(im,2)
self.assertEqual(4,len(blocks))
self.assertEqual(RED,blocks[0])
self.assertEqual(RED,blocks[1])
self.assertEqual(BLUE,blocks[2])
self.assertEqual(BLUE,blocks[3])
class TCavgdiff(unittest.TestCase):
def test_empty(self):
self.assertRaises(NoBlocksError, my_avgdiff, [], [])
def test_two_blocks(self):
im = empty()
b1 = (5,10,15)
b2 = (255,250,245)
b3 = (0,0,0)
b4 = (255,0,255)
blocks1 = [b1,b2]
blocks2 = [b3,b4]
expected1 = 5 + 10 + 15
expected2 = 0 + 250 + 10
expected = (expected1 + expected2) // 2
self.assertEqual(expected, my_avgdiff(blocks1, blocks2))
def test_blocks_not_the_same_size(self):
b = (0,0,0)
self.assertRaises(DifferentBlockCountError,my_avgdiff,[b,b],[b])
def test_first_arg_is_empty_but_not_second(self):
#Don't return 0 (as when the 2 lists are empty), raise!
b = (0,0,0)
self.assertRaises(DifferentBlockCountError,my_avgdiff,[],[b])
def test_limit(self):
ref = (0,0,0)
b1 = (10,10,10) #avg 30
b2 = (20,20,20) #avg 45
b3 = (30,30,30) #avg 60
blocks1 = [ref,ref,ref]
blocks2 = [b1,b2,b3]
self.assertEqual(45,my_avgdiff(blocks1,blocks2,44))
def test_min_iterations(self):
ref = (0,0,0)
b1 = (10,10,10) #avg 30
b2 = (20,20,20) #avg 45
b3 = (10,10,10) #avg 40
blocks1 = [ref,ref,ref]
blocks2 = [b1,b2,b3]
self.assertEqual(40,my_avgdiff(blocks1,blocks2,45 - 1,3))
# Bah, I don't know why this test fails, but I don't think it matters very much
# def test_just_over_the_limit(self):
# #A score just over the limit might return exactly the limit due to truncating. We should
# #ceil() the result in this case.
# ref = (0,0,0)
# b1 = (10,0,0)
# b2 = (11,0,0)
# blocks1 = [ref,ref]
# blocks2 = [b1,b2]
# self.assertEqual(11,my_avgdiff(blocks1,blocks2,10))
#
def test_return_at_least_1_at_the_slightest_difference(self):
ref = (0,0,0)
b1 = (1,0,0)
blocks1 = [ref for i in xrange(250)]
blocks2 = [ref for i in xrange(250)]
blocks2[0] = b1
self.assertEqual(1,my_avgdiff(blocks1,blocks2))
def test_return_0_if_there_is_no_difference(self):
ref = (0,0,0)
blocks1 = [ref,ref]
blocks2 = [ref,ref]
self.assertEqual(0,my_avgdiff(blocks1,blocks2))
# class TCmaxdiff(unittest.TestCase):
# def test_empty(self):
# self.assertRaises(NoBlocksError,maxdiff,[],[])
#
# def test_two_blocks(self):
# b1 = (5,10,15)
# b2 = (255,250,245)
# b3 = (0,0,0)
# b4 = (255,0,255)
# blocks1 = [b1,b2]
# blocks2 = [b3,b4]
# expected1 = 5 + 10 + 15
# expected2 = 0 + 250 + 10
# expected = max(expected1,expected2)
# self.assertEqual(expected,maxdiff(blocks1,blocks2))
#
# def test_blocks_not_the_same_size(self):
# b = (0,0,0)
# self.assertRaises(DifferentBlockCountError,maxdiff,[b,b],[b])
#
# def test_first_arg_is_empty_but_not_second(self):
# #Don't return 0 (as when the 2 lists are empty), raise!
# b = (0,0,0)
# self.assertRaises(DifferentBlockCountError,maxdiff,[],[b])
#
# def test_limit(self):
# b1 = (5,10,15)
# b2 = (255,250,245)
# b3 = (0,0,0)
# b4 = (255,0,255)
# blocks1 = [b1,b2]
# blocks2 = [b3,b4]
# expected1 = 5 + 10 + 15
# expected2 = 0 + 250 + 10
# self.assertEqual(expected1,maxdiff(blocks1,blocks2,expected1 - 1))
#
if __name__ == "__main__":
unittest.main()

134
py/picture/cache.py Normal file

@@ -0,0 +1,134 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture.cache
Created By: Virgil Dupras
Created On: 2006/09/14
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import os
import logging
import sqlite3 as sqlite
import hsutil.sqlite
from _cache import string_to_colors
def colors_to_string(colors):
"""Transform the 3 sized tuples 'colors' into a hex string.
[(0,100,255)] --> 0064ff
[(1,2,3),(4,5,6)] --> 010203040506
"""
return ''.join(['%02x%02x%02x' % (r,g,b) for r,g,b in colors])
# This function is an important bottleneck of dupeGuru PE. It has been converted to Cython.
# def string_to_colors(s):
# """Transform the string 's' in a list of 3 sized tuples.
# """
# result = []
# for i in xrange(0, len(s), 6):
# number = int(s[i:i+6], 16)
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
# return result
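# Editor's sketch (illustrative): the two functions above are inverses.
#
#     colors_to_string([(10, 20, 30), (40, 50, 60)])  # --> '0a141e28323c'
#     string_to_colors('0a141e28323c')  # --> [(10, 20, 30), (40, 50, 60)]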
class Cache(object):
"""A class to cache picture blocks.
"""
def __init__(self, db=':memory:', threaded=True):
def create_tables():
sql = "create table pictures(path TEXT, blocks TEXT)"
self.con.execute(sql)
sql = "create index idx_path on pictures (path)"
self.con.execute(sql)
self.dbname = db
if threaded:
self.con = hsutil.sqlite.ThreadedConn(db, True)
else:
self.con = sqlite.connect(db, isolation_level=None)
try:
self.con.execute("select * from pictures where 1=2")
except sqlite.OperationalError: # new db
create_tables()
except sqlite.DatabaseError, e: # corrupted db
logging.warning('Could not create picture cache because of an error: %s', str(e))
self.con.close()
os.remove(db)
if threaded:
self.con = hsutil.sqlite.ThreadedConn(db, True)
else:
self.con = sqlite.connect(db, isolation_level=None)
create_tables()
def __contains__(self, key):
sql = "select count(*) from pictures where path = ?"
result = self.con.execute(sql, [key]).fetchall()
return result[0][0] > 0
def __delitem__(self, key):
if key not in self:
raise KeyError(key)
sql = "delete from pictures where path = ?"
self.con.execute(sql, [key])
# Optimized
def __getitem__(self, key):
if isinstance(key, int):
sql = "select blocks from pictures where rowid = ?"
else:
sql = "select blocks from pictures where path = ?"
result = self.con.execute(sql, [key]).fetchone()
if result:
result = string_to_colors(result[0])
return result
else:
raise KeyError(key)
def __iter__(self):
sql = "select path from pictures"
result = self.con.execute(sql)
return (row[0] for row in result)
def __len__(self):
sql = "select count(*) from pictures"
result = self.con.execute(sql).fetchall()
return result[0][0]
def __setitem__(self, key, value):
value = colors_to_string(value)
if key in self:
sql = "update pictures set blocks = ? where path = ?"
else:
sql = "insert into pictures(blocks,path) values(?,?)"
try:
self.con.execute(sql, [value, key])
except sqlite.OperationalError:
logging.warning('Picture cache could not set %r for key %r', value, key)
except sqlite.DatabaseError, e:
logging.warning('DatabaseError while setting %r for key %r: %s', value, key, str(e))
def clear(self):
sql = "delete from pictures"
self.con.execute(sql)
def filter(self, func):
to_delete = [key for key in self if not func(key)]
for key in to_delete:
del self[key]
def get_id(self, path):
sql = "select rowid from pictures where path = ?"
result = self.con.execute(sql, [path]).fetchone()
if result:
return result[0]
else:
raise ValueError(path)
def get_multiple(self, rowids):
sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids))
cur = self.con.execute(sql)
return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
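# Editor's usage sketch (illustrative, path names are made up):
#
#     c = Cache()                      # ':memory:' database
#     c['/pics/a.jpg'] = [(0, 0, 0)]   # blocks are stored as a hex string
#     '/pics/a.jpg' in c               # --> True
#     c['/pics/a.jpg']                 # --> [(0, 0, 0)]
#     c[c.get_id('/pics/a.jpg')]       # lookups by rowid work too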

159
py/picture/cache_test.py Normal file

@@ -0,0 +1,159 @@
#!/usr/bin/env python
"""
Unit Name: tests.picture.cache
Created By: Virgil Dupras
Created On: 2006/09/14
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import unittest
from StringIO import StringIO
import os.path as op
import os
import threading
from hsutil.testcase import TestCase
from .cache import *
class TCcolors_to_string(unittest.TestCase):
def test_no_color(self):
self.assertEqual('',colors_to_string([]))
def test_single_color(self):
self.assertEqual('000000',colors_to_string([(0,0,0)]))
self.assertEqual('010101',colors_to_string([(1,1,1)]))
self.assertEqual('0a141e',colors_to_string([(10,20,30)]))
def test_two_colors(self):
self.assertEqual('000102030405',colors_to_string([(0,1,2),(3,4,5)]))
class TCstring_to_colors(unittest.TestCase):
def test_empty(self):
self.assertEqual([],string_to_colors(''))
def test_single_color(self):
self.assertEqual([(0,0,0)],string_to_colors('000000'))
self.assertEqual([(2,3,4)],string_to_colors('020304'))
self.assertEqual([(10,20,30)],string_to_colors('0a141e'))
def test_two_colors(self):
self.assertEqual([(10,20,30),(40,50,60)],string_to_colors('0a141e28323c'))
def test_incomplete_color(self):
# don't return anything if it's not a complete color
self.assertEqual([],string_to_colors('102'))
class TCCache(TestCase):
def test_empty(self):
c = Cache()
self.assertEqual(0,len(c))
self.assertRaises(KeyError,c.__getitem__,'foo')
def test_set_then_retrieve_blocks(self):
c = Cache()
b = [(0,0,0),(1,2,3)]
c['foo'] = b
self.assertEqual(b,c['foo'])
def test_delitem(self):
c = Cache()
c['foo'] = ''
del c['foo']
self.assert_('foo' not in c)
self.assertRaises(KeyError,c.__delitem__,'foo')
def test_persistence(self):
DBNAME = op.join(self.tmpdir(), 'hstest.db')
c = Cache(DBNAME)
c['foo'] = [(1,2,3)]
del c
c = Cache(DBNAME)
self.assertEqual([(1,2,3)],c['foo'])
del c
os.remove(DBNAME)
def test_filter(self):
c = Cache()
c['foo'] = ''
c['bar'] = ''
c['baz'] = ''
c.filter(lambda p:p != 'bar') #only 'bar' is removed
self.assertEqual(2,len(c))
self.assert_('foo' in c)
self.assert_('baz' in c)
self.assert_('bar' not in c)
def test_clear(self):
c = Cache()
c['foo'] = ''
c['bar'] = ''
c['baz'] = ''
c.clear()
self.assertEqual(0,len(c))
self.assert_('foo' not in c)
self.assert_('baz' not in c)
self.assert_('bar' not in c)
def test_corrupted_db(self):
dbname = op.join(self.tmpdir(), 'foo.db')
fp = open(dbname, 'w')
fp.write('invalid sqlite content')
fp.close()
c = Cache(dbname) # should not raise a DatabaseError
c['foo'] = [(1, 2, 3)]
del c
c = Cache(dbname)
self.assertEqual(c['foo'], [(1, 2, 3)])
def test_by_id(self):
# it's possible to use the cache by referring to the files by their row_id
c = Cache()
b = [(0,0,0),(1,2,3)]
c['foo'] = b
foo_id = c.get_id('foo')
self.assertEqual(c[foo_id], b)
class TCCacheSQLEscape(unittest.TestCase):
def test_contains(self):
c = Cache()
self.assert_("foo'bar" not in c)
def test_getitem(self):
c = Cache()
self.assertRaises(KeyError, c.__getitem__, "foo'bar")
def test_setitem(self):
c = Cache()
c["foo'bar"] = []
def test_delitem(self):
c = Cache()
c["foo'bar"] = []
try:
del c["foo'bar"]
except KeyError:
self.fail()
class TCCacheThreaded(unittest.TestCase):
def test_access_cache(self):
def thread_run():
try:
c['foo'] = [(1,2,3)]
except sqlite.ProgrammingError:
self.fail()
c = Cache()
t = threading.Thread(target=thread_run)
t.start()
t.join()
self.assertEqual([(1,2,3)], c['foo'])
if __name__ == "__main__":
unittest.main()

136
py/picture/matchbase.py Normal file
View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture._match
Created By: Virgil Dupras
Created On: 2007/02/25
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
$Revision: 4388 $
Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
"""
import logging
import multiprocessing
from Queue import Empty
from collections import defaultdict
from hsutil import job
from hsutil.misc import dedupe
from dupeguru.engine import Match
from block import avgdiff, DifferentBlockCountError, NoBlocksError
from cache import Cache
MIN_ITERATIONS = 3
def get_match(first,second,percentage):
if percentage < 0:
percentage = 0
return Match(first,second,percentage)
class MatchFactory(object):
cached_blocks = None
block_count_per_side = 15
threshold = 75
match_scaled = False
def _do_getmatches(self, files, j):
raise NotImplementedError()
def getmatches(self, files, j=job.nulljob):
# The MemoryError handlers in there use logging without first checking whether there
# is enough memory left to carry on the operation, because the MemoryError is assumed
# to happen while reading an image file, which is freed from memory by the time the
# MemoryError is raised.
j = j.start_subjob([2, 8])
logging.info('Preparing %d files' % len(files))
prepared = self.prepare_files(files, j)
logging.info('Finished preparing %d files' % len(prepared))
return self._do_getmatches(prepared, j)
def prepare_files(self, files, j=job.nulljob):
prepared = [] # only files for which there was no error getting blocks
try:
for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
picture.dimensions
picture.unicode_path = unicode(picture.path)
try:
if picture.unicode_path not in self.cached_blocks:
blocks = picture.get_blocks(self.block_count_per_side)
self.cached_blocks[picture.unicode_path] = blocks
prepared.append(picture)
except IOError as e:
logging.warning(unicode(e))
except MemoryError:
logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
if picture.size < 10 * 1024 * 1024: # We're really running out of memory
raise
except MemoryError:
logging.warning('Ran out of memory while preparing files')
return prepared
def async_compare(ref_id, other_ids, dbname, threshold):
cache = Cache(dbname, threaded=False)
limit = 100 - threshold
ref_blocks = cache[ref_id]
pairs = cache.get_multiple(other_ids)
results = []
for other_id, other_blocks in pairs:
try:
diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
percentage = 100 - diff
except (DifferentBlockCountError, NoBlocksError):
percentage = 0
if percentage >= threshold:
results.append((ref_id, other_id, percentage))
cache.con.close()
return results
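# Editor's note (illustrative arithmetic): with the default threshold of 75,
# limit is 100 - 75 = 25, so avgdiff gives up as soon as the running average
# diff exceeds 25; a pair is kept only when percentage = 100 - diff is still
# at or above the threshold.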
class AsyncMatchFactory(MatchFactory):
def _do_getmatches(self, pictures, j):
def empty_out_queue(queue, into):
try:
while True:
into.append(queue.get(block=False))
except Empty:
pass
j = j.start_subjob([1, 8, 1], 'Preparing for matching')
cache = self.cached_blocks
id2picture = {}
dimensions2pictures = defaultdict(set)
for picture in pictures[:]:
try:
picture.cache_id = cache.get_id(picture.unicode_path)
id2picture[picture.cache_id] = picture
except ValueError:
pictures.remove(picture)
if not self.match_scaled:
dimensions2pictures[picture.dimensions].add(picture)
pool = multiprocessing.Pool()
async_results = []
pictures_copy = set(pictures)
for ref in j.iter_with_progress(pictures):
others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
others.remove(ref)
if others:
cache_ids = [f.cache_id for f in others]
args = (ref.cache_id, cache_ids, self.cached_blocks.dbname, self.threshold)
async_results.append(pool.apply_async(async_compare, args))
matches = []
for result in j.iter_with_progress(async_results, 'Matched %d/%d pictures'):
matches.extend(result.get())
result = []
for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
ref = id2picture[ref_id]
other = id2picture[other_id]
if percentage == 100 and ref.md5 != other.md5:
percentage = 99
if percentage >= self.threshold:
result.append(get_match(ref, other, percentage))
return result
multiprocessing.freeze_support()

359
py/results.py Normal file

@@ -0,0 +1,359 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.results
Created By: Virgil Dupras
Created On: 2006/02/23
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import re
from xml.sax import handler, make_parser, SAXException
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl
from . import engine
from hsutil.job import nulljob
from hsutil.markable import Markable
from hsutil.misc import flatten, cond, nonone
from hsutil.str import format_size
from hsutil.files import open_if_filename
class Results(Markable):
#---Override
def __init__(self, data_module):
super(Results, self).__init__()
self.__groups = []
self.__group_of_duplicate = {}
self.__groups_sort_descriptor = None # This is a tuple (key, asc)
self.__dupes = None
self.__dupes_sort_descriptor = None # This is a tuple (key, asc, delta)
self.__filters = None
self.__filtered_dupes = None
self.__filtered_groups = None
self.__recalculate_stats()
self.__marked_size = 0
self.data = data_module
def _did_mark(self, dupe):
self.__marked_size += dupe.size
def _did_unmark(self, dupe):
self.__marked_size -= dupe.size
def _get_markable_count(self):
return self.__total_count
def _is_markable(self, dupe):
if dupe.is_ref:
return False
g = self.get_group_of_duplicate(dupe)
if not g:
return False
if dupe is g.ref:
return False
if self.__filtered_dupes and dupe not in self.__filtered_dupes:
return False
return True
#---Private
def __get_dupe_list(self):
if self.__dupes is None:
self.__dupes = flatten(group.dupes for group in self.groups)
if self.__filtered_dupes:
self.__dupes = [dupe for dupe in self.__dupes if dupe in self.__filtered_dupes]
sd = self.__dupes_sort_descriptor
if sd:
self.sort_dupes(sd[0], sd[1], sd[2])
return self.__dupes
def __get_groups(self):
if self.__filtered_groups is None:
return self.__groups
else:
return self.__filtered_groups
def __get_stat_line(self):
if self.__filtered_dupes is None:
mark_count = self.mark_count
marked_size = self.__marked_size
total_count = self.__total_count
total_size = self.__total_size
else:
mark_count = len([dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)])
marked_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe))
total_count = len([dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)])
total_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe))
if self.mark_inverted:
marked_size = self.__total_size - marked_size
result = '%d / %d (%s / %s) duplicates marked.' % (
mark_count,
total_count,
format_size(marked_size, 2),
format_size(total_size, 2),
)
if self.__filters:
result += ' filter: %s' % ' --> '.join(self.__filters)
return result
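# Editor's example (illustrative values) of the line built above:
#   '2 / 5 (1.50 KB / 3.00 KB) duplicates marked. filter: foo'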
def __recalculate_stats(self):
self.__total_size = 0
self.__total_count = 0
for group in self.groups:
markable = [dupe for dupe in group.dupes if self._is_markable(dupe)]
self.__total_count += len(markable)
self.__total_size += sum(dupe.size for dupe in markable)
def __set_groups(self, new_groups):
self.mark_none()
self.__groups = new_groups
self.__group_of_duplicate = {}
for g in self.__groups:
for dupe in g:
self.__group_of_duplicate[dupe] = g
if not hasattr(dupe, 'is_ref'):
dupe.is_ref = False
old_filters = nonone(self.__filters, [])
self.apply_filter(None)
for filter_str in old_filters:
self.apply_filter(filter_str)
#---Public
def apply_filter(self, filter_str):
''' Applies a filter 'filter_str' to self.groups
When you apply the filter, only dupes whose filename matches 'filter_str' will be
in the results. To cancel the filter, call apply_filter with 'filter_str' set to
None, and the results will go back to normal.
If you call apply_filter on already filtered results, the filter will be applied
*on the filtered results*.
'filter_str' is a string containing a regexp to filter dupes with.
'''
if not filter_str:
self.__filtered_dupes = None
self.__filtered_groups = None
self.__filters = None
else:
if not self.__filters:
self.__filters = []
self.__filters.append(filter_str)
filter_re = re.compile(filter_str, re.IGNORECASE)
if self.__filtered_dupes is None:
self.__filtered_dupes = flatten(g[:] for g in self.groups)
self.__filtered_dupes = set(dupe for dupe in self.__filtered_dupes if filter_re.search(dupe.name))
filtered_groups = set()
for dupe in self.__filtered_dupes:
filtered_groups.add(self.get_group_of_duplicate(dupe))
self.__filtered_groups = list(filtered_groups)
self.__recalculate_stats()
sd = self.__groups_sort_descriptor
if sd:
self.sort_groups(sd[0], sd[1])
self.__dupes = None
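# Editor's usage sketch (illustrative):
#
#     results.apply_filter(r'\.jpg$')  # keep only dupes whose name matches
#     results.apply_filter('2006')     # narrows the already filtered results
#     results.apply_filter(None)       # cancels all filters at once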
def get_group_of_duplicate(self, dupe):
try:
return self.__group_of_duplicate[dupe]
except (TypeError, KeyError):
return None
is_markable = _is_markable
def load_from_xml(self, infile, get_file, j=nulljob):
self.apply_filter(None)
handler = _ResultsHandler(get_file)
parser = make_parser()
parser.setContentHandler(handler)
try:
infile, must_close = open_if_filename(infile)
except IOError:
return
BUFSIZE = 1024 * 1024 # 1mb buffer
infile.seek(0, 2)
j.start_job(infile.tell() // BUFSIZE)
infile.seek(0, 0)
try:
while True:
data = infile.read(BUFSIZE)
if not data:
break
parser.feed(data)
j.add_progress()
except SAXException:
return
self.groups = handler.groups
for dupe_file in handler.marked:
self.mark(dupe_file)
def make_ref(self, dupe):
g = self.get_group_of_duplicate(dupe)
r = g.ref
self._remove_mark_flag(dupe)
g.switch_ref(dupe)
if not r.is_ref:
self.__total_count += 1
self.__total_size += r.size
if not dupe.is_ref:
self.__total_count -= 1
self.__total_size -= dupe.size
self.__dupes = None
def perform_on_marked(self, func, remove_from_results):
problems = []
for d in self.dupes:
if self.is_marked(d) and (not func(d)):
problems.append(d)
if remove_from_results:
to_remove = [d for d in self.dupes if self.is_marked(d) and (d not in problems)]
self.remove_duplicates(to_remove)
self.mark_none()
for d in problems:
self.mark(d)
return len(problems)
def remove_duplicates(self, dupes):
'''Remove 'dupes' from their respective group, and remove the group if it ends up empty.
'''
affected_groups = set()
for dupe in dupes:
group = self.get_group_of_duplicate(dupe)
if dupe not in group.dupes:
return
group.remove_dupe(dupe, False)
self._remove_mark_flag(dupe)
self.__total_count -= 1
self.__total_size -= dupe.size
if not group:
self.__groups.remove(group)
if self.__filtered_groups:
self.__filtered_groups.remove(group)
else:
affected_groups.add(group)
for group in affected_groups:
group.clean_matches()
self.__dupes = None
def save_to_xml(self, outfile, with_data=False):
self.apply_filter(None)
outfile, must_close = open_if_filename(outfile, 'wb')
writer = XMLGenerator(outfile, 'utf-8')
writer.startDocument()
empty_attrs = AttributesImpl({})
writer.startElement('results', empty_attrs)
for g in self.groups:
writer.startElement('group', empty_attrs)
dupe2index = {}
for index, d in enumerate(g):
dupe2index[d] = index
try:
words = engine.unpack_fields(d.words)
except AttributeError:
words = ()
attrs = AttributesImpl({
'path': unicode(d.path),
'is_ref': cond(d.is_ref, 'y', 'n'),
'words': ','.join(words),
'marked': cond(self.is_marked(d), 'y', 'n')
})
writer.startElement('file', attrs)
if with_data:
data_list = self.data.GetDisplayInfo(d, g)
for data in data_list:
attrs = AttributesImpl({
'value': data,
})
writer.startElement('data', attrs)
writer.endElement('data')
writer.endElement('file')
for match in g.matches:
attrs = AttributesImpl({
'first': str(dupe2index[match.first]),
'second': str(dupe2index[match.second]),
'percentage': str(int(match.percentage)),
})
writer.startElement('match', attrs)
writer.endElement('match')
writer.endElement('group')
writer.endElement('results')
writer.endDocument()
if must_close:
outfile.close()
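# Editor's sketch of the document produced above (illustrative values):
#
#   <results>
#     <group>
#       <file path="..." is_ref="y" words="foo,bar" marked="n"/>
#       <file path="..." is_ref="n" words="bar,bleh" marked="y"/>
#       <match first="0" second="1" percentage="50"/>
#     </group>
#   </results>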
def sort_dupes(self, key, asc=True, delta=False):
if not self.__dupes:
self.__get_dupe_list()
self.__dupes.sort(key=lambda d: self.data.GetDupeSortKey(d, lambda: self.get_group_of_duplicate(d), key, delta))
if not asc:
self.__dupes.reverse()
self.__dupes_sort_descriptor = (key,asc,delta)
def sort_groups(self,key,asc=True):
self.groups.sort(key=lambda g: self.data.GetGroupSortKey(g, key))
if not asc:
self.groups.reverse()
self.__groups_sort_descriptor = (key,asc)
#---Properties
dupes = property(__get_dupe_list)
groups = property(__get_groups, __set_groups)
stat_line = property(__get_stat_line)
class _ResultsHandler(handler.ContentHandler):
def __init__(self, get_file):
self.group = None
self.dupes = None
self.marked = set()
self.groups = []
self.get_file = get_file
def startElement(self, name, attrs):
if name == 'group':
self.group = engine.Group()
self.dupes = []
return
if (name == 'file') and (self.group is not None):
if not (('path' in attrs) and ('words' in attrs)):
return
path = attrs['path']
file = self.get_file(path)
if file is None:
return
file.words = attrs['words'].split(',')
file.is_ref = attrs.get('is_ref') == 'y'
self.dupes.append(file)
if attrs.get('marked') == 'y':
self.marked.add(file)
if (name == 'match') and (self.group is not None):
try:
first_file = self.dupes[int(attrs['first'])]
second_file = self.dupes[int(attrs['second'])]
percentage = int(attrs['percentage'])
self.group.add_match(engine.Match(first_file, second_file, percentage))
except (IndexError, KeyError, ValueError): # Covers missing attr, non-int values and indexes out of bounds
pass
def endElement(self, name):
def do_match(ref_file, other_files, group):
if not other_files:
return
for other_file in other_files:
group.add_match(engine.get_match(ref_file, other_file))
do_match(other_files[0], other_files[1:], group)
if name == 'group':
group = self.group
self.group = None
dupes = self.dupes
self.dupes = []
if group is None:
return
if len(dupes) < 2:
return
if not group.matches: # <match> elements not present, do it manually, without %
do_match(dupes[0], dupes[1:], group)
group.prioritize(lambda x: dupes.index(x))
self.groups.append(group)

742
py/results_test.py Normal file

@@ -0,0 +1,742 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.tests.results
Created By: Virgil Dupras
Created On: 2006/02/23
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import unittest
import StringIO
import xml.dom.minidom
import os.path as op
from hsutil.path import Path
from hsutil.testcase import TestCase
from hsutil.misc import first
from . import engine_test
from . import data
from . import engine
from .results import *
class NamedObject(engine_test.NamedObject):
size = 1
path = property(lambda x:Path('basepath') + x.name)
is_ref = False
def __nonzero__(self):
return False #Make sure that operations are performed correctly when the bool value of files is false.
# Returns a group set that looks like that:
# "foo bar" (1)
# "bar bleh" (1024)
# "foo bleh" (1)
# "ibabtu" (1)
# "ibabtu" (1)
def GetTestGroups():
objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
objects[1].size = 1024
matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
groups = engine.get_groups(matches) #We should have 2 groups
for g in groups:
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
groups.sort(key=len, reverse=True) # We want the group with 3 members to be first.
return (objects,matches,groups)
class TCResultsEmpty(TestCase):
def setUp(self):
self.results = Results(data)
def test_stat_line(self):
self.assertEqual("0 / 0 (0.00 B / 0.00 B) duplicates marked.",self.results.stat_line)
def test_groups(self):
self.assertEqual(0,len(self.results.groups))
def test_get_group_of_duplicate(self):
self.assert_(self.results.get_group_of_duplicate('foo') is None)
def test_save_to_xml(self):
f = StringIO.StringIO()
self.results.save_to_xml(f)
f.seek(0)
doc = xml.dom.minidom.parse(f)
root = doc.documentElement
self.assertEqual('results',root.nodeName)
class TCResultsWithSomeGroups(TestCase):
def setUp(self):
self.results = Results(data)
self.objects,self.matches,self.groups = GetTestGroups()
self.results.groups = self.groups
def test_stat_line(self):
self.assertEqual("0 / 3 (0.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
def test_groups(self):
self.assertEqual(2,len(self.results.groups))
def test_get_group_of_duplicate(self):
for o in self.objects:
g = self.results.get_group_of_duplicate(o)
self.assert_(isinstance(g, engine.Group))
self.assert_(o in g)
self.assert_(self.results.get_group_of_duplicate(self.groups[0]) is None)
def test_remove_duplicates(self):
g1,g2 = self.results.groups
self.results.remove_duplicates([g1.dupes[0]])
self.assertEqual(2,len(g1))
self.assert_(g1 in self.results.groups)
self.results.remove_duplicates([g1.ref])
self.assertEqual(2,len(g1))
self.assert_(g1 in self.results.groups)
self.results.remove_duplicates([g1.dupes[0]])
self.assertEqual(0,len(g1))
self.assert_(g1 not in self.results.groups)
self.results.remove_duplicates([g2.dupes[0]])
self.assertEqual(0,len(g2))
self.assert_(g2 not in self.results.groups)
self.assertEqual(0,len(self.results.groups))
def test_remove_duplicates_with_ref_files(self):
g1,g2 = self.results.groups
self.objects[0].is_ref = True
self.objects[1].is_ref = True
self.results.remove_duplicates([self.objects[2]])
self.assertEqual(0,len(g1))
self.assert_(g1 not in self.results.groups)
def test_make_ref(self):
g = self.results.groups[0]
d = g.dupes[0]
self.results.make_ref(d)
self.assert_(d is g.ref)
def test_sort_groups(self):
self.results.make_ref(self.objects[1]) #We want to make the 1024 sized object to go ref.
g1,g2 = self.groups
self.results.sort_groups(2) #2 is the key for size
self.assert_(self.results.groups[0] is g2)
self.assert_(self.results.groups[1] is g1)
self.results.sort_groups(2,False)
self.assert_(self.results.groups[0] is g1)
self.assert_(self.results.groups[1] is g2)
def test_set_groups_when_sorted(self):
self.results.make_ref(self.objects[1]) #We want to make the 1024 sized object to go ref.
self.results.sort_groups(2)
objects,matches,groups = GetTestGroups()
g1,g2 = groups
g1.switch_ref(objects[1])
self.results.groups = groups
self.assert_(self.results.groups[0] is g2)
self.assert_(self.results.groups[1] is g1)
def test_get_dupe_list(self):
self.assertEqual([self.objects[1],self.objects[2],self.objects[4]],self.results.dupes)
def test_dupe_list_is_cached(self):
self.assert_(self.results.dupes is self.results.dupes)
def test_dupe_list_cache_is_invalidated_when_needed(self):
o1,o2,o3,o4,o5 = self.objects
self.assertEqual([o2,o3,o5],self.results.dupes)
self.results.make_ref(o2)
self.assertEqual([o1,o3,o5],self.results.dupes)
objects,matches,groups = GetTestGroups()
o1,o2,o3,o4,o5 = objects
self.results.groups = groups
self.assertEqual([o2,o3,o5],self.results.dupes)
def test_dupe_list_sort(self):
o1,o2,o3,o4,o5 = self.objects
o1.size = 5
o2.size = 4
o3.size = 3
o4.size = 2
o5.size = 1
self.results.sort_dupes(2)
self.assertEqual([o5,o3,o2],self.results.dupes)
self.results.sort_dupes(2,False)
self.assertEqual([o2,o3,o5],self.results.dupes)
def test_dupe_list_remember_sort(self):
o1,o2,o3,o4,o5 = self.objects
o1.size = 5
o2.size = 4
o3.size = 3
o4.size = 2
o5.size = 1
self.results.sort_dupes(2)
self.results.make_ref(o2)
self.assertEqual([o5,o3,o1],self.results.dupes)
def test_dupe_list_sort_delta_values(self):
o1,o2,o3,o4,o5 = self.objects
o1.size = 10
o2.size = 2 #-8
o3.size = 3 #-7
o4.size = 20
o5.size = 1 #-19
self.results.sort_dupes(2,delta=True)
self.assertEqual([o5,o2,o3],self.results.dupes)
def test_sort_empty_list(self):
#There was an infinite loop when sorting an empty list.
r = Results(data)
r.sort_dupes(0)
self.assertEqual([],r.dupes)
def test_dupe_list_update_on_remove_duplicates(self):
o1,o2,o3,o4,o5 = self.objects
self.assertEqual(3,len(self.results.dupes))
self.results.remove_duplicates([o2])
self.assertEqual(2,len(self.results.dupes))
class TCResultsMarkings(TestCase):
def setUp(self):
self.results = Results(data)
self.objects,self.matches,self.groups = GetTestGroups()
self.results.groups = self.groups
def test_stat_line(self):
self.assertEqual("0 / 3 (0.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
self.results.mark(self.objects[1])
self.assertEqual("1 / 3 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
self.results.mark_invert()
self.assertEqual("2 / 3 (2.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
self.results.mark_invert()
self.results.unmark(self.objects[1])
self.results.mark(self.objects[2])
self.results.mark(self.objects[4])
self.assertEqual("2 / 3 (2.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
self.results.mark(self.objects[0]) #this is a ref, it can't be counted
self.assertEqual("2 / 3 (2.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
self.results.groups = self.groups
self.assertEqual("0 / 3 (0.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
def test_with_ref_duplicate(self):
self.objects[1].is_ref = True
self.results.groups = self.groups
self.assert_(not self.results.mark(self.objects[1]))
self.results.mark(self.objects[2])
self.assertEqual("1 / 2 (1.00 B / 2.00 B) duplicates marked.",self.results.stat_line)
def test_perform_on_marked(self):
def log_object(o):
log.append(o)
return True
log = []
self.results.mark_all()
self.results.perform_on_marked(log_object,False)
self.assert_(self.objects[1] in log)
self.assert_(self.objects[2] in log)
self.assert_(self.objects[4] in log)
self.assertEqual(3,len(log))
log = []
self.results.mark_none()
self.results.mark(self.objects[4])
self.results.perform_on_marked(log_object,True)
self.assertEqual(1,len(log))
self.assert_(self.objects[4] in log)
self.assertEqual(1,len(self.results.groups))
def test_perform_on_marked_with_problems(self):
def log_object(o):
log.append(o)
return o is not self.objects[1]
log = []
self.results.mark_all()
self.assert_(self.results.is_marked(self.objects[1]))
self.assertEqual(1,self.results.perform_on_marked(log_object, True))
self.assertEqual(3,len(log))
self.assertEqual(1,len(self.results.groups))
self.assertEqual(2,len(self.results.groups[0]))
self.assert_(self.objects[1] in self.results.groups[0])
self.assert_(not self.results.is_marked(self.objects[2]))
self.assert_(self.results.is_marked(self.objects[1]))
def test_perform_on_marked_with_ref(self):
def log_object(o):
log.append(o)
return True
log = []
self.objects[0].is_ref = True
self.objects[1].is_ref = True
self.results.mark_all()
self.results.perform_on_marked(log_object,True)
self.assert_(self.objects[1] not in log)
self.assert_(self.objects[2] in log)
self.assert_(self.objects[4] in log)
self.assertEqual(2,len(log))
self.assertEqual(0,len(self.results.groups))
def test_perform_on_marked_remove_objects_only_at_the_end(self):
def check_groups(o):
self.assertEqual(3,len(g1))
self.assertEqual(2,len(g2))
return True
g1,g2 = self.results.groups
self.results.mark_all()
self.results.perform_on_marked(check_groups,True)
self.assertEqual(0,len(g1))
self.assertEqual(0,len(g2))
self.assertEqual(0,len(self.results.groups))
def test_remove_duplicates(self):
g1 = self.results.groups[0]
g2 = self.results.groups[1]
self.results.mark(g1.dupes[0])
self.assertEqual("1 / 3 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
self.results.remove_duplicates([g1.dupes[1]])
self.assertEqual("1 / 2 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
self.results.remove_duplicates([g1.dupes[0]])
self.assertEqual("0 / 1 (0.00 B / 1.00 B) duplicates marked.",self.results.stat_line)
def test_make_ref(self):
g = self.results.groups[0]
d = g.dupes[0]
self.results.mark(d)
self.assertEqual("1 / 3 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
self.results.make_ref(d)
self.assertEqual("0 / 3 (0.00 B / 3.00 B) duplicates marked.",self.results.stat_line)
self.results.make_ref(d)
self.assertEqual("0 / 3 (0.00 B / 3.00 B) duplicates marked.",self.results.stat_line)
def test_SaveXML(self):
self.results.mark(self.objects[1])
self.results.mark_invert()
f = StringIO.StringIO()
self.results.save_to_xml(f)
f.seek(0)
doc = xml.dom.minidom.parse(f)
root = doc.documentElement
g1,g2 = root.getElementsByTagName('group')
d1,d2,d3 = g1.getElementsByTagName('file')
self.assertEqual('n',d1.getAttributeNode('marked').nodeValue)
self.assertEqual('n',d2.getAttributeNode('marked').nodeValue)
self.assertEqual('y',d3.getAttributeNode('marked').nodeValue)
d1,d2 = g2.getElementsByTagName('file')
self.assertEqual('n',d1.getAttributeNode('marked').nodeValue)
self.assertEqual('y',d2.getAttributeNode('marked').nodeValue)
def test_LoadXML(self):
def get_file(path):
return [f for f in self.objects if str(f.path) == path][0]
self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
self.results.mark(self.objects[1])
self.results.mark_invert()
f = StringIO.StringIO()
self.results.save_to_xml(f)
f.seek(0)
r = Results(data)
r.load_from_xml(f,get_file)
self.assert_(not r.is_marked(self.objects[0]))
self.assert_(not r.is_marked(self.objects[1]))
self.assert_(r.is_marked(self.objects[2]))
self.assert_(not r.is_marked(self.objects[3]))
self.assert_(r.is_marked(self.objects[4]))
class TCResultsXML(TestCase):
def setUp(self):
self.results = Results(data)
self.objects, self.matches, self.groups = GetTestGroups()
self.results.groups = self.groups
def get_file(self, path): # use this as a callback for load_from_xml
return [o for o in self.objects if o.path == path][0]
def test_save_to_xml(self):
self.objects[0].is_ref = True
self.objects[0].words = [['foo','bar']]
f = StringIO.StringIO()
self.results.save_to_xml(f)
f.seek(0)
doc = xml.dom.minidom.parse(f)
root = doc.documentElement
self.assertEqual('results',root.nodeName)
children = [c for c in root.childNodes if c.localName]
self.assertEqual(2,len(children))
self.assertEqual(2,len([c for c in children if c.nodeName == 'group']))
g1,g2 = children
children = [c for c in g1.childNodes if c.localName]
self.assertEqual(6,len(children))
self.assertEqual(3,len([c for c in children if c.nodeName == 'file']))
self.assertEqual(3,len([c for c in children if c.nodeName == 'match']))
d1,d2,d3 = [c for c in children if c.nodeName == 'file']
self.assertEqual(op.join('basepath','foo bar'),d1.getAttributeNode('path').nodeValue)
self.assertEqual(op.join('basepath','bar bleh'),d2.getAttributeNode('path').nodeValue)
self.assertEqual(op.join('basepath','foo bleh'),d3.getAttributeNode('path').nodeValue)
self.assertEqual('y',d1.getAttributeNode('is_ref').nodeValue)
self.assertEqual('n',d2.getAttributeNode('is_ref').nodeValue)
self.assertEqual('n',d3.getAttributeNode('is_ref').nodeValue)
self.assertEqual('foo,bar',d1.getAttributeNode('words').nodeValue)
self.assertEqual('bar,bleh',d2.getAttributeNode('words').nodeValue)
self.assertEqual('foo,bleh',d3.getAttributeNode('words').nodeValue)
children = [c for c in g2.childNodes if c.localName]
self.assertEqual(3,len(children))
self.assertEqual(2,len([c for c in children if c.nodeName == 'file']))
self.assertEqual(1,len([c for c in children if c.nodeName == 'match']))
d1,d2 = [c for c in children if c.nodeName == 'file']
self.assertEqual(op.join('basepath','ibabtu'),d1.getAttributeNode('path').nodeValue)
self.assertEqual(op.join('basepath','ibabtu'),d2.getAttributeNode('path').nodeValue)
self.assertEqual('n',d1.getAttributeNode('is_ref').nodeValue)
self.assertEqual('n',d2.getAttributeNode('is_ref').nodeValue)
self.assertEqual('ibabtu',d1.getAttributeNode('words').nodeValue)
self.assertEqual('ibabtu',d2.getAttributeNode('words').nodeValue)
def test_save_to_xml_with_columns(self):
class FakeDataModule:
def GetDisplayInfo(self,dupe,group):
return [str(dupe.size),dupe.foo.upper()]
for i,object in enumerate(self.objects):
object.size = i
object.foo = u'bar\u00e9'
f = StringIO.StringIO()
self.results.data = FakeDataModule()
self.results.save_to_xml(f,True)
f.seek(0)
doc = xml.dom.minidom.parse(f)
root = doc.documentElement
g1,g2 = root.getElementsByTagName('group')
d1,d2,d3 = g1.getElementsByTagName('file')
d4,d5 = g2.getElementsByTagName('file')
self.assertEqual('0',d1.getElementsByTagName('data')[0].getAttribute('value'))
self.assertEqual(u'BAR\u00c9',d1.getElementsByTagName('data')[1].getAttribute('value')) #\u00c9 is upper of \u00e9
self.assertEqual('1',d2.getElementsByTagName('data')[0].getAttribute('value'))
self.assertEqual('2',d3.getElementsByTagName('data')[0].getAttribute('value'))
self.assertEqual('3',d4.getElementsByTagName('data')[0].getAttribute('value'))
self.assertEqual('4',d5.getElementsByTagName('data')[0].getAttribute('value'))
def test_LoadXML(self):
def get_file(path):
return [f for f in self.objects if str(f.path) == path][0]
self.objects[0].is_ref = True
self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
f = StringIO.StringIO()
self.results.save_to_xml(f)
f.seek(0)
r = Results(data)
r.load_from_xml(f,get_file)
self.assertEqual(2,len(r.groups))
g1,g2 = r.groups
self.assertEqual(3,len(g1))
self.assert_(g1[0].is_ref)
self.assert_(not g1[1].is_ref)
self.assert_(not g1[2].is_ref)
self.assert_(g1[0] is self.objects[0])
self.assert_(g1[1] is self.objects[1])
self.assert_(g1[2] is self.objects[2])
self.assertEqual(['foo','bar'],g1[0].words)
self.assertEqual(['bar','bleh'],g1[1].words)
self.assertEqual(['foo','bleh'],g1[2].words)
self.assertEqual(2,len(g2))
self.assert_(not g2[0].is_ref)
self.assert_(not g2[1].is_ref)
self.assert_(g2[0] is self.objects[3])
self.assert_(g2[1] is self.objects[4])
self.assertEqual(['ibabtu'],g2[0].words)
self.assertEqual(['ibabtu'],g2[1].words)
def test_LoadXML_with_filename(self):
def get_file(path):
return [f for f in self.objects if str(f.path) == path][0]
filename = op.join(self.tmpdir(), 'dupeguru_results.xml')
self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
self.results.save_to_xml(filename)
r = Results(data)
r.load_from_xml(filename,get_file)
self.assertEqual(2,len(r.groups))
def test_LoadXML_with_some_files_that_dont_exist_anymore(self):
def get_file(path):
if path.endswith('ibabtu 2'):
return None
return [f for f in self.objects if str(f.path) == path][0]
self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
f = StringIO.StringIO()
self.results.save_to_xml(f)
f.seek(0)
r = Results(data)
r.load_from_xml(f,get_file)
self.assertEqual(1,len(r.groups))
self.assertEqual(3,len(r.groups[0]))
def test_LoadXML_missing_attributes_and_bogus_elements(self):
def get_file(path):
return [f for f in self.objects if str(f.path) == path][0]
doc = xml.dom.minidom.Document()
root = doc.appendChild(doc.createElement('foobar')) #The root element shouldn't matter, really.
group_node = root.appendChild(doc.createElement('group'))
dupe_node = group_node.appendChild(doc.createElement('file')) #Perfectly correct file
dupe_node.setAttribute('path',op.join('basepath','foo bar'))
dupe_node.setAttribute('is_ref','y')
dupe_node.setAttribute('words','foo,bar')
dupe_node = group_node.appendChild(doc.createElement('file')) #is_ref missing, default to 'n'
dupe_node.setAttribute('path',op.join('basepath','foo bleh'))
dupe_node.setAttribute('words','foo,bleh')
dupe_node = group_node.appendChild(doc.createElement('file')) #words are missing, invalid.
dupe_node.setAttribute('path',op.join('basepath','bar bleh'))
dupe_node = group_node.appendChild(doc.createElement('file')) #path is missing, invalid.
dupe_node.setAttribute('words','foo,bleh')
dupe_node = group_node.appendChild(doc.createElement('foobar')) #Invalid element name
dupe_node.setAttribute('path',op.join('basepath','bar bleh'))
dupe_node.setAttribute('is_ref','y')
dupe_node.setAttribute('words','bar,bleh')
match_node = group_node.appendChild(doc.createElement('match')) # match pointing to a bad index
match_node.setAttribute('first', '42')
match_node.setAttribute('second', '45')
match_node = group_node.appendChild(doc.createElement('match')) # match with missing attrs
match_node = group_node.appendChild(doc.createElement('match')) # match with non-int values
match_node.setAttribute('first', 'foo')
match_node.setAttribute('second', 'bar')
match_node.setAttribute('percentage', 'baz')
group_node = root.appendChild(doc.createElement('foobar')) #invalid group
group_node = root.appendChild(doc.createElement('group')) #empty group
f = StringIO.StringIO()
doc.writexml(f,'\t','\t','\n',encoding='utf-8')
f.seek(0)
r = Results(data)
r.load_from_xml(f,get_file)
self.assertEqual(1,len(r.groups))
self.assertEqual(2,len(r.groups[0]))
def test_xml_non_ascii(self):
def get_file(path):
if path == op.join('basepath',u'\xe9foo bar'):
return objects[0]
if path == op.join('basepath',u'bar bleh'):
return objects[1]
objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
matches = engine.MatchFactory().getmatches(objects) #we should have 1 match
groups = engine.get_groups(matches) #We should have 1 group
for g in groups:
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
results = Results(data)
results.groups = groups
f = StringIO.StringIO()
results.save_to_xml(f)
f.seek(0)
r = Results(data)
r.load_from_xml(f,get_file)
g = r.groups[0]
self.assertEqual(u"\xe9foo bar",g[0].name)
self.assertEqual(['efoo','bar'],g[0].words)
def test_load_invalid_xml(self):
f = StringIO.StringIO()
f.write('<this is invalid')
f.seek(0)
r = Results(data)
r.load_from_xml(f,None)
self.assertEqual(0,len(r.groups))
def test_load_non_existant_xml(self):
r = Results(data)
try:
r.load_from_xml('does_not_exist.xml', None)
except IOError:
self.fail()
self.assertEqual(0,len(r.groups))
def test_remember_match_percentage(self):
group = self.groups[0]
d1, d2, d3 = group
fake_matches = set()
fake_matches.add(engine.Match(d1, d2, 42))
fake_matches.add(engine.Match(d1, d3, 43))
fake_matches.add(engine.Match(d2, d3, 46))
group.matches = fake_matches
f = StringIO.StringIO()
results = self.results
results.save_to_xml(f)
f.seek(0)
results = Results(data)
results.load_from_xml(f, self.get_file)
group = results.groups[0]
d1, d2, d3 = group
match = group.get_match_of(d2) #d1 - d2
self.assertEqual(42, match[2])
match = group.get_match_of(d3) #d1 - d3
self.assertEqual(43, match[2])
group.switch_ref(d2)
match = group.get_match_of(d3) #d2 - d3
self.assertEqual(46, match[2])
def test_save_and_load(self):
# previously, when reloading matches, they wouldn't be reloaded as namedtuples
f = StringIO.StringIO()
self.results.save_to_xml(f)
f.seek(0)
self.results.load_from_xml(f, self.get_file)
first(self.results.groups[0].matches).percentage
class TCResultsFilter(TestCase):
def setUp(self):
self.results = Results(data)
self.objects, self.matches, self.groups = GetTestGroups()
self.results.groups = self.groups
self.results.apply_filter(r'foo')
def test_groups(self):
self.assertEqual(1, len(self.results.groups))
self.assert_(self.results.groups[0] is self.groups[0])
def test_dupes(self):
# There are 2 objects matching. The first one is ref. Only the 3rd one is supposed to be in dupes.
self.assertEqual(1, len(self.results.dupes))
self.assert_(self.results.dupes[0] is self.objects[2])
def test_cancel_filter(self):
self.results.apply_filter(None)
self.assertEqual(3, len(self.results.dupes))
self.assertEqual(2, len(self.results.groups))
def test_dupes_reconstructed_filtered(self):
# make_ref resets self.__dupes to None. When it's reconstructed, we want it filtered
dupe = self.results.dupes[0] #3rd object
self.results.make_ref(dupe)
self.assertEqual(1, len(self.results.dupes))
self.assert_(self.results.dupes[0] is self.objects[0])
def test_include_ref_dupes_in_filter(self):
# When only the ref of a group matches the filter, include the group in the results
self.results.apply_filter(None)
self.results.apply_filter(r'foo bar')
self.assertEqual(1, len(self.results.groups))
self.assertEqual(0, len(self.results.dupes))
def test_filters_build_on_one_another(self):
self.results.apply_filter(r'bar')
self.assertEqual(1, len(self.results.groups))
self.assertEqual(0, len(self.results.dupes))
def test_stat_line(self):
expected = '0 / 1 (0.00 B / 1.00 B) duplicates marked. filter: foo'
self.assertEqual(expected, self.results.stat_line)
self.results.apply_filter(r'bar')
expected = '0 / 0 (0.00 B / 0.00 B) duplicates marked. filter: foo --> bar'
self.assertEqual(expected, self.results.stat_line)
self.results.apply_filter(None)
expected = '0 / 3 (0.00 B / 1.01 KB) duplicates marked.'
self.assertEqual(expected, self.results.stat_line)
def test_mark_count_is_filtered_as_well(self):
self.results.apply_filter(None)
# We don't want to perform mark_all() because we want the mark list to contain objects
for dupe in self.results.dupes:
self.results.mark(dupe)
self.results.apply_filter(r'foo')
expected = '1 / 1 (1.00 B / 1.00 B) duplicates marked. filter: foo'
self.assertEqual(expected, self.results.stat_line)
def test_sort_groups(self):
self.results.apply_filter(None)
self.results.make_ref(self.objects[1]) # to have the 1024 B object as ref
g1,g2 = self.groups
self.results.apply_filter('a') # Matches both groups
self.results.sort_groups(2) #2 is the key for size
self.assert_(self.results.groups[0] is g2)
self.assert_(self.results.groups[1] is g1)
self.results.apply_filter(None)
self.assert_(self.results.groups[0] is g2)
self.assert_(self.results.groups[1] is g1)
self.results.sort_groups(2, False)
self.results.apply_filter('a')
self.assert_(self.results.groups[1] is g2)
self.assert_(self.results.groups[0] is g1)
def test_set_group(self):
#We want the new group to be filtered
self.objects, self.matches, self.groups = GetTestGroups()
self.results.groups = self.groups
self.assertEqual(1, len(self.results.groups))
self.assert_(self.results.groups[0] is self.groups[0])
def test_load_cancels_filter(self):
def get_file(path):
return [f for f in self.objects if str(f.path) == path][0]
filename = op.join(self.tmpdir(), 'dupeguru_results.xml')
self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
self.results.save_to_xml(filename)
r = Results(data)
r.apply_filter('foo')
r.load_from_xml(filename,get_file)
self.assertEqual(2,len(r.groups))
def test_remove_dupe(self):
self.results.remove_duplicates([self.results.dupes[0]])
self.results.apply_filter(None)
self.assertEqual(2,len(self.results.groups))
self.assertEqual(2,len(self.results.dupes))
self.results.apply_filter('ibabtu')
self.results.remove_duplicates([self.results.dupes[0]])
self.results.apply_filter(None)
self.assertEqual(1,len(self.results.groups))
self.assertEqual(1,len(self.results.dupes))
def test_filter_is_case_insensitive(self):
self.results.apply_filter(None)
self.results.apply_filter('FOO')
self.assertEqual(1, len(self.results.dupes))
def test_make_ref_on_filtered_out_doesnt_mess_stats(self):
# When filtered, a group containing filtered out dupes will display them as being reference.
# When calling make_ref on such a dupe, the total size and dupecount stats get messed up
# because they are *not* counted in the stats in the first place.
g1, g2 = self.groups
bar_bleh = g1[1] # The "bar bleh" dupe is filtered out
self.results.make_ref(bar_bleh)
# Now the stats should display *2* markable dupes (instead of 1)
expected = '0 / 2 (0.00 B / 2.00 B) duplicates marked. filter: foo'
self.assertEqual(expected, self.results.stat_line)
self.results.apply_filter(None) # Now let's make sure our unfiltered results aren't messed up
expected = '0 / 3 (0.00 B / 3.00 B) duplicates marked.'
self.assertEqual(expected, self.results.stat_line)
class TCResultsRefFile(unittest.TestCase):
def setUp(self):
self.results = Results(data)
self.objects, self.matches, self.groups = GetTestGroups()
self.objects[0].is_ref = True
self.objects[1].is_ref = True
self.results.groups = self.groups
def test_stat_line(self):
expected = '0 / 2 (0.00 B / 2.00 B) duplicates marked.'
self.assertEqual(expected, self.results.stat_line)
def test_make_ref(self):
d = self.results.groups[0].dupes[1] #non-ref
r = self.results.groups[0].ref
self.results.make_ref(d)
expected = '0 / 1 (0.00 B / 1.00 B) duplicates marked.'
self.assertEqual(expected, self.results.stat_line)
self.results.make_ref(r)
expected = '0 / 2 (0.00 B / 2.00 B) duplicates marked.'
self.assertEqual(expected, self.results.stat_line)
if __name__ == "__main__":
unittest.main()

131
py/scanner.py Normal file

@@ -0,0 +1,131 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.scanner
Created By: Virgil Dupras
Created On: 2006/03/03
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import logging
from ignore import IgnoreList
from hsutil import job
from hsutil.misc import dedupe
from hsutil.str import get_file_ext, rem_file_ext
from . import engine
(SCAN_TYPE_FILENAME,
SCAN_TYPE_FIELDS,
SCAN_TYPE_FIELDS_NO_ORDER,
SCAN_TYPE_TAG,
SCAN_TYPE_TAG_WITH_ALBUM, # Obsolete
SCAN_TYPE_CONTENT,
SCAN_TYPE_CONTENT_AUDIO) = range(7)
SCANNABLE_TAGS = ['track', 'artist', 'album', 'title', 'genre', 'year']
class Scanner(object):
def __init__(self):
self.ignore_list = IgnoreList()
self.discarded_file_count = 0
def _getmatches(self, files, j):
j = j.start_subjob(2)
mf = engine.MatchFactory()
if self.scan_type != SCAN_TYPE_CONTENT:
mf.match_similar_words = self.match_similar_words
mf.weight_words = self.word_weighting
mf.min_match_percentage = self.min_match_percentage
if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
self.scan_type = SCAN_TYPE_FIELDS
mf.no_field_order = True
if self.scan_type == SCAN_TYPE_TAG_WITH_ALBUM:
self.scan_type = SCAN_TYPE_TAG
self.scanned_tags = set(['artist', 'album', 'title'])
func = {
SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
SCAN_TYPE_CONTENT: lambda f: [str(f.size)],
SCAN_TYPE_CONTENT_AUDIO: lambda f: [str(f.audiosize)]
}[self.scan_type]
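        # Give each file the word list the match engine compares. Content scans
        # use the file size as a placeholder; the real comparison happens on
        # hashes in GetDupeGroups.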
for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
f.words = func(f)
return mf.getmatches(files, j)
@staticmethod
def _key_func(dupe):
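        # Sort key for group prioritization: reference files first, then larger files.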
return (not dupe.is_ref, -dupe.size)
@staticmethod
def _tie_breaker(ref, dupe):
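        # Returns True when dupe should take the ref spot: the current ref's name
        # contains 'copy', the ref's name is the dupe's name plus a digit suffix,
        # or the dupe's path is deeper.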
refname = rem_file_ext(ref.name).lower()
dupename = rem_file_ext(dupe.name).lower()
if 'copy' in refname and 'copy' not in dupename:
return True
if refname.startswith(dupename) and (refname[len(dupename):].strip().isdigit()):
return True
return len(dupe.path) > len(ref.path)
def GetDupeGroups(self, files, j=job.nulljob):
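        # Full pipeline: compute matches, weed out false ones (file kind, ignore
        # list, content hashes), group what remains, then prioritize each group's
        # reference file.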
j = j.start_subjob([8, 2])
for f in [f for f in files if not hasattr(f, 'is_ref')]:
f.is_ref = False
if self.size_threshold:
files = [f for f in files if f.size >= self.size_threshold]
logging.info('Getting matches')
if self.match_factory is None:
matches = self._getmatches(files, j)
else:
matches = self.match_factory.getmatches(files, j)
logging.info('Found %d matches' % len(matches))
if not self.mix_file_kind:
j.set_progress(100, 'Removing false matches')
matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
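        # Drop matches whose file pair is in the ignore list.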
if self.ignore_list:
j = j.start_subjob(2)
iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
matches = [m for m in iter_matches
if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
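            # So far, content matches are only based on size; confirm them with a
            # hash (md5partial for audio content, full md5 otherwise).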
md5attrname = 'md5partial' if self.scan_type == SCAN_TYPE_CONTENT_AUDIO else 'md5'
md5 = lambda f: getattr(f, md5attrname)
j = j.start_subjob(2)
for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
md5(matched_file)
j.set_progress(100, 'Removing false matches')
matches = [m for m in matches if md5(m.first) == md5(m.second)]
words_for_content = ['--'] # We compared md5. No words were involved.
for m in matches:
m.first.words = words_for_content
m.second.words = words_for_content
logging.info('Grouping matches')
groups = engine.get_groups(matches, j)
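        # Groups in which every file is a reference are of no interest; drop them.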
groups = [g for g in groups if any(not f.is_ref for f in g)]
logging.info('Created %d groups' % len(groups))
j.set_progress(100, 'Doing group prioritization')
for g in groups:
g.prioritize(self._key_func, self._tie_breaker)
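        # Files that matched something but made it into no group count as discarded.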
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
return groups
match_factory = None
match_similar_words = False
min_match_percentage = 80
mix_file_kind = True
scan_type = SCAN_TYPE_FILENAME
scanned_tags = set(['artist', 'title'])
size_threshold = 0
word_weighting = False
class ScannerME(Scanner): # Scanner for Music Edition
@staticmethod
def _key_func(dupe):
return (not dupe.is_ref, -dupe.bitrate, -dupe.size)
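# Minimal usage sketch (illustrative only, not part of the module): `files` is
# assumed to be a sequence of hsfs file objects exposing at least `name`,
# `size` and `path`:
#
#     scanner = Scanner()
#     scanner.scan_type = SCAN_TYPE_FILENAME
#     scanner.min_match_percentage = 90
#     groups = scanner.GetDupeGroups(files)
#     for group in groups:
#         print group.ref.path, [unicode(d.path) for d in group.dupes]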

468
py/scanner_test.py Normal file
View File

@@ -0,0 +1,468 @@
#!/usr/bin/env python
"""
Unit Name: dupeguru.tests.scanner
Created By: Virgil Dupras
Created On: 2006/03/03
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import unittest
from hsutil import job
from hsutil.path import Path
from hsutil.testcase import TestCase
from .engine import getwords, Match
from .ignore import IgnoreList
from .scanner import *
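# NamedObject is a minimal stand-in for the hsfs file objects the scanner
# normally works on: just a name, a size, a root path and precomputed words.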
class NamedObject(object):
def __init__(self, name="foobar", size=1):
self.name = name
self.size = size
self.path = Path('')
self.words = getwords(name)
no = NamedObject
class TCScanner(TestCase):
def test_empty(self):
s = Scanner()
r = s.GetDupeGroups([])
self.assertEqual([],r)
def test_default_settings(self):
s = Scanner()
self.assertEqual(80,s.min_match_percentage)
self.assertEqual(SCAN_TYPE_FILENAME,s.scan_type)
self.assertEqual(True,s.mix_file_kind)
self.assertEqual(False,s.word_weighting)
self.assertEqual(False,s.match_similar_words)
self.assert_(isinstance(s.ignore_list,IgnoreList))
def test_simple_with_default_settings(self):
s = Scanner()
f = [no('foo bar'),no('foo bar'),no('foo bleh')]
r = s.GetDupeGroups(f)
self.assertEqual(1,len(r))
g = r[0]
#'foo bleh' cannot be in the group because the default min match % is 80
self.assertEqual(2,len(g))
self.assert_(g.ref in f[:2])
self.assert_(g.dupes[0] in f[:2])
def test_simple_with_lower_min_match(self):
s = Scanner()
s.min_match_percentage = 50
f = [no('foo bar'),no('foo bar'),no('foo bleh')]
r = s.GetDupeGroups(f)
self.assertEqual(1,len(r))
g = r[0]
self.assertEqual(3,len(g))
def test_trim_all_ref_groups(self):
s = Scanner()
f = [no('foo'),no('foo'),no('bar'),no('bar')]
f[2].is_ref = True
f[3].is_ref = True
r = s.GetDupeGroups(f)
self.assertEqual(1,len(r))
def test_priorize(self):
s = Scanner()
f = [no('foo'),no('foo'),no('bar'),no('bar')]
f[1].size = 2
f[2].size = 3
f[3].is_ref = True
r = s.GetDupeGroups(f)
g1,g2 = r
self.assert_(f[1] in (g1.ref,g2.ref))
self.assert_(f[0] in (g1.dupes[0],g2.dupes[0]))
self.assert_(f[3] in (g1.ref,g2.ref))
self.assert_(f[2] in (g1.dupes[0],g2.dupes[0]))
def test_content_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'), no('bar'), no('bleh')]
f[0].md5 = 'foobar'
f[1].md5 = 'foobar'
f[2].md5 = 'bleh'
r = s.GetDupeGroups(f)
self.assertEqual(len(r), 1)
self.assertEqual(len(r[0]), 2)
self.assertEqual(s.discarded_file_count, 0) # don't count the different md5 as discarded!
def test_content_scan_compare_sizes_first(self):
class MyFile(no):
def get_md5(file):
self.fail()
md5 = property(get_md5)
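            # If the scanner ever computes md5 before comparing sizes, this fails the test.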
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [MyFile('foo',1),MyFile('bar',2)]
self.assertEqual(0,len(s.GetDupeGroups(f)))
def test_min_match_perc_doesnt_matter_for_content_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'),no('bar'),no('bleh')]
f[0].md5 = 'foobar'
f[1].md5 = 'foobar'
f[2].md5 = 'bleh'
s.min_match_percentage = 101
r = s.GetDupeGroups(f)
self.assertEqual(1,len(r))
self.assertEqual(2,len(r[0]))
s.min_match_percentage = 0
r = s.GetDupeGroups(f)
self.assertEqual(1,len(r))
self.assertEqual(2,len(r[0]))
def test_content_scan_puts_md5_in_words_at_the_end(self):
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'),no('bar')]
f[0].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
f[1].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
r = s.GetDupeGroups(f)
g = r[0]
self.assertEqual(['--'],g.ref.words)
self.assertEqual(['--'],g.dupes[0].words)
def test_extension_is_not_counted_in_filename_scan(self):
s = Scanner()
s.min_match_percentage = 100
f = [no('foo.bar'),no('foo.bleh')]
r = s.GetDupeGroups(f)
self.assertEqual(1,len(r))
self.assertEqual(2,len(r[0]))
def test_job(self):
def do_progress(progress,desc=''):
log.append(progress)
return True
s = Scanner()
log = []
f = [no('foo bar'),no('foo bar'),no('foo bleh')]
r = s.GetDupeGroups(f, job.Job(1,do_progress))
self.assertEqual(0,log[0])
self.assertEqual(100,log[-1])
def test_mix_file_kind(self):
s = Scanner()
s.mix_file_kind = False
f = [no('foo.1'),no('foo.2')]
r = s.GetDupeGroups(f)
self.assertEqual(0,len(r))
def test_word_weighting(self):
s = Scanner()
s.min_match_percentage = 75
s.word_weighting = True
f = [no('foo bar'),no('foo bar bleh')]
r = s.GetDupeGroups(f)
self.assertEqual(1,len(r))
g = r[0]
m = g.get_match_of(g.dupes[0])
self.assertEqual(75,m.percentage) # 16 letters, 12 matching
def test_similar_words(self):
s = Scanner()
s.match_similar_words = True
f = [no('The White Stripes'),no('The Whites Stripe'),no('Limp Bizkit'),no('Limp Bizkitt')]
r = s.GetDupeGroups(f)
self.assertEqual(2,len(r))
def test_fields(self):
s = Scanner()
s.scan_type = SCAN_TYPE_FIELDS
f = [no('The White Stripes - Little Ghost'),no('The White Stripes - Little Acorn')]
r = s.GetDupeGroups(f)
self.assertEqual(0,len(r))
def test_fields_no_order(self):
s = Scanner()
s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
f = [no('The White Stripes - Little Ghost'),no('Little Ghost - The White Stripes')]
r = s.GetDupeGroups(f)
self.assertEqual(1,len(r))
def test_tag_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
o1 = no('foo')
o2 = no('bar')
o1.artist = 'The White Stripes'
o1.title = 'The Air Near My Fingers'
o2.artist = 'The White Stripes'
o2.title = 'The Air Near My Fingers'
r = s.GetDupeGroups([o1,o2])
self.assertEqual(1,len(r))
def test_tag_with_album_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG_WITH_ALBUM
o1 = no('foo')
o2 = no('bar')
o3 = no('bleh')
o1.artist = 'The White Stripes'
o1.title = 'The Air Near My Fingers'
o1.album = 'Elephant'
o2.artist = 'The White Stripes'
o2.title = 'The Air Near My Fingers'
o2.album = 'Elephant'
o3.artist = 'The White Stripes'
o3.title = 'The Air Near My Fingers'
o3.album = 'foobar'
r = s.GetDupeGroups([o1,o2,o3])
self.assertEqual(1,len(r))
def test_that_dash_in_tags_dont_create_new_fields(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG_WITH_ALBUM
s.min_match_percentage = 50
o1 = no('foo')
o2 = no('bar')
o1.artist = 'The White Stripes - a'
o1.title = 'The Air Near My Fingers - a'
o1.album = 'Elephant - a'
o2.artist = 'The White Stripes - b'
o2.title = 'The Air Near My Fingers - b'
o2.album = 'Elephant - b'
r = s.GetDupeGroups([o1,o2])
self.assertEqual(1,len(r))
def test_tag_scan_with_different_scanned(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['track', 'year'])
o1 = no('foo')
o2 = no('bar')
o1.artist = 'The White Stripes'
o1.title = 'some title'
o1.track = 'foo'
o1.year = 'bar'
o2.artist = 'The White Stripes'
o2.title = 'another title'
o2.track = 'foo'
o2.year = 'bar'
r = s.GetDupeGroups([o1, o2])
self.assertEqual(1, len(r))
def test_tag_scan_only_scans_existing_tags(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['artist', 'foo'])
o1 = no('foo')
o2 = no('bar')
o1.artist = 'The White Stripes'
o1.foo = 'foo'
o2.artist = 'The White Stripes'
o2.foo = 'bar'
r = s.GetDupeGroups([o1, o2])
self.assertEqual(1, len(r)) # Because 'foo' is not scanned, they match
def test_tag_scan_converts_to_str(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['track'])
o1 = no('foo')
o2 = no('bar')
o1.track = 42
o2.track = 42
try:
r = s.GetDupeGroups([o1, o2])
except TypeError:
self.fail()
self.assertEqual(1, len(r))
def test_tag_scan_non_ascii(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['title'])
o1 = no('foo')
o2 = no('bar')
o1.title = u'foobar\u00e9'
o2.title = u'foobar\u00e9'
try:
r = s.GetDupeGroups([o1, o2])
except UnicodeEncodeError:
self.fail()
self.assertEqual(1, len(r))
def test_audio_content_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT_AUDIO
f = [no('foo'),no('bar'),no('bleh')]
f[0].md5 = 'foo'
f[1].md5 = 'bar'
f[2].md5 = 'bleh'
f[0].md5partial = 'foo'
f[1].md5partial = 'foo'
f[2].md5partial = 'bleh'
f[0].audiosize = 1
f[1].audiosize = 1
f[2].audiosize = 1
r = s.GetDupeGroups(f)
self.assertEqual(1,len(r))
self.assertEqual(2,len(r[0]))
def test_audio_content_scan_compare_sizes_first(self):
class MyFile(no):
def get_md5(file):
self.fail()
md5partial = property(get_md5)
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT_AUDIO
f = [MyFile('foo'),MyFile('bar')]
f[0].audiosize = 1
f[1].audiosize = 2
self.assertEqual(0,len(s.GetDupeGroups(f)))
def test_ignore_list(self):
s = Scanner()
f1 = no('foobar')
f2 = no('foobar')
f3 = no('foobar')
f1.path = Path('dir1/foobar')
f2.path = Path('dir2/foobar')
f3.path = Path('dir3/foobar')
s.ignore_list.Ignore(str(f1.path),str(f2.path))
s.ignore_list.Ignore(str(f1.path),str(f3.path))
r = s.GetDupeGroups([f1,f2,f3])
self.assertEqual(1,len(r))
g = r[0]
self.assertEqual(1,len(g.dupes))
self.assert_(f1 not in g)
self.assert_(f2 in g)
self.assert_(f3 in g)
# Ignored matches are not counted as discarded
self.assertEqual(s.discarded_file_count, 0)
def test_ignore_list_checks_for_unicode(self):
        # The scanner used to call path_str for ignore list checks. Since the Path class
        # changed, it must use unicode(path) instead.
s = Scanner()
f1 = no('foobar')
f2 = no('foobar')
f3 = no('foobar')
f1.path = Path(u'foo1\u00e9')
f2.path = Path(u'foo2\u00e9')
f3.path = Path(u'foo3\u00e9')
s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
r = s.GetDupeGroups([f1,f2,f3])
self.assertEqual(1,len(r))
g = r[0]
self.assertEqual(1,len(g.dupes))
self.assert_(f1 not in g)
self.assert_(f2 in g)
self.assert_(f3 in g)
def test_custom_match_factory(self):
class MatchFactory(object):
def getmatches(self,objects,j=None):
return [Match(objects[0], objects[1], 420)]
s = Scanner()
s.match_factory = MatchFactory()
o1,o2 = no('foo'),no('bar')
groups = s.GetDupeGroups([o1,o2])
self.assertEqual(1,len(groups))
g = groups[0]
self.assertEqual(2,len(g))
g.switch_ref(o1)
m = g.get_match_of(o2)
self.assertEqual((o1,o2,420),m)
def test_file_evaluates_to_false(self):
        # A very wrong way to use any() was added at some point, causing the resulting
        # group list to be empty.
class FalseNamedObject(NamedObject):
def __nonzero__(self):
return False
s = Scanner()
f1 = FalseNamedObject('foobar')
f2 = FalseNamedObject('foobar')
r = s.GetDupeGroups([f1,f2])
self.assertEqual(1,len(r))
def test_size_threshold(self):
        # Only files whose size is equal to or higher than size_threshold are scanned
s = Scanner()
f1 = no('foo', 1)
f2 = no('foo', 2)
f3 = no('foo', 3)
s.size_threshold = 2
groups = s.GetDupeGroups([f1,f2,f3])
self.assertEqual(len(groups), 1)
[group] = groups
self.assertEqual(len(group), 2)
self.assertTrue(f1 not in group)
self.assertTrue(f2 in group)
self.assertTrue(f3 in group)
def test_tie_breaker_path_deepness(self):
        # If there is a tie in prioritization, path depth is used as a tie breaker (the
        # deeper file becomes the ref)
s = Scanner()
o1, o2 = no('foo'), no('foo')
o1.path = Path('foo')
o2.path = Path('foo/bar')
[group] = s.GetDupeGroups([o1, o2])
self.assertTrue(group.ref is o2)
def test_tie_breaker_copy(self):
        # If 'copy' is in a file's words (even if that file has a deeper path), it becomes a dupe
s = Scanner()
o1, o2 = no('foo bar Copy'), no('foo bar')
o1.path = Path('deeper/path')
o2.path = Path('foo')
[group] = s.GetDupeGroups([o1, o2])
self.assertTrue(group.ref is o2)
def test_tie_breaker_same_name_plus_digit(self):
        # If the ref has the same words as the dupe plus just one extra word which is a
        # digit, it becomes a dupe
s = Scanner()
o1, o2 = no('foo bar 42'), no('foo bar')
o1.path = Path('deeper/path')
o2.path = Path('foo')
[group] = s.GetDupeGroups([o1, o2])
self.assertTrue(group.ref is o2)
def test_partial_group_match(self):
        # Count the number of discarded matches (when a file doesn't match all other dupes
        # of the group) in Scanner.discarded_file_count
s = Scanner()
o1, o2, o3 = no('a b'), no('a'), no('b')
s.min_match_percentage = 50
[group] = s.GetDupeGroups([o1, o2, o3])
self.assertEqual(len(group), 2)
self.assertTrue(o1 in group)
self.assertTrue(o2 in group)
self.assertTrue(o3 not in group)
self.assertEqual(s.discarded_file_count, 1)
class TCScannerME(TestCase):
def test_priorize(self):
        # in ScannerME, bitrate goes first (right after is_ref) in prioritization
s = ScannerME()
o1, o2 = no('foo'), no('foo')
o1.bitrate = 1
o2.bitrate = 2
[group] = s.GetDupeGroups([o1, o2])
self.assertTrue(group.ref is o2)
if __name__ == "__main__":
unittest.main()