mirror of https://github.com/arsenetar/dupeguru.git synced 2026-01-25 16:11:39 +00:00

Compare commits


22 Commits

Author SHA1 Message Date
hsoft
911521d8e0 dgpe qt: build related fixes.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40217
2009-10-24 16:30:37 +00:00
hsoft
b25c1c3a3b Added dgpe 1.7.8 to the changelog.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40215
2009-10-24 14:18:36 +00:00
hsoft
37a40040b3 [#73 state:port] Fixed a bug causing some matches to be ignored in the new pe match algo.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40212
2009-10-24 13:54:57 +00:00
hsoft
25dadc83eb dgpe cocoa: adjusted to hsfs removal.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40210
2009-10-24 12:21:39 +00:00
hsoft
b8c11b5aae dgpe cocoa: removed hsfs from externals.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40209
2009-10-24 12:21:09 +00:00
hsoft
a3ab314378 dgpe qt: adjusted to the hsfs move.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40208
2009-10-23 15:04:37 +00:00
hsoft
794192835d dgme cocoa: added dupeguru_me external and removed the hsfs one.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40207
2009-10-23 14:46:00 +00:00
hsoft
385768a69b dgme qt: adjusted code to the hsfs move.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40206
2009-10-23 14:35:51 +00:00
hsoft
a281931b16 dgme qt: added the dupeguru_me external and removed the hsfs one.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40205
2009-10-23 14:34:59 +00:00
hsoft
085311d559 Added the folder me/py
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40204
2009-10-23 14:05:06 +00:00
hsoft
4d7f032889 dgse cocoa: fixed quirks created by the hsfs move.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40203
2009-10-23 13:46:18 +00:00
hsoft
cf44c93013 dgse cocoa: added the dupeguru_se external and removed the hsfs one.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40202
2009-10-23 13:45:15 +00:00
hsoft
787cbcd01f dgse qt: removed hsfs external
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40201
2009-10-23 12:59:29 +00:00
hsoft
b2b316b642 dgse qt: removed all hsfs usages.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40200
2009-10-23 12:56:52 +00:00
hsoft
49165125e4 dg se: Moved se-specific code from dupeguru to dupeguru_se.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40199
2009-10-23 08:19:48 +00:00
hsoft
54ac0fd19e dg qt: oops, *now* I added the external ref.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40198
2009-10-23 08:19:02 +00:00
hsoft
0aff7f16e5 dg qt: Added the dupeguru_se external.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40197
2009-10-23 08:17:35 +00:00
hsoft
f9abc3b35d Added a dupeguru_se sub-package.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40196
2009-10-23 08:02:43 +00:00
hsoft
b167a51243 Added dupeguru.fs, which is a simpler fork of hsfs and aims to replace it in the dupeguru project.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40195
2009-10-22 15:23:32 +00:00
hsoft
371cdda911 dgpe cocoa: adjusted to MatchFactory removal.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40194
2009-10-18 09:29:33 +00:00
hsoft
11977c6533 dgpe: adjusted to the MatchFactory removal.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193
2009-10-18 09:26:04 +00:00
hsoft
7228adf433 Changed the MatchFactory into a simple getmatch method, and added a separate getmatches_by_contents() method for contents scan, which results in faster and less memory hungry scans.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40192
2009-10-18 08:46:00 +00:00
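
The last commit above (r192) replaces the MatchFactory class with plain module-level functions. A minimal sketch of the call-site change, using only the signatures that appear in the engine.py diff below; the DummyFile stand-in is hypothetical, for illustration:

from dupeguru import engine

class DummyFile(object):
    def __init__(self, name):
        self.name = name  # getmatches() derives .words from .name when missing

files = [DummyFile("foo bar"), DummyFile("bar bleh")]

# Before r192: configure a factory instance, then call its method.
#   mf = engine.MatchFactory()
#   mf.min_match_percentage = 50
#   matches = mf.getmatches(files)

# After r192: the options become keyword arguments of a module-level function.
matches = engine.getmatches(files, min_match_percentage=50)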
44 changed files with 1168 additions and 705 deletions

View File

@@ -14,13 +14,13 @@ import os
import os.path as op
import logging
from hsutil import job, io, files
from hsutil import io, files
from hsutil.path import Path
from hsutil.reg import RegistrableApplication, RegistrationRequired
from hsutil.misc import flatten, first
from hsutil.str import escape
from . import directories, results, scanner, export
from . import directories, results, scanner, export, fs
JOB_SCAN = 'job_scan'
JOB_LOAD = 'job_load'
@@ -98,13 +98,8 @@ class DupeGuru(RegistrableApplication):
return ['---'] * len(self.data.COLUMNS)
def _get_file(self, str_path):
p = Path(str_path)
for d in self.directories:
if p not in d.path:
continue
result = d.find_path(p[d.path:])
if result is not None:
return result
path = Path(str_path)
return fs.get_file(path, self.directories.fileclasses)
@staticmethod
def _recycle_dupe(dupe):
@@ -150,7 +145,7 @@ class DupeGuru(RegistrableApplication):
2 = absolute re-creation.
"""
source_path = dupe.path
location_path = dupe.root.path
location_path = first(p for p in self.directories if dupe.path in p)
dest_path = Path(destination)
if dest_type == 2:
dest_path = dest_path + source_path[1:-1] #Remove drive letter and filename

View File

@@ -12,13 +12,12 @@ from AppKit import *
import logging
import os.path as op
import hsfs as fs
from hsutil import io, cocoa, job
from hsutil.cocoa import install_exception_hook
from hsutil.misc import stripnone
from hsutil.reg import RegistrationRequired
import app, data
from . import app, fs
JOBID2TITLE = {
app.JOB_SCAN: "Scanning for duplicates",
@@ -43,8 +42,6 @@ class DupeGuru(app.DupeGuru):
logging.basicConfig(level=LOGGING_LEVEL, format='%(levelname)s %(message)s')
logging.debug('started in debug mode')
install_exception_hook()
if data_module is None:
data_module = data
appsupport = NSSearchPathForDirectoriesInDomains(NSApplicationSupportDirectory, NSUserDomainMask, True)[0]
appdata = op.join(appsupport, appdata_subdir)
app.DupeGuru.__init__(self, data_module, appdata, appid)
@@ -91,15 +88,15 @@ class DupeGuru(app.DupeGuru):
except IndexError:
return (None,None)
def GetDirectory(self,node_path,curr_dir=None):
def get_folder_path(self, node_path, curr_path=None):
if not node_path:
return curr_dir
if curr_dir is not None:
l = curr_dir.dirs
return curr_path
current_index = node_path[0]
if curr_path is None:
curr_path = self.directories[current_index]
else:
l = self.directories
d = l[node_path[0]]
return self.GetDirectory(node_path[1:],d)
curr_path = self.directories.get_subfolders(curr_path)[current_index]
return self.get_folder_path(node_path[1:], curr_path)
def RefreshDetailsTable(self,dupe,group):
l1 = self._get_display_info(dupe, group, False)
@@ -146,13 +143,13 @@ class DupeGuru(app.DupeGuru):
def RemoveSelected(self):
self.results.remove_duplicates(self.selected_dupes)
def RenameSelected(self,newname):
def RenameSelected(self, newname):
try:
d = self.selected_dupes[0]
d = d.move(d.parent,newname)
d.rename(newname)
return True
except (IndexError,fs.FSError),e:
logging.warning("dupeGuru Warning: %s" % str(e))
except (IndexError, fs.FSError) as e:
logging.warning("dupeGuru Warning: %s" % unicode(e))
return False
def RevealSelected(self):
@@ -214,9 +211,9 @@ class DupeGuru(app.DupeGuru):
self.results.dupes[row] for row in rows if row in xrange(len(self.results.dupes))
]
def SetDirectoryState(self,node_path,state):
d = self.GetDirectory(node_path)
self.directories.set_state(d.path,state)
def SetDirectoryState(self, node_path, state):
p = self.get_folder_path(node_path)
self.directories.set_state(p, state)
def sort_dupes(self,key,asc):
self.results.sort_dupes(key,asc,self.display_delta_values)
@@ -245,8 +242,12 @@ class DupeGuru(app.DupeGuru):
return [len(g.dupes) for g in self.results.groups]
elif tag == 1: #Directories
try:
dirs = self.GetDirectory(node_path).dirs if node_path else self.directories
return [d.dircount for d in dirs]
if node_path:
path = self.get_folder_path(node_path)
subfolders = self.directories.get_subfolders(path)
else:
subfolders = self.directories
return [len(self.directories.get_subfolders(path)) for path in subfolders]
except IndexError: # node_path out of range
return []
else: #Power Marker
@@ -270,8 +271,9 @@ class DupeGuru(app.DupeGuru):
return result
elif tag == 1: #Directories
try:
d = self.GetDirectory(node_path)
return [d.name, self.directories.get_state(d.path)]
path = self.get_folder_path(node_path)
name = unicode(path) if len(node_path) == 1 else path[-1]
return [name, self.directories.get_state(path)]
except IndexError: # node_path out of range
return []

View File

@@ -40,63 +40,3 @@ def format_dupe_count(c):
def cmp_value(value):
return value.lower() if isinstance(value, basestring) else value
COLUMNS = [
{'attr':'name','display':'Filename'},
{'attr':'path','display':'Directory'},
{'attr':'size','display':'Size (KB)'},
{'attr':'extension','display':'Kind'},
{'attr':'ctime','display':'Creation'},
{'attr':'mtime','display':'Modification'},
{'attr':'percentage','display':'Match %'},
{'attr':'words','display':'Words Used'},
{'attr':'dupe_count','display':'Dupe Count'},
]
METADATA_TO_READ = ['size', 'ctime', 'mtime']
def GetDisplayInfo(dupe, group, delta):
size = dupe.size
ctime = dupe.ctime
mtime = dupe.mtime
m = group.get_match_of(dupe)
if m:
percentage = m.percentage
dupe_count = 0
if delta:
r = group.ref
size -= r.size
ctime -= r.ctime
mtime -= r.mtime
else:
percentage = group.percentage
dupe_count = len(group.dupes)
return [
dupe.name,
format_path(dupe.path),
format_size(size, 0, 1, False),
dupe.extension,
format_timestamp(ctime, delta and m),
format_timestamp(mtime, delta and m),
format_perc(percentage),
format_words(dupe.words),
format_dupe_count(dupe_count)
]
def GetDupeSortKey(dupe, get_group, key, delta):
if key == 6:
m = get_group().get_match_of(dupe)
return m.percentage
if key == 8:
return 0
r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
if delta and (key in (2, 4, 5)):
r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
return r
def GetGroupSortKey(group, key):
if key == 6:
return group.percentage
if key == 8:
return len(group)
return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

View File

@@ -9,11 +9,12 @@
import xml.dom.minidom
from hsfs import phys
import hsfs as fs
from hsutil import io
from hsutil.files import FileOrPath
from hsutil.path import Path
from . import fs
(STATE_NORMAL,
STATE_REFERENCE,
STATE_EXCLUDED) = range(3)
@@ -26,15 +27,14 @@ class InvalidPathError(Exception):
class Directories(object):
#---Override
def __init__(self):
def __init__(self, fileclasses=[fs.File]):
self._dirs = []
self.states = {}
self.dirclass = phys.Directory
self.special_dirclasses = {}
self.fileclasses = fileclasses
def __contains__(self,path):
for d in self._dirs:
if path in d.path:
def __contains__(self, path):
for p in self._dirs:
if path in p:
return True
return False
@@ -53,8 +53,7 @@ class Directories(object):
if path[-1].startswith('.'): # hidden
return STATE_EXCLUDED
def _get_files(self, from_dir):
from_path = from_dir.path
def _get_files(self, from_path):
state = self.get_state(from_path)
if state == STATE_EXCLUDED:
# Recursively getting files from folders with lots of subfolders is expensive. However, there
@@ -62,14 +61,21 @@ class Directories(object):
# through self.states and see if we must continue, or we can stop right here to save time
if not any(p[:len(from_path)] == from_path for p in self.states):
return
result = []
for subdir in from_dir.dirs:
for file in self._get_files(subdir):
yield file
if state != STATE_EXCLUDED:
for file in from_dir.files:
file.is_ref = state == STATE_REFERENCE
yield file
try:
filepaths = set()
if state != STATE_EXCLUDED:
for file in fs.get_files(from_path, fileclasses=self.fileclasses):
file.is_ref = state == STATE_REFERENCE
filepaths.add(file.path)
yield file
subpaths = [from_path + name for name in io.listdir(from_path)]
# it's possible that a folder (bundle) gets into the file list. in that case, we don't want to recurse into it
subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p) and p not in filepaths]
for subfolder in subfolders:
for file in self._get_files(subfolder):
yield file
except (EnvironmentError, fs.InvalidPath):
pass
#---Public
def add_path(self, path):
@@ -80,29 +86,30 @@ class Directories(object):
under it will be removed. Can also raise InvalidPathError if 'path' does not exist.
"""
if path in self:
raise AlreadyThereError
self._dirs = [d for d in self._dirs if d.path not in path]
try:
dirclass = self.special_dirclasses.get(path, self.dirclass)
d = dirclass(None, unicode(path))
d[:] #If an InvalidPath exception has to be raised, it will be raised here
self._dirs.append(d)
return d
except fs.InvalidPath:
raise AlreadyThereError()
if not io.exists(path):
raise InvalidPathError()
self._dirs = [p for p in self._dirs if p not in path]
self._dirs.append(path)
@staticmethod
def get_subfolders(path):
"""returns a sorted list of paths corresponding to subfolders in `path`"""
try:
names = [name for name in io.listdir(path) if io.isdir(path + name)]
names.sort(key=lambda x:x.lower())
return [path + name for name in names]
except EnvironmentError:
return []
def get_files(self):
"""Returns a list of all files that are not excluded.
Returned files also have their 'is_ref' attr set.
"""
for d in self._dirs:
d.force_update()
try:
for file in self._get_files(d):
yield file
except fs.InvalidPath:
pass
for path in self._dirs:
for file in self._get_files(path):
yield file
def get_state(self, path):
"""Returns the state of 'path' (One of the STATE_* const.)
@@ -123,8 +130,8 @@ class Directories(object):
doc = xml.dom.minidom.parse(infile)
except:
return
root_dir_nodes = doc.getElementsByTagName('root_directory')
for rdn in root_dir_nodes:
root_path_nodes = doc.getElementsByTagName('root_directory')
for rdn in root_path_nodes:
if not rdn.getAttributeNode('path'):
continue
path = rdn.getAttributeNode('path').nodeValue
@@ -144,9 +151,9 @@ class Directories(object):
with FileOrPath(outfile, 'wb') as fp:
doc = xml.dom.minidom.Document()
root = doc.appendChild(doc.createElement('directories'))
for root_dir in self:
root_dir_node = root.appendChild(doc.createElement('root_directory'))
root_dir_node.setAttribute('path', unicode(root_dir.path).encode('utf-8'))
for root_path in self:
root_path_node = root.appendChild(doc.createElement('root_directory'))
root_path_node.setAttribute('path', unicode(root_path).encode('utf-8'))
for path, state in self.states.iteritems():
state_node = root.appendChild(doc.createElement('state'))
state_node.setAttribute('path', unicode(path).encode('utf-8'))
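
Put together, the directories.py changes above turn Directories into a collection of plain Path objects that is walked lazily at scan time, instead of a tree of hsfs directory wrappers. A minimal usage sketch, assuming only the methods shown in this diff; the '/photos' paths are hypothetical:

from hsutil.path import Path
from dupeguru.directories import Directories, STATE_REFERENCE, STATE_EXCLUDED

d = Directories()                                 # fileclasses defaults to [fs.File]
d.add_path(Path('/photos'))                       # raises InvalidPathError if it doesn't exist
d.set_state(Path('/photos/originals'), STATE_REFERENCE)
d.set_state(Path('/photos/cache'), STATE_EXCLUDED)
files = list(d.get_files())                       # recurses, skipping excluded subtrees
# every yielded file has its is_ref attribute set from its folder's state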

View File

@@ -9,6 +9,7 @@
from __future__ import division
import difflib
import itertools
import logging
import string
from collections import defaultdict, namedtuple
@@ -156,58 +157,69 @@ def get_match(first, second, flags=()):
percentage = compare(first.words, second.words, flags)
return Match(first, second, percentage)
class MatchFactory(object):
common_word_threshold = 50
match_similar_words = False
min_match_percentage = 0
weight_words = False
no_field_order = False
limit = 5000000
def getmatches(self, objects, j=job.nulljob):
j = j.start_subjob(2)
sj = j.start_subjob(2)
for o in objects:
if not hasattr(o, 'words'):
o.words = getwords(o.name)
word_dict = build_word_dict(objects, sj)
reduce_common_words(word_dict, self.common_word_threshold)
if self.match_similar_words:
merge_similar_words(word_dict)
match_flags = []
if self.weight_words:
match_flags.append(WEIGHT_WORDS)
if self.match_similar_words:
match_flags.append(MATCH_SIMILAR_WORDS)
if self.no_field_order:
match_flags.append(NO_FIELD_ORDER)
j.start_job(len(word_dict), '0 matches found')
compared = defaultdict(set)
result = []
try:
# This whole 'popping' thing is there to avoid taking too much memory at the same time.
while word_dict:
items = word_dict.popitem()[1]
while items:
ref = items.pop()
compared_already = compared[ref]
to_compare = items - compared_already
compared_already |= to_compare
for other in to_compare:
m = get_match(ref, other, match_flags)
if m.percentage >= self.min_match_percentage:
result.append(m)
if len(result) >= self.limit:
return result
j.add_progress(desc='%d matches found' % len(result))
except MemoryError:
# This is the place where the memory usage is at its peak during the scan.
# Just continue the process with an incomplete list of matches.
del compared # This should give us enough room to call logging.
logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
return result
def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
no_field_order=False, j=job.nulljob):
COMMON_WORD_THRESHOLD = 50
LIMIT = 5000000
j = j.start_subjob(2)
sj = j.start_subjob(2)
for o in objects:
if not hasattr(o, 'words'):
o.words = getwords(o.name)
word_dict = build_word_dict(objects, sj)
reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
if match_similar_words:
merge_similar_words(word_dict)
match_flags = []
if weight_words:
match_flags.append(WEIGHT_WORDS)
if match_similar_words:
match_flags.append(MATCH_SIMILAR_WORDS)
if no_field_order:
match_flags.append(NO_FIELD_ORDER)
j.start_job(len(word_dict), '0 matches found')
compared = defaultdict(set)
result = []
try:
# This whole 'popping' thing is there to avoid taking too much memory at the same time.
while word_dict:
items = word_dict.popitem()[1]
while items:
ref = items.pop()
compared_already = compared[ref]
to_compare = items - compared_already
compared_already |= to_compare
for other in to_compare:
m = get_match(ref, other, match_flags)
if m.percentage >= min_match_percentage:
result.append(m)
if len(result) >= LIMIT:
return result
j.add_progress(desc='%d matches found' % len(result))
except MemoryError:
# This is the place where the memory usage is at its peak during the scan.
# Just continue the process with an incomplete list of matches.
del compared # This should give us enough room to call logging.
logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
return result
return result
def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
j = j.start_subjob([2, 8])
size2files = defaultdict(set)
for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
size2files[getattr(file, sizeattr)].add(file)
possible_matches = [files for files in size2files.values() if len(files) > 1]
del size2files
result = []
j.start_job(len(possible_matches), '0 matches found')
for group in possible_matches:
for first, second in itertools.combinations(group, 2):
if first.md5partial == second.md5partial:
if partial or first.md5 == second.md5:
result.append(Match(first, second, 100))
j.add_progress(desc='%d matches found' % len(result))
return result
class Group(object):
#---Override
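
The new getmatches_by_contents() above replaces the word-based pre-pass for content scans: files are bucketed by size, and digests are compared only within same-sized buckets. A brief sketch of a direct call, assuming fs.File objects from the new module below; the paths are hypothetical:

from hsutil.path import Path
from dupeguru import engine, fs

f1 = fs.get_file(Path('/photos/a.jpg'))
f2 = fs.get_file(Path('/photos/b.jpg'))
# md5partial (a 16KB slice of each file) is checked first; the full md5
# is computed only when partial=False and the partial digests agree.
matches = engine.getmatches_by_contents([f1, f2])
assert all(m.percentage == 100 for m in matches)  # content matches are always 100%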

base/py/fs.py (new file, 178 lines)
View File

@@ -0,0 +1,178 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-22
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
# This is a fork from hsfs. The reason for this fork is that hsfs has been designed for musicGuru
# and was re-used for dupeGuru. The problem is that hsfs is way over-engineered for dupeGuru,
# resulting in needless complexity and memory usage. It's been a while since I wanted to do that fork,
# and I'm doing it now.
from __future__ import unicode_literals
import hashlib
import logging
from hsutil import io
from hsutil.misc import nonone, flatten
from hsutil.str import get_file_ext
class FSError(Exception):
cls_message = "An error has occured on '{name}' in '{parent}'"
def __init__(self, fsobject, parent=None):
message = self.cls_message
if isinstance(fsobject, basestring):
name = fsobject
elif isinstance(fsobject, File):
name = fsobject.name
else:
name = ''
parentname = unicode(parent) if parent is not None else ''
Exception.__init__(self, message.format(name=name, parent=parentname))
class AlreadyExistsError(FSError):
"The directory or file name we're trying to add already exists"
cls_message = "'{name}' already exists in '{parent}'"
class InvalidPath(FSError):
"The path of self is invalid, and cannot be worked with."
cls_message = "'{name}' is invalid."
class InvalidDestinationError(FSError):
"""A copy/move operation has been called, but the destination is invalid."""
cls_message = "'{name}' is an invalid destination for this operation."
class OperationError(FSError):
"""A copy/move/delete operation has been called, but the checkup after the
operation shows that it didn't work."""
cls_message = "Operation on '{name}' failed."
class File(object):
INITIAL_INFO = {
'size': 0,
'ctime': 0,
'mtime': 0,
'md5': '',
'md5partial': '',
}
def __init__(self, path):
self.path = path
#This offset is where we should start reading the file to get a partial md5
#For audio files, it should be where the audio data starts
self._md5partial_offset = 0x4000 #16KB
self._md5partial_size = 0x4000 #16KB
def __getattr__(self, attrname):
# Only called when attr is not there
if attrname in self.INITIAL_INFO:
try:
self._read_info(attrname)
except Exception as e:
logging.warning("An error '%s' was raised while decoding '%s'", e, repr(self.path))
try:
return self.__dict__[attrname]
except KeyError:
return self.INITIAL_INFO[attrname]
raise AttributeError()
def _read_info(self, field):
if field in ('size', 'ctime', 'mtime'):
stats = io.stat(self.path)
self.size = nonone(stats.st_size, 0)
self.ctime = nonone(stats.st_ctime, 0)
self.mtime = nonone(stats.st_mtime, 0)
elif field == 'md5partial':
try:
fp = io.open(self.path, 'rb')
offset = self._md5partial_offset
size = self._md5partial_size
fp.seek(offset)
partialdata = fp.read(size)
md5 = hashlib.md5(partialdata)
self.md5partial = md5.digest()
fp.close()
except Exception:
pass
elif field == 'md5':
try:
fp = io.open(self.path, 'rb')
filedata = fp.read()
md5 = hashlib.md5(filedata)
self.md5 = md5.digest()
fp.close()
except Exception:
pass
def _read_all_info(self, attrnames=None):
"""Cache all possible info.
If `attrnames` is not None, caches only attrnames.
"""
if attrnames is None:
attrnames = self.INITIAL_INFO.keys()
for attrname in attrnames:
if attrname not in self.__dict__:
self._read_info(attrname)
#--- Public
@classmethod
def can_handle(cls, path):
return not io.islink(path) and io.isfile(path)
def rename(self, newname):
if newname == self.name:
return
destpath = self.path[:-1] + newname
if io.exists(destpath):
raise AlreadyExistsError(newname, self.path[:-1])
try:
io.rename(self.path, destpath)
except EnvironmentError:
raise OperationError(self)
if not io.exists(destpath):
raise OperationError(self)
self.path = destpath
#--- Properties
@property
def extension(self):
return get_file_ext(self.name)
@property
def name(self):
return self.path[-1]
def get_file(path, fileclasses=[File]):
for fileclass in fileclasses:
if fileclass.can_handle(path):
return fileclass(path)
def get_files(path, fileclasses=[File]):
assert all(issubclass(fileclass, File) for fileclass in fileclasses)
try:
paths = [path + name for name in io.listdir(path)]
result = []
for path in paths:
file = get_file(path, fileclasses=fileclasses)
if file is not None:
result.append(file)
return result
except EnvironmentError:
raise InvalidPath(path)
def get_all_files(path, fileclasses=[File]):
files = get_files(path, fileclasses=fileclasses)
filepaths = set(f.path for f in files)
subpaths = [path + name for name in io.listdir(path)]
# it's possible that a folder (bundle) gets into the file list. in that case, we don't want to recurse into it
subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p) and p not in filepaths]
subfiles = flatten(get_all_files(subpath, fileclasses=fileclasses) for subpath in subfolders)
return subfiles + files
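
A note on the File class above: metadata is read lazily via __getattr__, so constructing a File does no I/O, and each INITIAL_INFO attribute is computed and cached on first access. A minimal sketch under that assumption (path hypothetical):

from hsutil.path import Path
from dupeguru import fs

f = fs.File(Path('/photos/a.jpg'))  # no disk access yet
f.size                              # first access stats the file (size, ctime, mtime)
f.md5partial                        # hashes a 16KB slice starting at offset 16KB
f.md5                               # full-content digest, computed only on demand
f.rename('b.jpg')                   # AlreadyExistsError if 'b.jpg' is already there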

View File

@@ -32,40 +32,32 @@ class Scanner(object):
self.ignore_list = IgnoreList()
self.discarded_file_count = 0
@staticmethod
def _filter_matches_by_content(matches, partial, j):
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
md5attrname = 'md5partial' if partial else 'md5'
md5 = lambda f: getattr(f, md5attrname)
for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
md5(matched_file)
j.set_progress(100, 'Removing false matches')
return [m for m in matches if md5(m.first) == md5(m.second)]
def _getmatches(self, files, j):
j = j.start_subjob(2)
mf = engine.MatchFactory()
if self.scan_type != SCAN_TYPE_CONTENT:
mf.match_similar_words = self.match_similar_words
mf.weight_words = self.word_weighting
mf.min_match_percentage = self.min_match_percentage
if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
self.scan_type = SCAN_TYPE_FIELDS
mf.no_field_order = True
func = {
SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
SCAN_TYPE_CONTENT: lambda f: [str(f.size)],
SCAN_TYPE_CONTENT_AUDIO: lambda f: [str(f.audiosize)]
}[self.scan_type]
for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
if self.size_threshold:
f.size # pre-read, makes a smoother progress if read here (especially for bundles)
f.words = func(f)
if self.size_threshold:
j = j.start_subjob([2, 8])
for f in j.iter_with_progress(files, 'Read size of %d/%d files'):
f.size # pre-read, makes a smoother progress if read here (especially for bundles)
files = [f for f in files if f.size >= self.size_threshold]
return mf.getmatches(files, j)
if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
sizeattr = 'size' if self.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==SCAN_TYPE_CONTENT_AUDIO, j=j)
else:
j = j.start_subjob([2, 8])
kw = {}
kw['match_similar_words'] = self.match_similar_words
kw['weight_words'] = self.word_weighting
kw['min_match_percentage'] = self.min_match_percentage
if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
self.scan_type = SCAN_TYPE_FIELDS
kw['no_field_order'] = True
func = {
SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
}[self.scan_type]
for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
f.words = func(f)
return engine.getmatches(files, j=j, **kw)
@staticmethod
def _key_func(dupe):
@@ -86,10 +78,7 @@ class Scanner(object):
for f in [f for f in files if not hasattr(f, 'is_ref')]:
f.is_ref = False
logging.info('Getting matches')
if self.match_factory is None:
matches = self._getmatches(files, j)
else:
matches = self.match_factory.getmatches(files, j)
matches = self._getmatches(files, j)
logging.info('Found %d matches' % len(matches))
if not self.mix_file_kind:
j.set_progress(100, 'Removing false matches')
@@ -99,14 +88,6 @@ class Scanner(object):
iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
matches = [m for m in iter_matches
if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
matches = self._filter_matches_by_content(matches, partial=True, j=j)
if self.scan_type == SCAN_TYPE_CONTENT:
matches = self._filter_matches_by_content(matches, partial=False, j=j)
# We compared md5. No words were involved.
for m in matches:
m.first.words = m.second.words = ['--']
logging.info('Grouping matches')
groups = engine.get_groups(matches, j)
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
@@ -118,7 +99,6 @@ class Scanner(object):
g.prioritize(self._key_func, self._tie_breaker)
return groups
match_factory = None
match_similar_words = False
min_match_percentage = 80
mix_file_kind = True
@@ -126,9 +106,3 @@ class Scanner(object):
scanned_tags = set(['artist', 'title'])
size_threshold = 0
word_weighting = False
class ScannerME(Scanner): # Scanner for Music Edition
@staticmethod
def _key_func(dupe):
return (not dupe.is_ref, -dupe.bitrate, -dupe.size)
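
With the dispatch above, content and audio-content scans route straight to engine.getmatches_by_contents(), while word scans assemble keyword arguments for engine.getmatches(). A minimal sketch of driving a scan end to end, assuming the Scanner attributes shown in this diff and the GetDupeGroups() entry point exercised by the tests further down:

from dupeguru.scanner import Scanner, SCAN_TYPE_CONTENT

s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
groups = s.GetDupeGroups(files)    # `files` from Directories.get_files(), as sketched earlier
for g in groups:
    ref, dupes = g.ref, g.dupes    # the group's reference file and its duplicates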

View File

@@ -18,10 +18,10 @@ from hsutil.path import Path
from hsutil.testcase import TestCase
from hsutil.decorators import log_calls
from hsutil import io
import hsfs.phys
from . import data
from .results_test import GetTestGroups
from .. import engine, data
from .. import engine, fs
try:
from ..app_cocoa import DupeGuru as DupeGuruBase
except ImportError:
@@ -35,7 +35,6 @@ class DupeGuru(DupeGuruBase):
def _start_job(self, jobid, func):
func(nulljob)
def r2np(rows):
#Transforms a list of rows [1,2,3] into a list of node paths [[1],[2],[3]]
return [[i] for i in rows]
@@ -310,15 +309,15 @@ class TCDupeGuru(TestCase):
class TCDupeGuru_renameSelected(TestCase):
def setUp(self):
p = Path(tempfile.mkdtemp())
fp = open(str(p + 'foo bar 1'),mode='w')
p = self.tmppath()
fp = open(unicode(p + 'foo bar 1'),mode='w')
fp.close()
fp = open(str(p + 'foo bar 2'),mode='w')
fp = open(unicode(p + 'foo bar 2'),mode='w')
fp.close()
fp = open(str(p + 'foo bar 3'),mode='w')
fp = open(unicode(p + 'foo bar 3'),mode='w')
fp.close()
refdir = hsfs.phys.Directory(None,str(p))
matches = engine.MatchFactory().getmatches(refdir.files)
files = fs.get_files(p)
matches = engine.getmatches(files)
groups = engine.get_groups(matches)
g = groups[0]
g.prioritize(lambda x:x.name)
@@ -327,45 +326,41 @@ class TCDupeGuru_renameSelected(TestCase):
self.app = app
self.groups = groups
self.p = p
self.refdir = refdir
def tearDown(self):
shutil.rmtree(str(self.p))
self.files = files
def test_simple(self):
app = self.app
refdir = self.refdir
g = self.groups[0]
app.SelectPowerMarkerNodePaths(r2np([0]))
self.assert_(app.RenameSelected('renamed'))
self.assert_('renamed' in refdir)
self.assert_('foo bar 2' not in refdir)
self.assert_(g.dupes[0] is refdir['renamed'])
self.assert_(g.dupes[0] in refdir)
assert app.RenameSelected('renamed')
names = io.listdir(self.p)
assert 'renamed' in names
assert 'foo bar 2' not in names
eq_(g.dupes[0].name, 'renamed')
def test_none_selected(self):
app = self.app
refdir = self.refdir
g = self.groups[0]
app.SelectPowerMarkerNodePaths([])
self.mock(logging, 'warning', log_calls(lambda msg: None))
self.assert_(not app.RenameSelected('renamed'))
assert not app.RenameSelected('renamed')
msg = logging.warning.calls[0]['msg']
self.assertEqual('dupeGuru Warning: list index out of range', msg)
self.assert_('renamed' not in refdir)
self.assert_('foo bar 2' in refdir)
self.assert_(g.dupes[0] is refdir['foo bar 2'])
eq_('dupeGuru Warning: list index out of range', msg)
names = io.listdir(self.p)
assert 'renamed' not in names
assert 'foo bar 2' in names
eq_(g.dupes[0].name, 'foo bar 2')
def test_name_already_exists(self):
app = self.app
refdir = self.refdir
g = self.groups[0]
app.SelectPowerMarkerNodePaths(r2np([0]))
self.mock(logging, 'warning', log_calls(lambda msg: None))
self.assert_(not app.RenameSelected('foo bar 1'))
assert not app.RenameSelected('foo bar 1')
msg = logging.warning.calls[0]['msg']
self.assert_(msg.startswith('dupeGuru Warning: \'foo bar 2\' already exists in'))
self.assert_('foo bar 1' in refdir)
self.assert_('foo bar 2' in refdir)
self.assert_(g.dupes[0] is refdir['foo bar 2'])
assert msg.startswith('dupeGuru Warning: \'foo bar 1\' already exists in')
names = io.listdir(self.p)
assert 'foo bar 1' in names
assert 'foo bar 2' in names
eq_(g.dupes[0].name, 'foo bar 2')

View File

@@ -13,12 +13,11 @@ from hsutil.testcase import TestCase
from hsutil import io
from hsutil.path import Path
from hsutil.decorators import log_calls
import hsfs as fs
import hsfs.phys
import hsutil.files
from hsutil.job import nulljob
from .. import data, app
from . import data
from .. import app, fs
from ..app import DupeGuru as DupeGuruBase
class DupeGuru(DupeGuruBase):
@@ -59,27 +58,27 @@ class TCDupeGuru(TestCase):
# The goal here is just to have a test for a previous blowup I had. I know my test coverage
# for this unit is pathetic. What's done is done. My approach now is to add tests for
# every change I want to make. The blowup was caused by a missing import.
dupe_parent = fs.Directory(None, 'foo')
dupe = fs.File(dupe_parent, 'bar')
dupe.copy = log_calls(lambda dest, newname: None)
p = self.tmppath()
io.open(p + 'foo', 'w').close()
self.mock(hsutil.files, 'copy', log_calls(lambda source_path, dest_path: None))
self.mock(os, 'makedirs', lambda path: None) # We don't want the test to create that fake directory
self.mock(fs.phys, 'Directory', fs.Directory) # We don't want an error because makedirs didn't work
app = DupeGuru()
app.copy_or_move(dupe, True, 'some_destination', 0)
app.directories.add_path(p)
[f] = app.directories.get_files()
app.copy_or_move(f, True, 'some_destination', 0)
self.assertEqual(1, len(hsutil.files.copy.calls))
call = hsutil.files.copy.calls[0]
self.assertEqual('some_destination', call['dest_path'])
self.assertEqual(dupe.path, call['source_path'])
self.assertEqual(f.path, call['source_path'])
def test_copy_or_move_clean_empty_dirs(self):
tmppath = Path(self.tmpdir())
sourcepath = tmppath + 'source'
io.mkdir(sourcepath)
io.open(sourcepath + 'myfile', 'w')
tmpdir = hsfs.phys.Directory(None, unicode(tmppath))
myfile = tmpdir['source']['myfile']
app = DupeGuru()
app.directories.add_path(tmppath)
[myfile] = app.directories.get_files()
self.mock(app, 'clean_empty_dirs', log_calls(lambda path: None))
app.copy_or_move(myfile, False, tmppath + 'dest', 0)
calls = app.clean_empty_dirs.calls
@@ -87,9 +86,14 @@ class TCDupeGuru(TestCase):
self.assertEqual(sourcepath, calls[0]['path'])
def test_Scan_with_objects_evaluating_to_false(self):
class FakeFile(fs.File):
def __nonzero__(self):
return False
# At some point, any() was used in a wrong way that made Scan() wrongly return 1
app = DupeGuru()
f1, f2 = [fs.File(None, 'foo') for i in range(2)]
f1, f2 = [FakeFile('foo') for i in range(2)]
f1.is_ref, f2.is_ref = (False, False)
assert not (bool(f1) and bool(f2))
app.directories.get_files = lambda: [f1, f2]

base/py/tests/data.py (new file, 45 lines)
View File

@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
# data module for tests
from hsutil.str import format_size
from dupeguru.data import format_path, cmp_value
COLUMNS = [
{'attr':'name','display':'Filename'},
{'attr':'path','display':'Directory'},
{'attr':'size','display':'Size (KB)'},
{'attr':'extension','display':'Kind'},
]
METADATA_TO_READ = ['size']
def GetDisplayInfo(dupe, group, delta):
size = dupe.size
m = group.get_match_of(dupe)
if m and delta:
r = group.ref
size -= r.size
return [
dupe.name,
format_path(dupe.path),
format_size(size, 0, 1, False),
dupe.extension,
]
def GetDupeSortKey(dupe, get_group, key, delta):
r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
if delta and (key == 2):
r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
return r
def GetGroupSortKey(group, key):
return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

View File

@@ -10,20 +10,43 @@
import os.path as op
import os
import time
import shutil
from nose.tools import eq_
from hsutil import job, io
from hsutil import io
from hsutil.path import Path
from hsutil.testcase import TestCase
import hsfs.phys
from hsfs.tests import phys_test
from ..directories import *
testpath = Path(TestCase.datadirpath())
def create_fake_fs(rootpath):
rootpath = rootpath + 'fs'
io.mkdir(rootpath)
io.mkdir(rootpath + 'dir1')
io.mkdir(rootpath + 'dir2')
io.mkdir(rootpath + 'dir3')
fp = io.open(rootpath + 'file1.test', 'w')
fp.write('1')
fp.close()
fp = io.open(rootpath + 'file2.test', 'w')
fp.write('12')
fp.close()
fp = io.open(rootpath + 'file3.test', 'w')
fp.write('123')
fp.close()
fp = io.open(rootpath + ('dir1', 'file1.test'), 'w')
fp.write('1')
fp.close()
fp = io.open(rootpath + ('dir2', 'file2.test'), 'w')
fp.write('12')
fp.close()
fp = io.open(rootpath + ('dir3', 'file3.test'), 'w')
fp.write('123')
fp.close()
return rootpath
class TCDirectories(TestCase):
def test_empty(self):
d = Directories()
@@ -33,13 +56,11 @@ class TCDirectories(TestCase):
def test_add_path(self):
d = Directories()
p = testpath + 'utils'
added = d.add_path(p)
d.add_path(p)
self.assertEqual(1,len(d))
self.assert_(p in d)
self.assert_((p + 'foobar') in d)
self.assert_(p[:-1] not in d)
self.assertEqual(p,added.path)
self.assert_(d[0] is added)
p = self.tmppath()
d.add_path(p)
self.assertEqual(2,len(d))
@@ -53,13 +74,13 @@ class TCDirectories(TestCase):
self.assertRaises(AlreadyThereError, d.add_path, p + 'foobar')
self.assertEqual(1, len(d))
def test_AddPath_containing_paths_already_there(self):
def test_add_path_containing_paths_already_there(self):
d = Directories()
d.add_path(testpath + 'utils')
self.assertEqual(1, len(d))
added = d.add_path(testpath)
self.assertEqual(1, len(d))
self.assert_(added is d[0])
d.add_path(testpath)
eq_(len(d), 1)
eq_(d[0], testpath)
def test_AddPath_non_latin(self):
p = Path(self.tmpdir())
@@ -114,7 +135,7 @@ class TCDirectories(TestCase):
def test_set_state_keep_state_dict_size_to_minimum(self):
d = Directories()
p = Path(phys_test.create_fake_fs(self.tmpdir()))
p = create_fake_fs(self.tmppath())
d.add_path(p)
d.set_state(p,STATE_REFERENCE)
d.set_state(p + 'dir1',STATE_REFERENCE)
@@ -129,7 +150,7 @@ class TCDirectories(TestCase):
def test_get_files(self):
d = Directories()
p = Path(phys_test.create_fake_fs(self.tmpdir()))
p = create_fake_fs(self.tmppath())
d.add_path(p)
d.set_state(p + 'dir1',STATE_REFERENCE)
d.set_state(p + 'dir2',STATE_EXCLUDED)
@@ -177,52 +198,28 @@ class TCDirectories(TestCase):
except LookupError:
self.fail()
def test_default_dirclass(self):
self.assert_(Directories().dirclass is hsfs.phys.Directory)
def test_dirclass(self):
class MySpecialDirclass(hsfs.phys.Directory): pass
d = Directories()
d.dirclass = MySpecialDirclass
d.add_path(testpath)
self.assert_(isinstance(d[0], MySpecialDirclass))
def test_load_from_file_with_invalid_path(self):
#This test simulates a load from file resulting in a
#InvalidPath raise. Other directories must be loaded.
d1 = Directories()
d1.add_path(testpath + 'utils')
#Will raise InvalidPath upon loading
d1.add_path(self.tmppath()).name = 'does_not_exist'
p = self.tmppath()
d1.add_path(p)
io.rmdir(p)
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
d1.save_to_file(tmpxml)
d2 = Directories()
d2.load_from_file(tmpxml)
self.assertEqual(1, len(d2))
def test_load_from_file_with_same_paths(self):
#This test simulates a load from file resulting in a
#AlreadyExists raise. Other directories must be loaded.
d1 = Directories()
p1 = self.tmppath()
p2 = self.tmppath()
d1.add_path(p1)
d1.add_path(p2)
#Will raise AlreadyExists upon loading
d1.add_path(self.tmppath()).name = unicode(p1)
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
d1.save_to_file(tmpxml)
d2 = Directories()
d2.load_from_file(tmpxml)
self.assertEqual(2, len(d2))
def test_unicode_save(self):
d = Directories()
p1 = self.tmppath() + u'hello\xe9'
io.mkdir(p1)
io.mkdir(p1 + u'foo\xe9')
d.add_path(p1)
d.set_state(d[0][0].path, STATE_EXCLUDED)
d.set_state(p1 + u'foo\xe9', STATE_EXCLUDED)
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
try:
d.save_to_file(tmpxml)
@@ -231,7 +228,7 @@ class TCDirectories(TestCase):
def test_get_files_refreshes_its_directories(self):
d = Directories()
p = Path(phys_test.create_fake_fs(self.tmpdir()))
p = create_fake_fs(self.tmppath())
d.add_path(p)
files = d.get_files()
self.assertEqual(6, len(list(files)))
@@ -258,16 +255,6 @@ class TCDirectories(TestCase):
d.set_state(hidden_dir_path, STATE_NORMAL)
self.assertEqual(d.get_state(hidden_dir_path), STATE_NORMAL)
def test_special_dirclasses(self):
# if a path is in special_dirclasses, use this class instead
class MySpecialDirclass(hsfs.phys.Directory): pass
d = Directories()
p1 = self.tmppath()
p2 = self.tmppath()
d.special_dirclasses[p1] = MySpecialDirclass
self.assert_(isinstance(d.add_path(p2), hsfs.phys.Directory))
self.assert_(isinstance(d.add_path(p1), MySpecialDirclass))
def test_default_path_state_override(self):
# It's possible for a subclass to override the default state of a path
class MyDirectories(Directories):

View File

@@ -340,21 +340,13 @@ class TCget_match(TestCase):
self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)
class TCMatchFactory(TestCase):
class GetMatches(TestCase):
def test_empty(self):
self.assertEqual([],MatchFactory().getmatches([]))
def test_defaults(self):
mf = MatchFactory()
self.assertEqual(50,mf.common_word_threshold)
self.assertEqual(False,mf.weight_words)
self.assertEqual(False,mf.match_similar_words)
self.assertEqual(False,mf.no_field_order)
self.assertEqual(0,mf.min_match_percentage)
eq_(getmatches([]), [])
def test_simple(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
r = MatchFactory().getmatches(l)
r = getmatches(l)
self.assertEqual(2,len(r))
seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
m = seek[0]
@@ -367,7 +359,7 @@ class TCMatchFactory(TestCase):
def test_null_and_unrelated_objects(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
r = MatchFactory().getmatches(l)
r = getmatches(l)
self.assertEqual(1,len(r))
m = r[0]
self.assertEqual(50,m.percentage)
@@ -376,34 +368,33 @@ class TCMatchFactory(TestCase):
def test_twice_the_same_word(self):
l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
r = MatchFactory().getmatches(l)
r = getmatches(l)
self.assertEqual(1,len(r))
def test_twice_the_same_word_when_preworded(self):
l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
r = MatchFactory().getmatches(l)
r = getmatches(l)
self.assertEqual(1,len(r))
def test_two_words_match(self):
l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
r = MatchFactory().getmatches(l)
r = getmatches(l)
self.assertEqual(1,len(r))
def test_match_files_with_only_common_words(self):
#If a word occurs more than 50 times, it is excluded from the matching process
#The problem with the common_word_threshold is that the files containing only common
#words will never be matched together. We *should* match them.
mf = MatchFactory()
mf.common_word_threshold = 50
# This test assumes that the common word threshold const is 50
l = [NamedObject("foo") for i in range(50)]
r = mf.getmatches(l)
r = getmatches(l)
self.assertEqual(1225,len(r))
def test_use_words_already_there_if_there(self):
o1 = NamedObject('foo')
o2 = NamedObject('bar')
o2.words = ['foo']
self.assertEqual(1,len(MatchFactory().getmatches([o1,o2])))
eq_(1, len(getmatches([o1,o2])))
def test_job(self):
def do_progress(p,d=''):
@@ -413,75 +404,62 @@ class TCMatchFactory(TestCase):
j = job.Job(1,do_progress)
self.log = []
s = "foo bar"
MatchFactory().getmatches([NamedObject(s),NamedObject(s),NamedObject(s)],j)
getmatches([NamedObject(s), NamedObject(s), NamedObject(s)], j=j)
self.assert_(len(self.log) > 2)
self.assertEqual(0,self.log[0])
self.assertEqual(100,self.log[-1])
def test_weight_words(self):
mf = MatchFactory()
mf.weight_words = True
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
m = mf.getmatches(l)[0]
m = getmatches(l, weight_words=True)[0]
self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)
def test_similar_word(self):
mf = MatchFactory()
mf.match_similar_words = True
l = [NamedObject("foobar"),NamedObject("foobars")]
self.assertEqual(1,len(mf.getmatches(l)))
self.assertEqual(100,mf.getmatches(l)[0].percentage)
eq_(len(getmatches(l, match_similar_words=True)), 1)
eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
l = [NamedObject("foobar"),NamedObject("foo")]
self.assertEqual(0,len(mf.getmatches(l))) #too far
eq_(len(getmatches(l, match_similar_words=True)), 0) #too far
l = [NamedObject("bizkit"),NamedObject("bizket")]
self.assertEqual(1,len(mf.getmatches(l)))
eq_(len(getmatches(l, match_similar_words=True)), 1)
l = [NamedObject("foobar"),NamedObject("foosbar")]
self.assertEqual(1,len(mf.getmatches(l)))
eq_(len(getmatches(l, match_similar_words=True)), 1)
def test_single_object_with_similar_words(self):
mf = MatchFactory()
mf.match_similar_words = True
l = [NamedObject("foo foos")]
self.assertEqual(0,len(mf.getmatches(l)))
eq_(len(getmatches(l, match_similar_words=True)), 0)
def test_double_words_get_counted_only_once(self):
mf = MatchFactory()
l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
m = mf.getmatches(l)[0]
m = getmatches(l)[0]
self.assertEqual(75,m.percentage)
def test_with_fields(self):
mf = MatchFactory()
o1 = NamedObject("foo bar - foo bleh")
o2 = NamedObject("foo bar - bleh bar")
o1.words = getfields(o1.name)
o2.words = getfields(o2.name)
m = mf.getmatches([o1, o2])[0]
m = getmatches([o1, o2])[0]
self.assertEqual(50, m.percentage)
def test_with_fields_no_order(self):
mf = MatchFactory()
mf.no_field_order = True
o1 = NamedObject("foo bar - foo bleh")
o2 = NamedObject("bleh bang - foo bar")
o1.words = getfields(o1.name)
o2.words = getfields(o2.name)
m = mf.getmatches([o1, o2])[0]
self.assertEqual(50 ,m.percentage)
m = getmatches([o1, o2], no_field_order=True)[0]
eq_(m.percentage, 50)
def test_only_match_similar_when_the_option_is_set(self):
mf = MatchFactory()
mf.match_similar_words = False
l = [NamedObject("foobar"),NamedObject("foobars")]
self.assertEqual(0,len(mf.getmatches(l)))
eq_(len(getmatches(l, match_similar_words=False)), 0)
def test_dont_recurse_do_match(self):
# with nosetests, the stack is increased. The number has to be high enough not to fail falsely
sys.setrecursionlimit(100)
mf = MatchFactory()
files = [NamedObject('foo bar') for i in range(101)]
try:
mf.getmatches(files)
getmatches(files)
except RuntimeError:
self.fail()
finally:
@@ -489,18 +467,9 @@ class TCMatchFactory(TestCase):
def test_min_match_percentage(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
mf = MatchFactory()
mf.min_match_percentage = 50
r = mf.getmatches(l)
r = getmatches(l, min_match_percentage=50)
self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match
def test_limit(self):
l = [NamedObject(),NamedObject(),NamedObject()]
mf = MatchFactory()
mf.limit = 2
r = mf.getmatches(l)
self.assertEqual(2,len(r))
def test_MemoryError(self):
@log_calls
def mocked_match(first, second, flags):
@@ -510,9 +479,8 @@ class TCMatchFactory(TestCase):
objects = [NamedObject() for i in range(10)] # results in 45 matches
self.mock(engine, 'get_match', mocked_match)
mf = MatchFactory()
try:
r = mf.getmatches(objects)
r = getmatches(objects)
except MemoryError:
self.fail('MemoryError must be handled')
self.assertEqual(42, len(r))
@@ -738,7 +706,7 @@ class TCget_groups(TestCase):
def test_simple(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
matches = MatchFactory().getmatches(l)
matches = getmatches(l)
m = matches[0]
r = get_groups(matches)
self.assertEqual(1,len(r))
@@ -749,7 +717,7 @@ class TCget_groups(TestCase):
def test_group_with_multiple_matches(self):
#This results in 3 matches
l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
matches = MatchFactory().getmatches(l)
matches = getmatches(l)
r = get_groups(matches)
self.assertEqual(1,len(r))
g = r[0]
@@ -759,7 +727,7 @@ class TCget_groups(TestCase):
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
#There will be 2 groups here: group "a b" and group "c d"
#"b c" can go either of them, but not both.
matches = MatchFactory().getmatches(l)
matches = getmatches(l)
r = get_groups(matches)
self.assertEqual(2,len(r))
self.assertEqual(5,len(r[0])+len(r[1]))
@@ -768,7 +736,7 @@ class TCget_groups(TestCase):
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
#There will be 2 groups here: group "a b" and group "c d"
#"b c" can fit in both, but it must be in only one of them
matches = MatchFactory().getmatches(l)
matches = getmatches(l)
r = get_groups(matches)
self.assertEqual(1,len(r))
@@ -788,7 +756,7 @@ class TCget_groups(TestCase):
def test_four_sized_group(self):
l = [NamedObject("foobar") for i in xrange(4)]
m = MatchFactory().getmatches(l)
m = getmatches(l)
r = get_groups(m)
self.assertEqual(1,len(r))
self.assertEqual(4,len(r[0]))

View File

@@ -16,8 +16,8 @@ from hsutil.path import Path
from hsutil.testcase import TestCase
from hsutil.misc import first
from . import engine_test
from .. import data, engine
from . import engine_test, data
from .. import engine
from ..results import *
class NamedObject(engine_test.NamedObject):
@@ -37,7 +37,7 @@ class NamedObject(engine_test.NamedObject):
def GetTestGroups():
objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
objects[1].size = 1024
matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
matches = engine.getmatches(objects) #we should have 5 matches
groups = engine.get_groups(matches) #We should have 2 groups
for g in groups:
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
@@ -505,7 +505,7 @@ class TCResultsXML(TestCase):
return objects[1]
objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
matches = engine.getmatches(objects) #we should have 5 matches
groups = engine.get_groups(matches) #We should have 2 groups
for g in groups:
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is

View File

@@ -132,8 +132,6 @@ def test_content_scan_doesnt_put_md5_in_words_at_the_end():
f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
r = s.GetDupeGroups(f)
g = r[0]
eq_(g.ref.words, ['--'])
eq_(g.dupes[0].words, ['--'])
def test_extension_is_not_counted_in_filename_scan():
s = Scanner()
@@ -369,23 +367,6 @@ def test_ignore_list_checks_for_unicode():
assert f2 in g
assert f3 in g
def test_custom_match_factory():
class MatchFactory(object):
def getmatches(self, objects, j=None):
return [Match(objects[0], objects[1], 420)]
s = Scanner()
s.match_factory = MatchFactory()
o1, o2 = no('foo'), no('bar')
groups = s.GetDupeGroups([o1, o2])
eq_(len(groups), 1)
g = groups[0]
eq_(len(g), 2)
g.switch_ref(o1)
m = g.get_match_of(o2)
eq_(m, (o1, o2, 420))
def test_file_evaluates_to_false():
# A very wrong way to use any() was added at some point, causing resulting group list
# to be empty.
@@ -455,15 +436,3 @@ def test_partial_group_match():
assert o2 in group
assert o3 not in group
eq_(s.discarded_file_count, 1)
#--- Scanner ME
def test_priorize_me():
# in ScannerME, bitrate goes first (right after is_ref) in priorization
s = ScannerME()
o1, o2 = no('foo'), no('foo')
o1.bitrate = 1
o2.bitrate = 2
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2

View File

@@ -16,10 +16,10 @@ import os.path as op
from PyQt4.QtCore import Qt, QTimer, QObject, QCoreApplication, QUrl, SIGNAL
from PyQt4.QtGui import QProgressDialog, QDesktopServices, QFileDialog, QDialog, QMessageBox
import hsfs as fs
from hsutil import job
from hsutil.reg import RegistrationRequired
from dupeguru import fs
from dupeguru.app import (DupeGuru as DupeGuruBase, JOB_SCAN, JOB_LOAD, JOB_MOVE, JOB_COPY,
JOB_DELETE)
@@ -145,6 +145,7 @@ class DupeGuru(DupeGuruBase, QObject):
def ask_for_reg_code(self):
if self.reg.ask_for_code():
#XXX bug???
self._setup_ui_as_registered()
@demo_method

View File

@@ -47,7 +47,14 @@ class DirectoryNode(TreeNode):
return DirectoryNode(self.model, self, ref, row)
def _getChildren(self):
return self.ref.dirs
return self.model._dirs.get_subfolders(self.ref)
@property
def name(self):
if self.parent is not None:
return self.ref[-1]
else:
return unicode(self.ref)
class DirectoriesModel(TreeModel):
@@ -70,13 +77,13 @@ class DirectoriesModel(TreeModel):
node = index.internalPointer()
if role == Qt.DisplayRole:
if index.column() == 0:
return node.ref.name
return node.name
else:
return STATES[self._dirs.get_state(node.ref.path)]
return STATES[self._dirs.get_state(node.ref)]
elif role == Qt.EditRole and index.column() == 1:
return self._dirs.get_state(node.ref.path)
return self._dirs.get_state(node.ref)
elif role == Qt.ForegroundRole:
state = self._dirs.get_state(node.ref.path)
state = self._dirs.get_state(node.ref)
if state == 1:
return QBrush(Qt.blue)
elif state == 2:
@@ -101,6 +108,6 @@ class DirectoriesModel(TreeModel):
if not index.isValid() or role != Qt.EditRole or index.column() != 1:
return False
node = index.internalPointer()
self._dirs.set_state(node.ref.path, value)
self._dirs.set_state(node.ref, value)
return True

View File

@@ -8,12 +8,13 @@
import objc
from AppKit import *
from dupeguru import app_me_cocoa, scanner
from dupeguru_me.app_cocoa import DupeGuruME
from dupeguru.scanner import (SCAN_TYPE_FILENAME, SCAN_TYPE_FIELDS, SCAN_TYPE_FIELDS_NO_ORDER,
SCAN_TYPE_TAG, SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO)
# Fix py2app imports which chokes on relative imports
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner
from hsfs import auto, stats, tree, music
from hsfs.phys import music
from dupeguru_me import app_cocoa, data, fs, scanner
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner, fs
from hsmedia import aiff, flac, genres, id3v1, id3v2, mp4, mpeg, ogg, wma
from hsutil import conflict
@@ -23,7 +24,7 @@ class PyApp(NSObject):
class PyDupeGuru(PyApp):
def init(self):
self = super(PyDupeGuru,self).init()
self.app = app_me_cocoa.DupeGuruME()
self.app = DupeGuruME()
return self
#---Directories
@@ -180,12 +181,12 @@ class PyDupeGuru(PyApp):
def setScanType_(self, scan_type):
try:
self.app.scanner.scan_type = [
scanner.SCAN_TYPE_FILENAME,
scanner.SCAN_TYPE_FIELDS,
scanner.SCAN_TYPE_FIELDS_NO_ORDER,
scanner.SCAN_TYPE_TAG,
scanner.SCAN_TYPE_CONTENT,
scanner.SCAN_TYPE_CONTENT_AUDIO
SCAN_TYPE_FILENAME,
SCAN_TYPE_FIELDS,
SCAN_TYPE_FIELDS_NO_ORDER,
SCAN_TYPE_TAG,
SCAN_TYPE_CONTENT,
SCAN_TYPE_CONTENT_AUDIO
][scan_type]
except IndexError:
pass

me/py/__init__.py (new empty file)
View File

View File

@@ -7,29 +7,29 @@
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import os.path as op
import logging
from appscript import app, k, CommandError
import time
from hsutil.cocoa import as_fetch
import hsfs.phys.music
import app_cocoa, data_me, scanner
from dupeguru.app_cocoa import JOBID2TITLE, DupeGuru as DupeGuruBase
from . import data, scanner, fs
JOB_REMOVE_DEAD_TRACKS = 'jobRemoveDeadTracks'
JOB_SCAN_DEAD_TRACKS = 'jobScanDeadTracks'
app_cocoa.JOBID2TITLE.update({
JOBID2TITLE.update({
JOB_REMOVE_DEAD_TRACKS: "Removing dead tracks from your iTunes Library",
JOB_SCAN_DEAD_TRACKS: "Scanning the iTunes Library",
})
class DupeGuruME(app_cocoa.DupeGuru):
class DupeGuruME(DupeGuruBase):
def __init__(self):
app_cocoa.DupeGuru.__init__(self, data_me, 'dupeGuru Music Edition', appid=1)
DupeGuruBase.__init__(self, data, 'dupeGuru Music Edition', appid=1)
self.scanner = scanner.ScannerME()
self.directories.dirclass = hsfs.phys.music.Directory
self.directories.fileclasses = [fs.Mp3File, fs.Mp4File, fs.WmaFile, fs.OggFile, fs.FlacFile, fs.AiffFile]
self.dead_tracks = []
def remove_dead_tracks(self):

View File

@@ -8,7 +8,7 @@
# http://www.hardcoded.net/licenses/hs_license
from hsutil.str import format_time, FT_MINUTES, format_size
from .data import (format_path, format_timestamp, format_words, format_perc,
from dupeguru.data import (format_path, format_timestamp, format_words, format_perc,
format_dupe_count, cmp_value)
COLUMNS = [
@@ -76,7 +76,7 @@ def GetDisplayInfo(dupe, group, delta):
str(dupe.track),
dupe.comment,
format_perc(percentage),
format_words(dupe.words),
format_words(dupe.words) if hasattr(dupe, 'words') else '',
format_dupe_count(dupe_count)
]

me/py/fs.py (new file, 183 lines)
View File

@@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from hsmedia import mpeg, wma, mp4, ogg, flac, aiff
from hsutil.str import get_file_ext
from dupeguru import fs
TAG_FIELDS = ['audiosize', 'duration', 'bitrate', 'samplerate', 'title', 'artist',
'album', 'genre', 'year', 'track', 'comment']
class MusicFile(fs.File):
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
INITIAL_INFO.update({
'audiosize': 0,
'bitrate' : 0,
'duration' : 0,
'samplerate':0,
'artist' : '',
'album' : '',
'title' : '',
'genre' : '',
'comment' : '',
'year' : '',
'track' : 0,
})
HANDLED_EXTS = set()
@classmethod
def can_handle(cls, path):
if not fs.File.can_handle(path):
return False
return get_file_ext(path[-1]) in cls.HANDLED_EXTS
class Mp3File(MusicFile):
HANDLED_EXTS = set(['mp3'])
def _read_info(self, field):
if field == 'md5partial':
fileinfo = mpeg.Mpeg(unicode(self.path))
self._md5partial_offset = fileinfo.audio_offset
self._md5partial_size = fileinfo.audio_size
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
fileinfo = mpeg.Mpeg(unicode(self.path))
self.audiosize = fileinfo.audio_size
self.bitrate = fileinfo.bitrate
self.duration = fileinfo.duration
self.samplerate = fileinfo.sample_rate
i1 = fileinfo.id3v1
# id3v1, even when nonexistent, gives empty values, but id3v2 doesn't. If id3v2
# doesn't exist, just replace it with id3v1
i2 = fileinfo.id3v2
if not i2.exists:
i2 = i1
self.artist = i2.artist or i1.artist
self.album = i2.album or i1.album
self.title = i2.title or i1.title
self.genre = i2.genre or i1.genre
self.comment = i2.comment or i1.comment
self.year = i2.year or i1.year
self.track = i2.track or i1.track
class WmaFile(MusicFile):
HANDLED_EXTS = set(['wma'])
def _read_info(self, field):
if field == 'md5partial':
dec = wma.WMADecoder(unicode(self.path))
self._md5partial_offset = dec.audio_offset
self._md5partial_size = dec.audio_size
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
dec = wma.WMADecoder(unicode(self.path))
self.audiosize = dec.audio_size
self.bitrate = dec.bitrate
self.duration = dec.duration
self.samplerate = dec.sample_rate
self.artist = dec.artist
self.album = dec.album
self.title = dec.title
self.genre = dec.genre
self.comment = dec.comment
self.year = dec.year
self.track = dec.track
class Mp4File(MusicFile):
HANDLED_EXTS = set(['m4a', 'm4p'])
def _read_info(self, field):
if field == 'md5partial':
dec = mp4.File(unicode(self.path))
self._md5partial_offset = dec.audio_offset
self._md5partial_size = dec.audio_size
dec.close()
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
dec = mp4.File(unicode(self.path))
self.audiosize = dec.audio_size
self.bitrate = dec.bitrate
self.duration = dec.duration
self.samplerate = dec.sample_rate
self.artist = dec.artist
self.album = dec.album
self.title = dec.title
self.genre = dec.genre
self.comment = dec.comment
self.year = dec.year
self.track = dec.track
dec.close()
class OggFile(MusicFile):
HANDLED_EXTS = set(['ogg'])
def _read_info(self, field):
if field == 'md5partial':
dec = ogg.Vorbis(unicode(self.path))
self._md5partial_offset = dec.audio_offset
self._md5partial_size = dec.audio_size
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
dec = ogg.Vorbis(unicode(self.path))
self.audiosize = dec.audio_size
self.bitrate = dec.bitrate
self.duration = dec.duration
self.samplerate = dec.sample_rate
self.artist = dec.artist
self.album = dec.album
self.title = dec.title
self.genre = dec.genre
self.comment = dec.comment
self.year = dec.year
self.track = dec.track
class FlacFile(MusicFile):
HANDLED_EXTS = set(['flac'])
def _read_info(self, field):
if field == 'md5partial':
dec = flac.FLAC(unicode(self.path))
self._md5partial_offset = dec.audio_offset
self._md5partial_size = dec.audio_size
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
dec = flac.FLAC(unicode(self.path))
self.audiosize = dec.audio_size
self.bitrate = dec.bitrate
self.duration = dec.duration
self.samplerate = dec.sample_rate
self.artist = dec.artist
self.album = dec.album
self.title = dec.title
self.genre = dec.genre
self.comment = dec.comment
self.year = dec.year
self.track = dec.track
class AiffFile(MusicFile):
HANDLED_EXTS = set(['aif', 'aiff', 'aifc'])
def _read_info(self, field):
if field == 'md5partial':
dec = aiff.File(unicode(self.path))
self._md5partial_offset = dec.audio_offset
self._md5partial_size = dec.audio_size
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
dec = aiff.File(unicode(self.path))
self.audiosize = dec.audio_size
self.bitrate = dec.bitrate
self.duration = dec.duration
self.samplerate = dec.sample_rate
tag = dec.tag
if tag is not None:
self.artist = tag.artist
self.album = tag.album
self.title = tag.title
self.genre = tag.genre
self.comment = tag.comment
self.year = tag.year
self.track = tag.track

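Every class in me/py/fs.py follows the same two-part contract: HANDLED_EXTS routes paths to the class via can_handle, and _read_info(field) lazily fills either the md5partial offsets or the TAG_FIELDS. Assuming that contract holds, supporting another format would look roughly like the sketch below; the ape decoder module is hypothetical, hsmedia ships no such module.

# Hypothetical ApeFile, only to illustrate the subclassing pattern used by
# Mp3File, WmaFile, etc. above; 'ape' is not a real hsmedia module.
class ApeFile(MusicFile):
    HANDLED_EXTS = set(['ape'])
    def _read_info(self, field):
        if field == 'md5partial':
            dec = ape.File(unicode(self.path)) # hypothetical decoder
            self._md5partial_offset = dec.audio_offset
            self._md5partial_size = dec.audio_size
        MusicFile._read_info(self, field)
        if field in TAG_FIELDS:
            dec = ape.File(unicode(self.path))
            self.audiosize = dec.audio_size
            self.bitrate = dec.bitrate
            self.duration = dec.duration
            self.samplerate = dec.sample_rate
            for tag in ('artist', 'album', 'title', 'genre', 'comment', 'year', 'track'):
                setattr(self, tag, getattr(dec, tag))
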
me/py/scanner.py (new file, 16 lines)

@@ -0,0 +1,16 @@
# Created By: Virgil Dupras
# Created On: 2006/03/03
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from dupeguru.scanner import Scanner as ScannerBase
class ScannerME(ScannerBase):
@staticmethod
def _key_func(dupe):
return (not dupe.is_ref, -dupe.bitrate, -dupe.size)

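ScannerME._key_func above decides which file becomes a group's reference: Python compares tuples left to right, so not dupe.is_ref puts reference files first (False sorts before True), then -dupe.bitrate prefers higher bitrates, then -dupe.size prefers larger files. A self-contained demonstration:

# Tuple sort keys compare left to right: is_ref dominates, then bitrate,
# then size (both negated so that bigger wins).
class Dupe(object):
    def __init__(self, name, is_ref, bitrate, size):
        self.name, self.is_ref, self.bitrate, self.size = name, is_ref, bitrate, size

dupes = [Dupe('a.mp3', False, 128, 5), Dupe('b.mp3', False, 320, 4), Dupe('c.mp3', True, 128, 3)]
dupes.sort(key=lambda d: (not d.is_ref, -d.bitrate, -d.size))
assert [d.name for d in dupes] == ['c.mp3', 'b.mp3', 'a.mp3']
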
me/py/tests/__init__.py (new file, empty)


@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from hsutil.path import Path
from dupeguru.engine import getwords
from ..scanner import *
class NamedObject(object):
def __init__(self, name="foobar", size=1):
self.name = name
self.size = size
self.path = Path('')
self.words = getwords(name)
no = NamedObject
def test_priorize_me():
# in ScannerME, bitrate goes first (right after is_ref) in prioritization
s = ScannerME()
o1, o2 = no('foo'), no('foo')
o1.bitrate = 1
o2.bitrate = 2
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2


@@ -7,9 +7,7 @@
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import hsfs.phys.music
from dupeguru import data_me, scanner
from dupeguru_me import data, scanner, fs
from base.app import DupeGuru as DupeGuruBase
from details_dialog import DetailsDialog
@@ -23,11 +21,11 @@ class DupeGuru(DupeGuruBase):
DELTA_COLUMNS = frozenset([2, 3, 4, 5, 7, 8])
def __init__(self):
DupeGuruBase.__init__(self, data_me, appid=1)
DupeGuruBase.__init__(self, data, appid=1)
def _setup(self):
self.scanner = scanner.ScannerME()
self.directories.dirclass = hsfs.phys.music.Directory
self.directories.fileclasses = [fs.Mp3File, fs.Mp4File, fs.WmaFile, fs.OggFile, fs.FlacFile, fs.AiffFile]
DupeGuruBase._setup(self)
def _update_options(self):


@@ -12,7 +12,6 @@ from dupeguru_pe import app_cocoa as app_pe_cocoa
# Fix py2app imports, which choke on relative imports
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner
from dupeguru_pe import block, cache, matchbase, data
from hsfs import auto, stats, tree
from hsutil import conflict
class PyApp(NSObject):
@@ -39,7 +38,7 @@ class PyDupeGuru(PyApp):
self.app.scanner.ignore_list.Clear()
def clearPictureCache(self):
self.app.scanner.match_factory.cached_blocks.clear()
self.app.scanner.cached_blocks.clear()
def doScan(self):
return self.app.start_scanning()
@@ -172,10 +171,10 @@ class PyDupeGuru(PyApp):
#---Properties
def setMatchScaled_(self,match_scaled):
self.app.scanner.match_factory.match_scaled = match_scaled
self.app.scanner.match_scaled = match_scaled
def setMinMatchPercentage_(self,percentage):
self.app.scanner.match_factory.threshold = int(percentage)
self.app.scanner.threshold = int(percentage)
def setMixFileKind_(self,mix_file_kind):
self.app.scanner.mix_file_kind = mix_file_kind


@@ -1,3 +1,7 @@
- date: 2009-10-24
version: 1.7.8
description: |
* Fixed a bug sometimes causing some duplicates to be ignored during the scans. (#73)
- date: 2009-10-14
version: 1.7.7
description: |


@@ -7,41 +7,43 @@
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import os
import os.path as op
import logging
import plistlib
import re
import objc
from Foundation import *
from AppKit import *
from appscript import app, k
from hsutil import job, io
import hsfs as fs
from hsfs import phys, InvalidPath
from hsutil import files
from hsutil import io
from hsutil.str import get_file_ext
from hsutil.path import Path
from hsutil.cocoa import as_fetch
from dupeguru import fs
from dupeguru import app_cocoa, directories
from . import data, matchbase
from . import data
from .cache import string_to_colors, Cache
from .scanner import ScannerPE
mainBundle = NSBundle.mainBundle()
PictureBlocks = mainBundle.classNamed_('PictureBlocks')
assert PictureBlocks is not None
class Photo(phys.File):
INITIAL_INFO = phys.File.INITIAL_INFO.copy()
class Photo(fs.File):
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
INITIAL_INFO.update({
'dimensions': (0,0),
})
HANDLED_EXTS = set(['png', 'jpg', 'jpeg', 'gif', 'psd', 'bmp', 'tiff', 'tif', 'nef', 'cr2'])
@classmethod
def can_handle(cls, path):
return fs.File.can_handle(path) and get_file_ext(path[-1]) in cls.HANDLED_EXTS
def _read_info(self, field):
super(Photo, self)._read_info(field)
fs.File._read_info(self, field)
if field == 'dimensions':
size = PictureBlocks.getImageSize_(unicode(self.path))
self.dimensions = (size.width, size.height)
@@ -49,7 +51,7 @@ class Photo(phys.File):
def get_blocks(self, block_count_per_side):
try:
blocks = PictureBlocks.getBlocksFromImagePath_blockCount_(unicode(self.path), block_count_per_side)
except Exception, e:
except Exception as e:
raise IOError('The reading of "%s" failed with "%s"' % (unicode(self.path), unicode(e)))
if not blocks:
raise IOError('The picture %s could not be read' % unicode(self.path))
@@ -57,89 +59,79 @@ class Photo(phys.File):
class IPhoto(Photo):
def __init__(self, parent, whole_path):
super(IPhoto, self).__init__(parent, whole_path[-1])
self.whole_path = whole_path
def _build_path(self):
return self.whole_path
@property
def display_path(self):
return super(IPhoto, self)._build_path()
return Path(('iPhoto Library', self.name))
def get_iphoto_database_path():
ud = NSUserDefaults.standardUserDefaults()
prefs = ud.persistentDomainForName_('com.apple.iApps')
if 'iPhotoRecentDatabases' not in prefs:
raise directories.InvalidPathError()
plisturl = NSURL.URLWithString_(prefs['iPhotoRecentDatabases'][0])
return Path(plisturl.path())
class Directory(phys.Directory):
cls_file_class = Photo
cls_supported_exts = ('png', 'jpg', 'jpeg', 'gif', 'psd', 'bmp', 'tiff', 'nef', 'cr2')
def _fetch_subitems(self):
subdirs, subfiles = super(Directory,self)._fetch_subitems()
return subdirs, [name for name in subfiles if get_file_ext(name) in self.cls_supported_exts]
class IPhotoLibrary(fs.Directory):
def __init__(self, plistpath):
self.plistpath = plistpath
self.refpath = plistpath[:-1]
# the AlbumData.xml file lives right in the library path
super(IPhotoLibrary, self).__init__(None, 'iPhoto Library')
if not io.exists(plistpath):
raise InvalidPath(self)
def _update_photo(self, photo_data):
def get_iphoto_pictures(plistpath):
if not io.exists(plistpath):
raise InvalidPath(self)
s = io.open(plistpath).read()
# There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
s = s.replace('\x10', '')
# It seems that iPhoto sometimes doesn't properly escape & chars. The regexp below is to find
# any & char that is not a &-based entity (&amp;, &quot;, etc.). based on TextMate's XML
# bundle's regexp
s, count = re.subn(r'&(?![a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+;)', '', s)
if count:
logging.warning("%d invalid XML entities replacement made", count)
plist = plistlib.readPlistFromString(s)
result = []
for photo_data in plist['Master Image List'].values():
if photo_data['MediaType'] != 'Image':
return
continue
photo_path = Path(photo_data['ImagePath'])
subpath = photo_path[len(self.refpath):-1]
subdir = self
for element in subpath:
try:
subdir = subdir[element]
except KeyError:
subdir = fs.Directory(subdir, element)
try:
IPhoto(subdir, photo_path)
except fs.AlreadyExistsError:
# it's possible for 2 entries in the plist to point to the same path. Ignore one of them.
pass
photo = IPhoto(photo_path)
result.append(photo)
return result
class Directories(directories.Directories):
def __init__(self):
directories.Directories.__init__(self, fileclasses=[Photo])
self.iphoto_libpath = get_iphoto_database_path()
self.set_state(self.iphoto_libpath[:-1], directories.STATE_EXCLUDED)
def update(self):
self.clear()
s = open(unicode(self.plistpath)).read()
# There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
s = s.replace('\x10', '')
# It seems that iPhoto sometimes doesn't properly escape & chars. The regexp below is to find
# any & char that is not a &-based entity (&amp;, &quot;, etc.). based on TextMate's XML
# bundle's regexp
s, count = re.subn(r'&(?![a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+;)', '', s)
if count:
logging.warning("%d invalid XML entities replacement made", count)
plist = plistlib.readPlistFromString(s)
for photo_data in plist['Master Image List'].values():
self._update_photo(photo_data)
def _get_files(self, from_path):
if from_path == Path('iPhoto Library'):
is_ref = self.get_state(from_path) == directories.STATE_REFERENCE
photos = get_iphoto_pictures(self.iphoto_libpath)
for photo in photos:
photo.is_ref = is_ref
return photos
else:
return directories.Directories._get_files(self, from_path)
def force_update(self): # Don't update
pass
@staticmethod
def get_subfolders(path):
if path == Path('iPhoto Library'):
return []
else:
return directories.Directories.get_subfolders(path)
def add_path(self, path):
if path == Path('iPhoto Library'):
if path in self:
raise AlreadyThereError()
self._dirs.append(path)
else:
directories.Directories.add_path(self, path)
class DupeGuruPE(app_cocoa.DupeGuru):
def __init__(self):
app_cocoa.DupeGuru.__init__(self, data, 'dupeGuru Picture Edition', appid=5)
self.scanner.match_factory = matchbase.AsyncMatchFactory()
self.directories.dirclass = Directory
self.directories.special_dirclasses[Path('iPhoto Library')] = lambda _, __: self._create_iphoto_library()
self.scanner = ScannerPE()
self.directories = Directories()
p = op.join(self.appdata, 'cached_pictures.db')
self.scanner.match_factory.cached_blocks = Cache(p)
def _create_iphoto_library(self):
ud = NSUserDefaults.standardUserDefaults()
prefs = ud.persistentDomainForName_('com.apple.iApps')
if 'iPhotoRecentDatabases' not in prefs:
raise directories.InvalidPathError
plisturl = NSURL.URLWithString_(prefs['iPhotoRecentDatabases'][0])
plistpath = Path(plisturl.path())
return IPhotoLibrary(plistpath)
self.scanner.cached_blocks = Cache(p)
def _do_delete(self, j):
def op(dupe):
@@ -174,40 +166,19 @@ class DupeGuruPE(app_cocoa.DupeGuru):
def _do_load(self, j):
self.directories.load_from_file(op.join(self.appdata, 'last_directories.xml'))
for d in self.directories:
if isinstance(d, IPhotoLibrary):
d.update()
self.results.load_from_xml(op.join(self.appdata, 'last_results.xml'), self._get_file, j)
def _get_file(self, str_path):
p = Path(str_path)
for d in self.directories:
result = None
if p in d.path:
result = d.find_path(p[d.path:])
if isinstance(d, IPhotoLibrary) and p in d.refpath:
result = d.find_path(p[d.refpath:])
if result is not None:
return result
def add_directory(self, d):
result = app_cocoa.DupeGuru.add_directory(self, d)
if (result == 0) and (d == 'iPhoto Library'):
[iphotolib] = [dir for dir in self.directories if dir.path == d]
iphotolib.update()
return result
if p in self.directories.iphoto_libpath[:-1]:
return IPhoto(p)
return app_cocoa.DupeGuru._get_file(self, str_path)
def copy_or_move(self, dupe, copy, destination, dest_type):
if isinstance(dupe, IPhoto):
copy = True
return app_cocoa.DupeGuru.copy_or_move(self, dupe, copy, destination, dest_type)
def start_scanning(self):
for directory in self.directories:
if isinstance(directory, IPhotoLibrary):
self.directories.set_state(directory.refpath, directories.STATE_EXCLUDED)
return app_cocoa.DupeGuru.start_scanning(self)
def selected_dupe_path(self):
if not self.selected_dupes:
return None

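The piece of get_iphoto_pictures worth keeping in mind is the sanitizing pass that runs before plistlib ever sees the AlbumData.xml contents. Pulled out as a standalone helper, reusing the exact regexp and log message from above, it amounts to:

import logging
import plistlib
import re

def read_sanitized_plist(s):
    # Strip stray 0x10 control chars that make expat choke.
    s = s.replace('\x10', '')
    # Drop any '&' that is not part of an entity; iPhoto sometimes leaves
    # them unescaped (same regexp as in app_cocoa.py above).
    s, count = re.subn(r'&(?![a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+;)', '', s)
    if count:
        logging.warning("%d invalid XML entities replacement made", count)
    return plistlib.readPlistFromString(s)
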

@@ -20,58 +20,42 @@ from .block import avgdiff, DifferentBlockCountError, NoBlocksError
from .cache import Cache
MIN_ITERATIONS = 3
BLOCK_COUNT_PER_SIDE = 15
# Enough so that we're sure that the main thread will not wait after a result.get() call
# cpucount*2 should be enough to ensure that the spawned processes never sit idle
# waiting for the main process to collect their results.
RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() * 2
def get_match(first,second,percentage):
def prepare_pictures(pictures, cached_blocks, j=job.nulljob):
# The MemoryError handlers in there use logging without first caring about whether or not
# there is enough memory left to carry on the operation because it is assumed that the
# MemoryError happens when trying to read an image file, which is freed from memory by the
# time that MemoryError is raised.
prepared = [] # only pictures for which there was no error getting blocks
try:
for picture in j.iter_with_progress(pictures, 'Analyzed %d/%d pictures'):
picture.dimensions
picture.unicode_path = unicode(picture.path)
try:
if picture.unicode_path not in cached_blocks:
blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
cached_blocks[picture.unicode_path] = blocks
prepared.append(picture)
except IOError as e:
logging.warning(unicode(e))
except MemoryError:
logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
if picture.size < 10 * 1024 * 1024: # We're really running out of memory
raise
except MemoryError:
logging.warning('Ran out of memory while preparing pictures')
return prepared
def get_match(first, second, percentage):
if percentage < 0:
percentage = 0
return Match(first,second,percentage)
class MatchFactory(object):
cached_blocks = None
block_count_per_side = 15
threshold = 75
match_scaled = False
def _do_getmatches(self, files, j):
raise NotImplementedError()
def getmatches(self, files, j=job.nulljob):
# The MemoryError handlers in there use logging without first caring about whether or not
# there is enough memory left to carry on the operation because it is assumed that the
# MemoryError happens when trying to read an image file, which is freed from memory by the
# time that MemoryError is raised.
j = j.start_subjob([3, 7])
logging.info('Preparing %d files' % len(files))
prepared = self.prepare_files(files, j)
logging.info('Finished preparing %d files' % len(prepared))
return self._do_getmatches(prepared, j)
def prepare_files(self, files, j=job.nulljob):
prepared = [] # only files for which there was no error getting blocks
try:
for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
picture.dimensions
picture.unicode_path = unicode(picture.path)
try:
if picture.unicode_path not in self.cached_blocks:
blocks = picture.get_blocks(self.block_count_per_side)
self.cached_blocks[picture.unicode_path] = blocks
prepared.append(picture)
except IOError as e:
logging.warning(unicode(e))
except MemoryError:
logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
if picture.size < 10 * 1024 * 1024: # We're really running out of memory
raise
except MemoryError:
logging.warning('Ran out of memory while preparing files')
return prepared
return Match(first, second, percentage)
def async_compare(ref_id, other_ids, dbname, threshold):
cache = Cache(dbname, threaded=False)
@@ -89,53 +73,55 @@ def async_compare(ref_id, other_ids, dbname, threshold):
results.append((ref_id, other_id, percentage))
cache.con.close()
return results
class AsyncMatchFactory(MatchFactory):
def _do_getmatches(self, pictures, j):
def empty_out_queue(queue, into):
try:
while True:
into.append(queue.get(block=False))
except Empty:
pass
j = j.start_subjob([9, 1], 'Preparing for matching')
cache = self.cached_blocks
id2picture = {}
dimensions2pictures = defaultdict(set)
for picture in pictures:
try:
picture.cache_id = cache.get_id(picture.unicode_path)
id2picture[picture.cache_id] = picture
if not self.match_scaled:
dimensions2pictures[picture.dimensions].add(picture)
except ValueError:
pass
pictures = [p for p in pictures if hasattr(p, 'cache_id')]
pool = multiprocessing.Pool()
async_results = []
matches = []
pictures_copy = set(pictures)
for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):
others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
others.remove(ref)
if others:
cache_ids = [f.cache_id for f in others]
args = (ref.cache_id, cache_ids, self.cached_blocks.dbname, self.threshold)
async_results.append(pool.apply_async(async_compare, args))
if len(async_results) > RESULTS_QUEUE_LIMIT:
result = async_results.pop(0)
matches.extend(result.get())
result = []
for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
ref = id2picture[ref_id]
other = id2picture[other_id]
if percentage == 100 and ref.md5 != other.md5:
percentage = 99
if percentage >= self.threshold:
result.append(get_match(ref, other, percentage))
return result
def getmatches(pictures, cached_blocks, threshold=75, match_scaled=False, j=job.nulljob):
def empty_out_queue(queue, into):
try:
while True:
into.append(queue.get(block=False))
except Empty:
pass
j = j.start_subjob([3, 7])
pictures = prepare_pictures(pictures, cached_blocks, j)
j = j.start_subjob([9, 1], 'Preparing for matching')
cache = cached_blocks
id2picture = {}
dimensions2pictures = defaultdict(set)
for picture in pictures:
try:
picture.cache_id = cache.get_id(picture.unicode_path)
id2picture[picture.cache_id] = picture
if not match_scaled:
dimensions2pictures[picture.dimensions].add(picture)
except ValueError:
pass
pictures = [p for p in pictures if hasattr(p, 'cache_id')]
pool = multiprocessing.Pool()
async_results = []
matches = []
pictures_copy = set(pictures)
for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):
others = pictures_copy if match_scaled else dimensions2pictures[ref.dimensions]
others.remove(ref)
if others:
cache_ids = [f.cache_id for f in others]
args = (ref.cache_id, cache_ids, cached_blocks.dbname, threshold)
async_results.append(pool.apply_async(async_compare, args))
if len(async_results) > RESULTS_QUEUE_LIMIT:
result = async_results.pop(0)
matches.extend(result.get())
for result in async_results: # process the rest of the results
matches.extend(result.get())
result = []
for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
ref = id2picture[ref_id]
other = id2picture[other_id]
if percentage == 100 and ref.md5 != other.md5:
percentage = 99
if percentage >= threshold:
result.append(get_match(ref, other, percentage))
return result
multiprocessing.freeze_support()

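The rewritten getmatches keeps its multiprocessing.Pool saturated without unbounded buffering: once more than RESULTS_QUEUE_LIMIT AsyncResults are in flight, the oldest one is popped and collected, and a final loop drains whatever remains. That final drain is what the old AsyncMatchFactory version lacked, which is plausibly the #73 "ignored matches" fix noted in the changelog. The pattern in isolation:

import multiprocessing

def collect_bounded(pool, func, jobs, limit):
    # Keep at most `limit` results outstanding so workers never stall on a
    # full results queue; func is assumed to return a list per job.
    async_results, collected = [], []
    for args in jobs:
        async_results.append(pool.apply_async(func, args))
        if len(async_results) > limit:
            collected.extend(async_results.pop(0).get())
    for result in async_results: # drain the rest; skipping this loop
        collected.extend(result.get()) # silently loses the last batch
    return collected
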
pe/py/scanner.py (new file, 22 lines)

@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-18
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from dupeguru.scanner import Scanner
from . import matchbase
class ScannerPE(Scanner):
cached_blocks = None
match_scaled = False
threshold = 75
def _getmatches(self, files, j):
return matchbase.getmatches(files, self.cached_blocks, self.threshold, self.match_scaled, j)

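ScannerPE keeps its knobs as plain class attributes, so hooking it up, as both the Cocoa and Qt setup code above does, is simple assignment; the cache path below is illustrative:

# Illustrative wiring, mirroring DupeGuruPE.__init__ and the Qt _setup above.
from dupeguru_pe.cache import Cache
from dupeguru_pe.scanner import ScannerPE

scanner = ScannerPE()
scanner.cached_blocks = Cache('/tmp/cached_pictures.db') # path is illustrative
scanner.threshold = 75 # minimum match percentage to keep
scanner.match_scaled = False # only compare same-dimension pictures
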

@@ -12,12 +12,12 @@ import os.path as op
from PyQt4.QtGui import QImage
import PIL.Image
from hsfs import phys
from hsutil.str import get_file_ext
from dupeguru import fs
from dupeguru_pe import data as data_pe
from dupeguru_pe.cache import Cache
from dupeguru_pe.matchbase import AsyncMatchFactory
from dupeguru_pe.scanner import ScannerPE
from block import getblocks
from base.app import DupeGuru as DupeGuruBase
@@ -26,14 +26,19 @@ from main_window import MainWindow
from preferences import Preferences
from preferences_dialog import PreferencesDialog
class File(phys.File):
INITIAL_INFO = phys.File.INITIAL_INFO.copy()
class File(fs.File):
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
INITIAL_INFO.update({
'dimensions': (0,0),
})
HANDLED_EXTS = set(['png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'tif'])
@classmethod
def can_handle(cls, path):
return fs.File.can_handle(path) and get_file_ext(path[-1]) in cls.HANDLED_EXTS
def _read_info(self, field):
super(File, self)._read_info(field)
fs.File._read_info(self, field)
if field == 'dimensions':
im = PIL.Image.open(unicode(self.path))
self.dimensions = im.size
@@ -44,15 +49,6 @@ class File(phys.File):
return getblocks(image, block_count_per_side)
class Directory(phys.Directory):
cls_file_class = File
cls_supported_exts = ('png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff')
def _fetch_subitems(self):
subdirs, subfiles = super(Directory, self)._fetch_subitems()
return subdirs, [name for name in subfiles if get_file_ext(name) in self.cls_supported_exts]
class DupeGuru(DupeGuruBase):
LOGO_NAME = 'logo_pe'
NAME = 'dupeGuru Picture Edition'
@@ -63,15 +59,15 @@ class DupeGuru(DupeGuruBase):
DupeGuruBase.__init__(self, data_pe, appid=5)
def _setup(self):
self.scanner.match_factory = AsyncMatchFactory()
self.directories.dirclass = Directory
self.scanner.match_factory.cached_blocks = Cache(op.join(self.appdata, 'cached_pictures.db'))
self.scanner = ScannerPE()
self.directories.fileclasses = [File]
self.scanner.cached_blocks = Cache(op.join(self.appdata, 'cached_pictures.db'))
DupeGuruBase._setup(self)
def _update_options(self):
DupeGuruBase._update_options(self)
self.scanner.match_factory.match_scaled = self.prefs.match_scaled
self.scanner.match_factory.threshold = self.prefs.filter_hardness
self.scanner.match_scaled = self.prefs.match_scaled
self.scanner.threshold = self.prefs.filter_hardness
def _create_details_dialog(self, parent):
return DetailsDialog(parent, self)


@@ -1,6 +1,6 @@
# -*- mode: python -*-
a = Analysis([os.path.join(HOMEPATH,'support\\_mountzlib.py'), os.path.join(HOMEPATH,'support\\useUnicode.py'), 'start.py'],
pathex=['C:\\src\\dupeguru\\pe\\qt'])
pathex=[])
pyz = PYZ(a.pure)
exe = EXE(pyz,
a.scripts,


@@ -16,6 +16,7 @@ from hsutil.build import print_and_do, build_all_qt_ui
build_all_qt_ui(op.join('qtlib', 'ui'))
build_all_qt_ui('base')
build_all_qt_ui('.')
print_and_do("pyrcc4 base\\dg.qrc > base\\dg_rc.py")
def move(src, dst):
if not op.exists(src):


@@ -23,6 +23,6 @@ class MainWindow(MainWindowBase):
title = "Clear Picture Cache"
msg = "Do you really want to remove all your cached picture analysis?"
if self._confirm(title, msg, QMessageBox.No):
self.app.scanner.match_factory.cached_blocks.clear()
self.app.scanner.cached_blocks.clear()
QMessageBox.information(self, title, "Picture cache cleared.")


@@ -14,6 +14,9 @@ import base.dg_rc
from app import DupeGuru
# This is a workaround for a pyinstaller problem where compiled dupeguru can't read tiff files
from PIL import TiffImagePlugin, TiffTags
if __name__ == "__main__":
app = QApplication(sys.argv)
app.setWindowIcon(QIcon(QPixmap(":/logo_pe")))


@@ -8,12 +8,12 @@
import objc
from AppKit import *
from dupeguru import app_se_cocoa, scanner
from dupeguru_se.app_cocoa import DupeGuru
from dupeguru import scanner
# Fix py2app imports, which choke on relative imports
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner
from hsfs import auto, stats, tree
from hsfs.phys import bundle
from dupeguru_se import fs, data
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, fs
from hsutil import conflict
class PyApp(NSObject):
@@ -22,7 +22,7 @@ class PyApp(NSObject):
class PyDupeGuru(PyApp):
def init(self):
self = super(PyDupeGuru,self).init()
self.app = app_se_cocoa.DupeGuru()
self.app = DupeGuru()
return self
#---Directories

se/py/LICENSE (new file, 11 lines)

@@ -0,0 +1,11 @@
Copyright 2009 Hardcoded Software Inc. (http://www.hardcoded.net)
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of Hardcoded Software Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
* If the source code has been published less than two years ago, any redistribution, in whole or in part, must retain full licensing functionality, without any attempt to change, obscure or in other ways circumvent its intent.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

se/py/__init__.py (new file, 1 line)

@@ -0,0 +1 @@


@@ -11,14 +11,15 @@ import logging
from AppKit import *
from hsfs.phys import Directory as DirectoryBase
from hsfs.phys.bundle import Bundle
from hsutil import io
from hsutil.path import Path
from hsutil.misc import extract
from hsutil.str import get_file_ext
from . import app_cocoa, data
from .directories import Directories as DirectoriesBase, STATE_EXCLUDED
from dupeguru import fs
from dupeguru.app_cocoa import DupeGuru as DupeGuruBase
from dupeguru.directories import Directories as DirectoriesBase, STATE_EXCLUDED
from . import data
from .fs import Bundle as BundleBase
if NSWorkspace.sharedWorkspace().respondsToSelector_('typeOfFile:error:'): # Only from 10.5
def is_bundle(str_path):
@@ -31,27 +32,17 @@ else: # Tiger
def is_bundle(str_path): # just return a list of a few known bundle extensions.
return get_file_ext(str_path) in ('app', 'pages', 'numbers')
class DGDirectory(DirectoryBase):
def _create_sub_file(self, name, with_parent=True):
if is_bundle(unicode(self.path + name)):
parent = self if with_parent else None
return Bundle(parent, name)
else:
return super(DGDirectory, self)._create_sub_file(name, with_parent)
def _fetch_subitems(self):
subdirs, subfiles = super(DGDirectory, self)._fetch_subitems()
apps, normal_dirs = extract(lambda name: is_bundle(unicode(self.path + name)), subdirs)
subfiles += apps
return normal_dirs, subfiles
class Bundle(BundleBase):
@classmethod
def can_handle(cls, path):
return not io.islink(path) and io.isdir(path) and is_bundle(unicode(path))
class Directories(DirectoriesBase):
ROOT_PATH_TO_EXCLUDE = map(Path, ['/Library', '/Volumes', '/System', '/bin', '/sbin', '/opt', '/private', '/dev'])
HOME_PATH_TO_EXCLUDE = [Path('Library')]
def __init__(self):
DirectoriesBase.__init__(self)
self.dirclass = DGDirectory
DirectoriesBase.__init__(self, fileclasses=[Bundle, fs.File])
def _default_state_for_path(self, path):
result = DirectoriesBase._default_state_for_path(self, path)
@@ -63,8 +54,8 @@ class Directories(DirectoriesBase):
return STATE_EXCLUDED
class DupeGuru(app_cocoa.DupeGuru):
class DupeGuru(DupeGuruBase):
def __init__(self):
app_cocoa.DupeGuru.__init__(self, data, 'dupeGuru', appid=4)
DupeGuruBase.__init__(self, data, 'dupeGuru', appid=4)
self.directories = Directories()

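With dirclass gone, class selection now happens per file through can_handle, and the order of the fileclasses list matters: Bundle is listed before fs.File, so bundle directories are claimed before the generic file class sees them. A plausible sketch of that dispatch follows; the helper name is assumed, not dupeGuru's actual code:

def create_file(path, fileclasses):
    # First class whose can_handle() accepts the path wins, which is why
    # Bundle must precede fs.File in the list above.
    for fileclass in fileclasses:
        if fileclass.can_handle(path):
            return fileclass(path)
    return None
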
se/py/data.py (new file, 72 lines)

@@ -0,0 +1,72 @@
# Created By: Virgil Dupras
# Created On: 2006/03/15
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from hsutil.str import format_size
from dupeguru.data import (format_path, format_timestamp, format_words, format_perc,
format_dupe_count, cmp_value)
COLUMNS = [
{'attr':'name','display':'Filename'},
{'attr':'path','display':'Directory'},
{'attr':'size','display':'Size (KB)'},
{'attr':'extension','display':'Kind'},
{'attr':'ctime','display':'Creation'},
{'attr':'mtime','display':'Modification'},
{'attr':'percentage','display':'Match %'},
{'attr':'words','display':'Words Used'},
{'attr':'dupe_count','display':'Dupe Count'},
]
METADATA_TO_READ = ['size', 'ctime', 'mtime']
def GetDisplayInfo(dupe, group, delta):
size = dupe.size
ctime = dupe.ctime
mtime = dupe.mtime
m = group.get_match_of(dupe)
if m:
percentage = m.percentage
dupe_count = 0
if delta:
r = group.ref
size -= r.size
ctime -= r.ctime
mtime -= r.mtime
else:
percentage = group.percentage
dupe_count = len(group.dupes)
return [
dupe.name,
format_path(dupe.path),
format_size(size, 0, 1, False),
dupe.extension,
format_timestamp(ctime, delta and m),
format_timestamp(mtime, delta and m),
format_perc(percentage),
format_words(dupe.words) if hasattr(dupe, 'words') else '',
format_dupe_count(dupe_count)
]
def GetDupeSortKey(dupe, get_group, key, delta):
if key == 6:
m = get_group().get_match_of(dupe)
return m.percentage
if key == 8:
return 0
r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
if delta and (key in (2, 4, 5)):
r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
return r
def GetGroupSortKey(group, key):
if key == 6:
return group.percentage
if key == 8:
return len(group)
return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

se/py/fs.py (new file, 43 lines)

@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import hashlib
from hsutil import io
from hsutil.misc import nonone
from dupeguru import fs
class Bundle(fs.File):
"""This class is for Mac OSX bundles (.app). Bundles are seen by the OS as
normal directories, but I don't want that in dupeGuru. I want dupeGuru
to see them as files.
"""
def _read_info(self, field):
if field in ('size', 'ctime', 'mtime'):
files = fs.get_all_files(self.path)
size = sum((file.size for file in files), 0)
self.size = size
stats = io.stat(self.path)
self.ctime = nonone(stats.st_ctime, 0)
self.mtime = nonone(stats.st_mtime, 0)
elif field in ('md5', 'md5partial'):
# What's sensitive here is that we must make sure that subfiles'
# md5 are always added up in the same order, but we also want a
# different md5 if a file gets moved in a different subdirectory.
def get_dir_md5_concat():
files = fs.get_all_files(self.path)
files.sort(key=lambda f:f.path)
md5s = [getattr(f, field) for f in files]
return ''.join(md5s)
md5 = hashlib.md5(get_dir_md5_concat())
digest = md5.digest()
setattr(self, field, digest)

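The sort inside get_dir_md5_concat is the load-bearing line: md5 over a concatenation is order-sensitive, so without a stable file ordering the same bundle could hash differently between runs. Two lines make the point:

import hashlib
# Same bytes, different order: different digest.
assert hashlib.md5('AB').digest() != hashlib.md5('BA').digest()
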
se/py/tests/__init__.py (new file, empty)

se/py/tests/fs_test.py (new file, 48 lines)

@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import hashlib
from nose.tools import eq_
from hsutil.testcase import TestCase
from dupeguru.fs import File
from dupeguru.tests.directories_test import create_fake_fs
from .. import fs
class TCBundle(TestCase):
def test_size_aggregates_subfiles(self):
p = create_fake_fs(self.tmppath())
b = fs.Bundle(p)
eq_(b.size, 12)
def test_md5_aggregate_subfiles_sorted(self):
#dir.allfiles can return children in any order. Thus, bundle.md5 must aggregate
#the md5 of all the files it contains, but it must make sure that it does so in
#the same order every time.
p = create_fake_fs(self.tmppath())
b = fs.Bundle(p)
md5s = File(p + ('dir1', 'file1.test')).md5
md5s += File(p + ('dir2', 'file2.test')).md5
md5s += File(p + ('dir3', 'file3.test')).md5
md5s += File(p + 'file1.test').md5
md5s += File(p + 'file2.test').md5
md5s += File(p + 'file3.test').md5
md5 = hashlib.md5(md5s)
eq_(b.md5, md5.digest())
def test_has_file_attrs(self):
#a Bundle must behave like a file, so it must have ctime and mtime attributes
b = fs.Bundle(self.tmppath())
assert b.mtime > 0
assert b.ctime > 0
eq_(b.extension, '')


@@ -7,7 +7,7 @@
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from dupeguru import data
from dupeguru_se import data
from dupeguru.directories import Directories as DirectoriesBase, STATE_EXCLUDED
from base.app import DupeGuru as DupeGuruBase