mirror of https://github.com/arsenetar/dupeguru.git synced 2025-03-23 19:24:36 +00:00

Format files with black

- Format all files with black
- Update tox.ini flake8 arguments to be compatible
- Add black to requirements-extra.txt
- Reduce ignored flake8 rules and fix a few violations
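For reference: black formats to an 88-character line length and produces whitespace patterns that flake8 rejects with its default settings, which is why the tox.ini flake8 arguments need adjusting. The exact values used in this commit aren't visible in this diff; a typical black-compatible flake8 section, following black's own documentation, looks like this:

    [flake8]
    max-line-length = 88        # black's default line length
    extend-ignore = E203, W503  # slice-whitespace and operator-break rules that black violates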
This commit is contained in:
Andrew Senetar 2019-12-31 20:16:27 -06:00
parent 359d6498f7
commit 7ba8aa3514
Signed by: arsenetar
GPG Key ID: C63300DCE48AB2F1
141 changed files with 5241 additions and 3648 deletions

build.py
View File

@ -13,129 +13,165 @@ from setuptools import setup, Extension
from hscommon import sphinxgen
from hscommon.build import (
add_to_pythonpath, print_and_do, move_all, fix_qt_resource_file,
add_to_pythonpath,
print_and_do,
move_all,
fix_qt_resource_file,
)
from hscommon import loc
def parse_args():
usage = "usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option(
'--clean', action='store_true', dest='clean',
help="Clean build folder before building"
"--clean",
action="store_true",
dest="clean",
help="Clean build folder before building",
)
parser.add_option(
'--doc', action='store_true', dest='doc',
help="Build only the help file"
"--doc", action="store_true", dest="doc", help="Build only the help file"
)
parser.add_option(
'--loc', action='store_true', dest='loc',
help="Build only localization"
"--loc", action="store_true", dest="loc", help="Build only localization"
)
parser.add_option(
'--updatepot', action='store_true', dest='updatepot',
help="Generate .pot files from source code."
"--updatepot",
action="store_true",
dest="updatepot",
help="Generate .pot files from source code.",
)
parser.add_option(
'--mergepot', action='store_true', dest='mergepot',
help="Update all .po files based on .pot files."
"--mergepot",
action="store_true",
dest="mergepot",
help="Update all .po files based on .pot files.",
)
parser.add_option(
'--normpo', action='store_true', dest='normpo',
help="Normalize all PO files (do this before commit)."
"--normpo",
action="store_true",
dest="normpo",
help="Normalize all PO files (do this before commit).",
)
(options, args) = parser.parse_args()
return options
def build_help():
print("Generating Help")
current_path = op.abspath('.')
help_basepath = op.join(current_path, 'help', 'en')
help_destpath = op.join(current_path, 'build', 'help')
changelog_path = op.join(current_path, 'help', 'changelog')
current_path = op.abspath(".")
help_basepath = op.join(current_path, "help", "en")
help_destpath = op.join(current_path, "build", "help")
changelog_path = op.join(current_path, "help", "changelog")
tixurl = "https://github.com/hsoft/dupeguru/issues/{}"
confrepl = {'language': 'en'}
changelogtmpl = op.join(current_path, 'help', 'changelog.tmpl')
conftmpl = op.join(current_path, 'help', 'conf.tmpl')
sphinxgen.gen(help_basepath, help_destpath, changelog_path, tixurl, confrepl, conftmpl, changelogtmpl)
confrepl = {"language": "en"}
changelogtmpl = op.join(current_path, "help", "changelog.tmpl")
conftmpl = op.join(current_path, "help", "conf.tmpl")
sphinxgen.gen(
help_basepath,
help_destpath,
changelog_path,
tixurl,
confrepl,
conftmpl,
changelogtmpl,
)
def build_qt_localizations():
loc.compile_all_po(op.join('qtlib', 'locale'))
loc.merge_locale_dir(op.join('qtlib', 'locale'), 'locale')
loc.compile_all_po(op.join("qtlib", "locale"))
loc.merge_locale_dir(op.join("qtlib", "locale"), "locale")
def build_localizations():
loc.compile_all_po('locale')
loc.compile_all_po("locale")
build_qt_localizations()
locale_dest = op.join('build', 'locale')
locale_dest = op.join("build", "locale")
if op.exists(locale_dest):
shutil.rmtree(locale_dest)
shutil.copytree('locale', locale_dest, ignore=shutil.ignore_patterns('*.po', '*.pot'))
shutil.copytree(
"locale", locale_dest, ignore=shutil.ignore_patterns("*.po", "*.pot")
)
def build_updatepot():
print("Building .pot files from source files")
print("Building core.pot")
loc.generate_pot(['core'], op.join('locale', 'core.pot'), ['tr'])
loc.generate_pot(["core"], op.join("locale", "core.pot"), ["tr"])
print("Building columns.pot")
loc.generate_pot(['core'], op.join('locale', 'columns.pot'), ['coltr'])
loc.generate_pot(["core"], op.join("locale", "columns.pot"), ["coltr"])
print("Building ui.pot")
# When we're not under OS X, we don't want to overwrite ui.pot because it contains Cocoa locs
# We want to merge the generated pot with the old pot in the most preserving way possible.
ui_packages = ['qt', op.join('cocoa', 'inter')]
loc.generate_pot(ui_packages, op.join('locale', 'ui.pot'), ['tr'], merge=True)
ui_packages = ["qt", op.join("cocoa", "inter")]
loc.generate_pot(ui_packages, op.join("locale", "ui.pot"), ["tr"], merge=True)
print("Building qtlib.pot")
loc.generate_pot(['qtlib'], op.join('qtlib', 'locale', 'qtlib.pot'), ['tr'])
loc.generate_pot(["qtlib"], op.join("qtlib", "locale", "qtlib.pot"), ["tr"])
def build_mergepot():
print("Updating .po files using .pot files")
loc.merge_pots_into_pos('locale')
loc.merge_pots_into_pos(op.join('qtlib', 'locale'))
loc.merge_pots_into_pos(op.join('cocoalib', 'locale'))
loc.merge_pots_into_pos("locale")
loc.merge_pots_into_pos(op.join("qtlib", "locale"))
loc.merge_pots_into_pos(op.join("cocoalib", "locale"))
def build_normpo():
loc.normalize_all_pos('locale')
loc.normalize_all_pos(op.join('qtlib', 'locale'))
loc.normalize_all_pos(op.join('cocoalib', 'locale'))
loc.normalize_all_pos("locale")
loc.normalize_all_pos(op.join("qtlib", "locale"))
loc.normalize_all_pos(op.join("cocoalib", "locale"))
def build_pe_modules():
print("Building PE Modules")
exts = [
Extension(
"_block",
[op.join('core', 'pe', 'modules', 'block.c'), op.join('core', 'pe', 'modules', 'common.c')]
[
op.join("core", "pe", "modules", "block.c"),
op.join("core", "pe", "modules", "common.c"),
],
),
Extension(
"_cache",
[op.join('core', 'pe', 'modules', 'cache.c'), op.join('core', 'pe', 'modules', 'common.c')]
[
op.join("core", "pe", "modules", "cache.c"),
op.join("core", "pe", "modules", "common.c"),
],
),
]
exts.append(Extension("_block_qt", [op.join('qt', 'pe', 'modules', 'block.c')]))
exts.append(Extension("_block_qt", [op.join("qt", "pe", "modules", "block.c")]))
setup(
script_args=['build_ext', '--inplace'],
ext_modules=exts,
script_args=["build_ext", "--inplace"], ext_modules=exts,
)
move_all('_block_qt*', op.join('qt', 'pe'))
move_all('_block*', op.join('core', 'pe'))
move_all('_cache*', op.join('core', 'pe'))
move_all("_block_qt*", op.join("qt", "pe"))
move_all("_block*", op.join("core", "pe"))
move_all("_cache*", op.join("core", "pe"))
def build_normal():
print("Building dupeGuru with UI qt")
add_to_pythonpath('.')
add_to_pythonpath(".")
print("Building dupeGuru")
build_pe_modules()
print("Building localizations")
build_localizations()
print("Building Qt stuff")
print_and_do("pyrcc5 {0} > {1}".format(op.join('qt', 'dg.qrc'), op.join('qt', 'dg_rc.py')))
fix_qt_resource_file(op.join('qt', 'dg_rc.py'))
print_and_do(
"pyrcc5 {0} > {1}".format(op.join("qt", "dg.qrc"), op.join("qt", "dg_rc.py"))
)
fix_qt_resource_file(op.join("qt", "dg_rc.py"))
build_help()
def main():
options = parse_args()
if options.clean:
if op.exists('build'):
shutil.rmtree('build')
if not op.exists('build'):
os.mkdir('build')
if op.exists("build"):
shutil.rmtree("build")
if not op.exists("build"):
os.mkdir("build")
if options.doc:
build_help()
elif options.loc:
@ -149,5 +185,6 @@ def main():
else:
build_normal()
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@ -1,3 +1,2 @@
__version__ = '4.0.4'
__appname__ = 'dupeGuru'
__version__ = "4.0.4"
__appname__ = "dupeGuru"

View File

@ -34,8 +34,8 @@ from .gui.ignore_list_dialog import IgnoreListDialog
from .gui.problem_dialog import ProblemDialog
from .gui.stats_label import StatsLabel
HAD_FIRST_LAUNCH_PREFERENCE = 'HadFirstLaunch'
DEBUG_MODE_PREFERENCE = 'DebugMode'
HAD_FIRST_LAUNCH_PREFERENCE = "HadFirstLaunch"
DEBUG_MODE_PREFERENCE = "DebugMode"
MSG_NO_MARKED_DUPES = tr("There are no marked duplicates. Nothing has been done.")
MSG_NO_SELECTED_DUPES = tr("There are no selected duplicates. Nothing has been done.")
@ -44,23 +44,27 @@ MSG_MANY_FILES_TO_OPEN = tr(
"files are opened with, doing so can create quite a mess. Continue?"
)
class DestType:
Direct = 0
Relative = 1
Absolute = 2
class JobType:
Scan = 'job_scan'
Load = 'job_load'
Move = 'job_move'
Copy = 'job_copy'
Delete = 'job_delete'
Scan = "job_scan"
Load = "job_load"
Move = "job_move"
Copy = "job_copy"
Delete = "job_delete"
class AppMode:
Standard = 0
Music = 1
Picture = 2
JOBID2TITLE = {
JobType.Scan: tr("Scanning for duplicates"),
JobType.Load: tr("Loading"),
@ -69,6 +73,7 @@ JOBID2TITLE = {
JobType.Delete: tr("Sending to Trash"),
}
class DupeGuru(Broadcaster):
"""Holds everything together.
@ -100,7 +105,8 @@ class DupeGuru(Broadcaster):
Instance of :mod:`meta-gui <core.gui>` table listing the results from :attr:`results`
"""
#--- View interface
# --- View interface
# get_default(key_name)
# set_default(key_name, value)
# show_message(msg)
@ -116,7 +122,7 @@ class DupeGuru(Broadcaster):
NAME = PROMPT_NAME = "dupeGuru"
PICTURE_CACHE_TYPE = 'sqlite' # set to 'shelve' for a ShelveCache
PICTURE_CACHE_TYPE = "sqlite" # set to 'shelve' for a ShelveCache
def __init__(self, view):
if view.get_default(DEBUG_MODE_PREFERENCE):
@ -124,7 +130,9 @@ class DupeGuru(Broadcaster):
logging.debug("Debug mode enabled")
Broadcaster.__init__(self)
self.view = view
self.appdata = desktop.special_folder_path(desktop.SpecialFolder.AppData, appname=self.NAME)
self.appdata = desktop.special_folder_path(
desktop.SpecialFolder.AppData, appname=self.NAME
)
if not op.exists(self.appdata):
os.makedirs(self.appdata)
self.app_mode = AppMode.Standard
@ -136,11 +144,11 @@ class DupeGuru(Broadcaster):
# sent to the scanner. They don't have default values because those default values are
# defined in the scanner class.
self.options = {
'escape_filter_regexp': True,
'clean_empty_dirs': False,
'ignore_hardlink_matches': False,
'copymove_dest_type': DestType.Relative,
'picture_cache_type': self.PICTURE_CACHE_TYPE
"escape_filter_regexp": True,
"clean_empty_dirs": False,
"ignore_hardlink_matches": False,
"copymove_dest_type": DestType.Relative,
"picture_cache_type": self.PICTURE_CACHE_TYPE,
}
self.selected_dupes = []
self.details_panel = DetailsPanel(self)
@ -155,7 +163,7 @@ class DupeGuru(Broadcaster):
for child in children:
child.connect()
#--- Private
# --- Private
def _recreate_result_table(self):
if self.result_table is not None:
self.result_table.disconnect()
@ -169,26 +177,30 @@ class DupeGuru(Broadcaster):
self.view.create_results_window()
def _get_picture_cache_path(self):
cache_type = self.options['picture_cache_type']
cache_name = 'cached_pictures.shelve' if cache_type == 'shelve' else 'cached_pictures.db'
cache_type = self.options["picture_cache_type"]
cache_name = (
"cached_pictures.shelve" if cache_type == "shelve" else "cached_pictures.db"
)
return op.join(self.appdata, cache_name)
def _get_dupe_sort_key(self, dupe, get_group, key, delta):
if self.app_mode in (AppMode.Music, AppMode.Picture):
if key == 'folder_path':
dupe_folder_path = getattr(dupe, 'display_folder_path', dupe.folder_path)
if key == "folder_path":
dupe_folder_path = getattr(
dupe, "display_folder_path", dupe.folder_path
)
return str(dupe_folder_path).lower()
if self.app_mode == AppMode.Picture:
if delta and key == 'dimensions':
if delta and key == "dimensions":
r = cmp_value(dupe, key)
ref_value = cmp_value(get_group().ref, key)
return get_delta_dimensions(r, ref_value)
if key == 'marked':
if key == "marked":
return self.results.is_marked(dupe)
if key == 'percentage':
if key == "percentage":
m = get_group().get_match_of(dupe)
return m.percentage
elif key == 'dupe_count':
elif key == "dupe_count":
return 0
else:
result = cmp_value(dupe, key)
@ -203,21 +215,25 @@ class DupeGuru(Broadcaster):
def _get_group_sort_key(self, group, key):
if self.app_mode in (AppMode.Music, AppMode.Picture):
if key == 'folder_path':
dupe_folder_path = getattr(group.ref, 'display_folder_path', group.ref.folder_path)
if key == "folder_path":
dupe_folder_path = getattr(
group.ref, "display_folder_path", group.ref.folder_path
)
return str(dupe_folder_path).lower()
if key == 'percentage':
if key == "percentage":
return group.percentage
if key == 'dupe_count':
if key == "dupe_count":
return len(group)
if key == 'marked':
if key == "marked":
return len([dupe for dupe in group.dupes if self.results.is_marked(dupe)])
return cmp_value(group.ref, key)
def _do_delete(self, j, link_deleted, use_hardlinks, direct_deletion):
def op(dupe):
j.add_progress()
return self._do_delete_dupe(dupe, link_deleted, use_hardlinks, direct_deletion)
return self._do_delete_dupe(
dupe, link_deleted, use_hardlinks, direct_deletion
)
j.start_job(self.results.mark_count)
self.results.perform_on_marked(op, True)
@ -233,7 +249,7 @@ class DupeGuru(Broadcaster):
else:
os.remove(str_path)
else:
send2trash(str_path) # Raises OSError when there's a problem
send2trash(str_path) # Raises OSError when there's a problem
if link_deleted:
group = self.results.get_group_of_duplicate(dupe)
ref = group.ref
@ -258,8 +274,9 @@ class DupeGuru(Broadcaster):
def _get_export_data(self):
columns = [
col for col in self.result_table.columns.ordered_columns
if col.visible and col.name != 'marked'
col
for col in self.result_table.columns.ordered_columns
if col.visible and col.name != "marked"
]
colnames = [col.display for col in columns]
rows = []
@ -273,10 +290,11 @@ class DupeGuru(Broadcaster):
def _results_changed(self):
self.selected_dupes = [
d for d in self.selected_dupes
d
for d in self.selected_dupes
if self.results.get_group_of_duplicate(d) is not None
]
self.notify('results_changed')
self.notify("results_changed")
def _start_job(self, jobid, func, args=()):
title = JOBID2TITLE[jobid]
@ -310,7 +328,9 @@ class DupeGuru(Broadcaster):
msg = {
JobType.Copy: tr("All marked files were copied successfully."),
JobType.Move: tr("All marked files were moved successfully."),
JobType.Delete: tr("All marked files were successfully sent to Trash."),
JobType.Delete: tr(
"All marked files were successfully sent to Trash."
),
}[jobid]
self.view.show_message(msg)
@ -341,9 +361,9 @@ class DupeGuru(Broadcaster):
if dupes == self.selected_dupes:
return
self.selected_dupes = dupes
self.notify('dupes_selected')
self.notify("dupes_selected")
#--- Protected
# --- Protected
def _get_fileclasses(self):
if self.app_mode == AppMode.Picture:
return [pe.photo.PLAT_SPECIFIC_PHOTO_CLASS]
@ -360,7 +380,7 @@ class DupeGuru(Broadcaster):
else:
return prioritize.all_categories()
#--- Public
# --- Public
def add_directory(self, d):
"""Adds folder ``d`` to :attr:`directories`.
@ -370,7 +390,7 @@ class DupeGuru(Broadcaster):
"""
try:
self.directories.add_path(Path(d))
self.notify('directories_changed')
self.notify("directories_changed")
except directories.AlreadyThereError:
self.view.show_message(tr("'{}' already is in the list.").format(d))
except directories.InvalidPathError:
@ -383,7 +403,9 @@ class DupeGuru(Broadcaster):
if not dupes:
self.view.show_message(MSG_NO_SELECTED_DUPES)
return
msg = tr("All selected %d matches are going to be ignored in all subsequent scans. Continue?")
msg = tr(
"All selected %d matches are going to be ignored in all subsequent scans. Continue?"
)
if not self.view.ask_yes_no(msg % len(dupes)):
return
for dupe in dupes:
@ -400,22 +422,22 @@ class DupeGuru(Broadcaster):
:param str filter: filter to apply
"""
self.results.apply_filter(None)
if self.options['escape_filter_regexp']:
filter = escape(filter, set('()[]\\.|+?^'))
filter = escape(filter, '*', '.')
if self.options["escape_filter_regexp"]:
filter = escape(filter, set("()[]\\.|+?^"))
filter = escape(filter, "*", ".")
self.results.apply_filter(filter)
self._results_changed()
def clean_empty_dirs(self, path):
if self.options['clean_empty_dirs']:
while delete_if_empty(path, ['.DS_Store']):
if self.options["clean_empty_dirs"]:
while delete_if_empty(path, [".DS_Store"]):
path = path.parent()
def clear_picture_cache(self):
try:
os.remove(self._get_picture_cache_path())
except FileNotFoundError:
pass # we don't care
pass # we don't care
def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType):
source_path = dupe.path
@ -444,6 +466,7 @@ class DupeGuru(Broadcaster):
:param bool copy: If True, duplicates will be copied instead of moved
"""
def do(j):
def op(dupe):
j.add_progress()
@ -459,7 +482,7 @@ class DupeGuru(Broadcaster):
prompt = tr("Select a directory to {} marked files to").format(opname)
destination = self.view.select_dest_folder(prompt)
if destination:
desttype = self.options['copymove_dest_type']
desttype = self.options["copymove_dest_type"]
jobid = JobType.Copy if copy else JobType.Move
self._start_job(jobid, do)
@ -472,8 +495,9 @@ class DupeGuru(Broadcaster):
if not self.deletion_options.show(self.results.mark_count):
return
args = [
self.deletion_options.link_deleted, self.deletion_options.use_hardlinks,
self.deletion_options.direct
self.deletion_options.link_deleted,
self.deletion_options.use_hardlinks,
self.deletion_options.direct,
]
logging.debug("Starting deletion job with args %r", args)
self._start_job(JobType.Delete, self._do_delete, args=args)
@ -495,7 +519,9 @@ class DupeGuru(Broadcaster):
The columns and their order in the resulting CSV file are determined in the same way as in
:meth:`export_to_xhtml`.
"""
dest_file = self.view.select_dest_file(tr("Select a destination for your exported CSV"), 'csv')
dest_file = self.view.select_dest_file(
tr("Select a destination for your exported CSV"), "csv"
)
if dest_file:
colnames, rows = self._get_export_data()
try:
@ -505,13 +531,16 @@ class DupeGuru(Broadcaster):
def get_display_info(self, dupe, group, delta=False):
def empty_data():
return {c.name: '---' for c in self.result_table.COLUMNS[1:]}
return {c.name: "---" for c in self.result_table.COLUMNS[1:]}
if (dupe is None) or (group is None):
return empty_data()
try:
return dupe.get_display_info(group, delta)
except Exception as e:
logging.warning("Exception on GetDisplayInfo for %s: %s", str(dupe.path), str(e))
logging.warning(
"Exception on GetDisplayInfo for %s: %s", str(dupe.path), str(e)
)
return empty_data()
def invoke_custom_command(self):
@ -521,9 +550,11 @@ class DupeGuru(Broadcaster):
is replaced with that dupe's ref file. If there's no selection, the command is not invoked.
If the dupe is a ref, ``%d`` and ``%r`` will be the same.
"""
cmd = self.view.get_default('CustomCommand')
cmd = self.view.get_default("CustomCommand")
if not cmd:
msg = tr("You have no custom command set up. Set it up in your preferences.")
msg = tr(
"You have no custom command set up. Set it up in your preferences."
)
self.view.show_message(msg)
return
if not self.selected_dupes:
@ -531,8 +562,8 @@ class DupeGuru(Broadcaster):
dupe = self.selected_dupes[0]
group = self.results.get_group_of_duplicate(dupe)
ref = group.ref
cmd = cmd.replace('%d', str(dupe.path))
cmd = cmd.replace('%r', str(ref.path))
cmd = cmd.replace("%d", str(dupe.path))
cmd = cmd.replace("%r", str(ref.path))
match = re.match(r'"([^"]+)"(.*)', cmd)
if match is not None:
# This code here is because subprocess.Popen doesn't seem to accept, under Windows,
@ -551,9 +582,9 @@ class DupeGuru(Broadcaster):
is persistent data, is the same as when the last session was closed (when :meth:`save` was
called).
"""
self.directories.load_from_file(op.join(self.appdata, 'last_directories.xml'))
self.notify('directories_changed')
p = op.join(self.appdata, 'ignore_list.xml')
self.directories.load_from_file(op.join(self.appdata, "last_directories.xml"))
self.notify("directories_changed")
p = op.join(self.appdata, "ignore_list.xml")
self.ignore_list.load_from_xml(p)
self.ignore_list_dialog.refresh()
@ -562,8 +593,10 @@ class DupeGuru(Broadcaster):
:param str filename: path of the XML file (created with :meth:`save_as`) to load
"""
def do(j):
self.results.load_from_xml(filename, self._get_file, j)
self._start_job(JobType.Load, do)
def make_selected_reference(self):
@ -588,35 +621,36 @@ class DupeGuru(Broadcaster):
if not self.result_table.power_marker:
if changed_groups:
self.selected_dupes = [
d for d in self.selected_dupes
d
for d in self.selected_dupes
if self.results.get_group_of_duplicate(d).ref is d
]
self.notify('results_changed')
self.notify("results_changed")
else:
# If we're in "Dupes Only" mode (previously called Power Marker), things are a bit
# different. The refs are not shown in the table, and if our operation is successful,
# this means that there's no way to follow our dupe selection. Then, the best thing to
# do is to keep our selection index-wise (different dupe selection, but same index
# selection).
self.notify('results_changed_but_keep_selection')
self.notify("results_changed_but_keep_selection")
def mark_all(self):
"""Set all dupes in the results as marked.
"""
self.results.mark_all()
self.notify('marking_changed')
self.notify("marking_changed")
def mark_none(self):
"""Set all dupes in the results as unmarked.
"""
self.results.mark_none()
self.notify('marking_changed')
self.notify("marking_changed")
def mark_invert(self):
"""Invert the marked state of all dupes in the results.
"""
self.results.mark_invert()
self.notify('marking_changed')
self.notify("marking_changed")
def mark_dupe(self, dupe, marked):
"""Change marked status of ``dupe``.
@ -629,7 +663,7 @@ class DupeGuru(Broadcaster):
self.results.mark(dupe)
else:
self.results.unmark(dupe)
self.notify('marking_changed')
self.notify("marking_changed")
def open_selected(self):
"""Open :attr:`selected_dupes` with their associated application.
@ -656,7 +690,7 @@ class DupeGuru(Broadcaster):
indexes = sorted(indexes, reverse=True)
for index in indexes:
del self.directories[index]
self.notify('directories_changed')
self.notify("directories_changed")
except IndexError:
pass
@ -669,7 +703,7 @@ class DupeGuru(Broadcaster):
:type duplicates: list of :class:`~core.fs.File`
"""
self.results.remove_duplicates(self.without_ref(duplicates))
self.notify('results_changed_but_keep_selection')
self.notify("results_changed_but_keep_selection")
def remove_marked(self):
"""Removed marked duplicates from the results (without touching the files themselves).
@ -724,7 +758,9 @@ class DupeGuru(Broadcaster):
if group.prioritize(key_func=sort_key):
count += 1
self._results_changed()
msg = tr("{} duplicate groups were changed by the re-prioritization.").format(count)
msg = tr("{} duplicate groups were changed by the re-prioritization.").format(
count
)
self.view.show_message(msg)
def reveal_selected(self):
@ -734,10 +770,10 @@ class DupeGuru(Broadcaster):
def save(self):
if not op.exists(self.appdata):
os.makedirs(self.appdata)
self.directories.save_to_file(op.join(self.appdata, 'last_directories.xml'))
p = op.join(self.appdata, 'ignore_list.xml')
self.directories.save_to_file(op.join(self.appdata, "last_directories.xml"))
p = op.join(self.appdata, "ignore_list.xml")
self.ignore_list.save_to_xml(p)
self.notify('save_session')
self.notify("save_session")
def save_as(self, filename):
"""Save results in ``filename``.
@ -756,7 +792,9 @@ class DupeGuru(Broadcaster):
"""
scanner = self.SCANNER_CLASS()
if not self.directories.has_any_file():
self.view.show_message(tr("The selected directories contain no scannable file."))
self.view.show_message(
tr("The selected directories contain no scannable file.")
)
return
# Send relevant options down to the scanner instance
for k, v in self.options.items():
@ -771,12 +809,16 @@ class DupeGuru(Broadcaster):
def do(j):
j.set_progress(0, tr("Collecting files to scan"))
if scanner.scan_type == ScanType.Folders:
files = list(self.directories.get_folders(folderclass=se.fs.Folder, j=j))
files = list(
self.directories.get_folders(folderclass=se.fs.Folder, j=j)
)
else:
files = list(self.directories.get_files(fileclasses=self.fileclasses, j=j))
if self.options['ignore_hardlink_matches']:
files = list(
self.directories.get_files(fileclasses=self.fileclasses, j=j)
)
if self.options["ignore_hardlink_matches"]:
files = self._remove_hardlink_dupes(files)
logging.info('Scanning %d files' % len(files))
logging.info("Scanning %d files" % len(files))
self.results.groups = scanner.get_dupe_groups(files, self.ignore_list, j)
self.discarded_file_count = scanner.discarded_file_count
@ -792,12 +834,16 @@ class DupeGuru(Broadcaster):
markfunc = self.results.mark
for dupe in selected:
markfunc(dupe)
self.notify('marking_changed')
self.notify("marking_changed")
def without_ref(self, dupes):
"""Returns ``dupes`` with all reference elements removed.
"""
return [dupe for dupe in dupes if self.results.get_group_of_duplicate(dupe).ref is not dupe]
return [
dupe
for dupe in dupes
if self.results.get_group_of_duplicate(dupe).ref is not dupe
]
def get_default(self, key, fallback_value=None):
result = nonone(self.view.get_default(key), fallback_value)
@ -812,7 +858,7 @@ class DupeGuru(Broadcaster):
def set_default(self, key, value):
self.view.set_default(key, value)
#--- Properties
# --- Properties
@property
def stat_line(self):
result = self.results.stat_line
@ -836,12 +882,21 @@ class DupeGuru(Broadcaster):
@property
def METADATA_TO_READ(self):
if self.app_mode == AppMode.Picture:
return ['size', 'mtime', 'dimensions', 'exif_timestamp']
return ["size", "mtime", "dimensions", "exif_timestamp"]
elif self.app_mode == AppMode.Music:
return [
'size', 'mtime', 'duration', 'bitrate', 'samplerate', 'title', 'artist',
'album', 'genre', 'year', 'track', 'comment'
"size",
"mtime",
"duration",
"bitrate",
"samplerate",
"title",
"artist",
"album",
"genre",
"year",
"track",
"comment",
]
else:
return ['size', 'mtime']
return ["size", "mtime"]

View File

@ -15,12 +15,13 @@ from hscommon.util import FileOrPath
from . import fs
__all__ = [
'Directories',
'DirectoryState',
'AlreadyThereError',
'InvalidPathError',
"Directories",
"DirectoryState",
"AlreadyThereError",
"InvalidPathError",
]
class DirectoryState:
"""Enum describing how a folder should be considered.
@ -28,16 +29,20 @@ class DirectoryState:
* DirectoryState.Reference: Scan files, but make sure never to delete any of them
* DirectoryState.Excluded: Don't scan this folder
"""
Normal = 0
Reference = 1
Excluded = 2
class AlreadyThereError(Exception):
"""The path being added is already in the directory list"""
class InvalidPathError(Exception):
"""The path being added is invalid"""
class Directories:
"""Holds user folder selection.
@ -47,7 +52,8 @@ class Directories:
Then, when the user starts the scan, :meth:`get_files` is called to retrieve all files (wrapped
in :mod:`core.fs`) that have to be scanned according to the chosen folders/states.
"""
#---Override
# ---Override
def __init__(self):
self._dirs = []
# {path: state}
@ -68,10 +74,10 @@ class Directories:
def __len__(self):
return len(self._dirs)
#---Private
# ---Private
def _default_state_for_path(self, path):
# Override this in subclasses to specify the state of some special folders.
if path.name.startswith('.'): # hidden
if path.name.startswith("."): # hidden
return DirectoryState.Excluded
def _get_files(self, from_path, fileclasses, j):
@ -83,11 +89,13 @@ class Directories:
# Recursively getting files from folders with lots of subfolders is expensive. However, there
# might be a subfolder in this path that is not excluded. What we want to do is to skim
# through self.states and see if we must continue, or we can stop right here to save time
if not any(p[:len(root)] == root for p in self.states):
if not any(p[: len(root)] == root for p in self.states):
del dirs[:]
try:
if state != DirectoryState.Excluded:
found_files = [fs.get_file(root + f, fileclasses=fileclasses) for f in files]
found_files = [
fs.get_file(root + f, fileclasses=fileclasses) for f in files
]
found_files = [f for f in found_files if f is not None]
# In some cases, directories can be considered as files by dupeGuru, which is
# why we have this line below. In fact, there's only one case: Bundle files under
@ -97,7 +105,11 @@ class Directories:
if f is not None:
found_files.append(f)
dirs.remove(d)
logging.debug("Collected %d files in folder %s", len(found_files), str(from_path))
logging.debug(
"Collected %d files in folder %s",
len(found_files),
str(from_path),
)
for file in found_files:
file.is_ref = state == DirectoryState.Reference
yield file
@ -118,7 +130,7 @@ class Directories:
except (EnvironmentError, fs.InvalidPath):
pass
#---Public
# ---Public
def add_path(self, path):
"""Adds ``path`` to self, if not already there.
@ -212,21 +224,21 @@ class Directories:
root = ET.parse(infile).getroot()
except Exception:
return
for rdn in root.getiterator('root_directory'):
for rdn in root.getiterator("root_directory"):
attrib = rdn.attrib
if 'path' not in attrib:
if "path" not in attrib:
continue
path = attrib['path']
path = attrib["path"]
try:
self.add_path(Path(path))
except (AlreadyThereError, InvalidPathError):
pass
for sn in root.getiterator('state'):
for sn in root.getiterator("state"):
attrib = sn.attrib
if not ('path' in attrib and 'value' in attrib):
if not ("path" in attrib and "value" in attrib):
continue
path = attrib['path']
state = attrib['value']
path = attrib["path"]
state = attrib["value"]
self.states[Path(path)] = int(state)
def save_to_file(self, outfile):
@ -234,17 +246,17 @@ class Directories:
:param file outfile: path or file pointer to XML file to save to.
"""
with FileOrPath(outfile, 'wb') as fp:
root = ET.Element('directories')
with FileOrPath(outfile, "wb") as fp:
root = ET.Element("directories")
for root_path in self:
root_path_node = ET.SubElement(root, 'root_directory')
root_path_node.set('path', str(root_path))
root_path_node = ET.SubElement(root, "root_directory")
root_path_node.set("path", str(root_path))
for path, state in self.states.items():
state_node = ET.SubElement(root, 'state')
state_node.set('path', str(path))
state_node.set('value', str(state))
state_node = ET.SubElement(root, "state")
state_node.set("path", str(path))
state_node.set("value", str(state))
tree = ET.ElementTree(root)
tree.write(fp, encoding='utf-8')
tree.write(fp, encoding="utf-8")
def set_state(self, path, state):
"""Set the state of folder at ``path``.
@ -259,4 +271,3 @@ class Directories:
if path.is_parent_of(iter_path):
del self.states[iter_path]
self.states[path] = state
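For a concrete picture of the round-trip above, this is the shape of the document that save_to_file writes and load_from_file reads back, given one root folder and one excluded subfolder (paths are made up; value 2 is DirectoryState.Excluded, per the enum earlier in this file):

    <?xml version='1.0' encoding='utf-8'?>
    <directories>
        <root_directory path="/home/user/Pictures" />
        <state path="/home/user/Pictures/cache" value="2" />
    </directories>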

View File

@ -17,25 +17,26 @@ from hscommon.util import flatten, multi_replace
from hscommon.trans import tr
from hscommon.jobprogress import job
(
WEIGHT_WORDS,
MATCH_SIMILAR_WORDS,
NO_FIELD_ORDER,
) = range(3)
(WEIGHT_WORDS, MATCH_SIMILAR_WORDS, NO_FIELD_ORDER,) = range(3)
JOB_REFRESH_RATE = 100
def getwords(s):
# We decompose the string so that ascii letters with accents can be part of the word.
s = normalize('NFD', s)
s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", ' ').lower()
s = ''.join(c for c in s if c in string.ascii_letters + string.digits + string.whitespace)
return [_f for _f in s.split(' ') if _f] # remove empty elements
s = normalize("NFD", s)
s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", " ").lower()
s = "".join(
c for c in s if c in string.ascii_letters + string.digits + string.whitespace
)
return [_f for _f in s.split(" ") if _f] # remove empty elements
def getfields(s):
fields = [getwords(field) for field in s.split(' - ')]
fields = [getwords(field) for field in s.split(" - ")]
return [_f for _f in fields if _f]
def unpack_fields(fields):
result = []
for field in fields:
@ -45,6 +46,7 @@ def unpack_fields(fields):
result.append(field)
return result
def compare(first, second, flags=()):
"""Returns the % of words that match between ``first`` and ``second``
@ -55,11 +57,11 @@ def compare(first, second, flags=()):
return 0
if any(isinstance(element, list) for element in first):
return compare_fields(first, second, flags)
second = second[:] #We must use a copy of second because we remove items from it
second = second[:] # We must use a copy of second because we remove items from it
match_similar = MATCH_SIMILAR_WORDS in flags
weight_words = WEIGHT_WORDS in flags
joined = first + second
total_count = (sum(len(word) for word in joined) if weight_words else len(joined))
total_count = sum(len(word) for word in joined) if weight_words else len(joined)
match_count = 0
in_order = True
for word in first:
@ -71,12 +73,13 @@ def compare(first, second, flags=()):
if second[0] != word:
in_order = False
second.remove(word)
match_count += (len(word) if weight_words else 1)
match_count += len(word) if weight_words else 1
result = round(((match_count * 2) / total_count) * 100)
if (result == 100) and (not in_order):
result = 99 # We cannot consider a match exact unless the ordering is the same
result = 99 # We cannot consider a match exact unless the ordering is the same
return result
def compare_fields(first, second, flags=()):
"""Returns the score for the lowest matching :ref:`fields`.
@ -87,7 +90,7 @@ def compare_fields(first, second, flags=()):
return 0
if NO_FIELD_ORDER in flags:
results = []
#We don't want to remove field directly in the list. We must work on a copy.
# We don't want to remove field directly in the list. We must work on a copy.
second = second[:]
for field1 in first:
max = 0
@ -101,9 +104,12 @@ def compare_fields(first, second, flags=()):
if matched_field:
second.remove(matched_field)
else:
results = [compare(field1, field2, flags) for field1, field2 in zip(first, second)]
results = [
compare(field1, field2, flags) for field1, field2 in zip(first, second)
]
return min(results) if results else 0
def build_word_dict(objects, j=job.nulljob):
"""Returns a dict of objects mapped by their words.
@ -113,11 +119,14 @@ def build_word_dict(objects, j=job.nulljob):
The result will be a dict with words as keys, lists of objects as values.
"""
result = defaultdict(set)
for object in j.iter_with_progress(objects, 'Prepared %d/%d files', JOB_REFRESH_RATE):
for object in j.iter_with_progress(
objects, "Prepared %d/%d files", JOB_REFRESH_RATE
):
for word in unpack_fields(object.words):
result[word].add(object)
return result
def merge_similar_words(word_dict):
"""Take all keys in ``word_dict`` that are similar, and merge them together.
@ -126,7 +135,7 @@ def merge_similar_words(word_dict):
a word equal to the other.
"""
keys = list(word_dict.keys())
keys.sort(key=len)# we want the shortest word to stay
keys.sort(key=len) # we want the shortest word to stay
while keys:
key = keys.pop(0)
similars = difflib.get_close_matches(key, keys, 100, 0.8)
@ -138,6 +147,7 @@ def merge_similar_words(word_dict):
del word_dict[similar]
keys.remove(similar)
def reduce_common_words(word_dict, threshold):
"""Remove all objects from ``word_dict`` values where the object count >= ``threshold``
@ -146,7 +156,9 @@ def reduce_common_words(word_dict, threshold):
The exceptions to this removal are the objects whose words are all common,
because if we remove them, we will miss some duplicates!
"""
uncommon_words = set(word for word, objects in word_dict.items() if len(objects) < threshold)
uncommon_words = set(
word for word, objects in word_dict.items() if len(objects) < threshold
)
for word, objects in list(word_dict.items()):
if len(objects) < threshold:
continue
@ -159,11 +171,13 @@ def reduce_common_words(word_dict, threshold):
else:
del word_dict[word]
# Writing docstrings in a namedtuple is tricky. From Python 3.3, it's possible to set __doc__, but
# some research allowed me to find a more elegant solution, which is what is done here. See
# http://stackoverflow.com/questions/1606436/adding-docstrings-to-namedtuples-in-python
class Match(namedtuple('Match', 'first second percentage')):
class Match(namedtuple("Match", "first second percentage")):
"""Represents a match between two :class:`~core.fs.File`.
Regardless of the matching method, when two files are determined to match, a Match pair is created,
@ -182,16 +196,24 @@ class Match(namedtuple('Match', 'first second percentage')):
their match level according to the scan method which found the match. int from 1 to 100. For
exact scan methods, such as Contents scans, this will always be 100.
"""
__slots__ = ()
def get_match(first, second, flags=()):
#it is assumed here that first and second both have a "words" attribute
# it is assumed here that first and second both have a "words" attribute
percentage = compare(first.words, second.words, flags)
return Match(first, second, percentage)
def getmatches(
objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
no_field_order=False, j=job.nulljob):
objects,
min_match_percentage=0,
match_similar_words=False,
weight_words=False,
no_field_order=False,
j=job.nulljob,
):
"""Returns a list of :class:`Match` within ``objects`` after fuzzily matching their words.
:param objects: List of :class:`~core.fs.File` to match.
@ -206,7 +228,7 @@ def getmatches(
j = j.start_subjob(2)
sj = j.start_subjob(2)
for o in objects:
if not hasattr(o, 'words'):
if not hasattr(o, "words"):
o.words = getwords(o.name)
word_dict = build_word_dict(objects, sj)
reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
@ -241,11 +263,15 @@ def getmatches(
except MemoryError:
# This is the place where the memory usage is at its peak during the scan.
# Just continue the process with an incomplete list of matches.
del compared # This should give us enough room to call logging.
logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
del compared # This should give us enough room to call logging.
logging.warning(
"Memory Overflow. Matches: %d. Word dict: %d"
% (len(result), len(word_dict))
)
return result
return result
def getmatches_by_contents(files, j=job.nulljob):
"""Returns a list of :class:`Match` within ``files`` if their contents is the same.
@ -263,13 +289,14 @@ def getmatches_by_contents(files, j=job.nulljob):
for group in possible_matches:
for first, second in itertools.combinations(group, 2):
if first.is_ref and second.is_ref:
continue # Don't spend time comparing two ref pics together.
continue # Don't spend time comparing two ref pics together.
if first.md5partial == second.md5partial:
if first.md5 == second.md5:
result.append(Match(first, second, 100))
j.add_progress(desc=tr("%d matches found") % len(result))
return result
class Group:
"""A group of :class:`~core.fs.File` that match together.
@ -297,7 +324,8 @@ class Group:
Average match percentage of match pairs containing :attr:`ref`.
"""
#---Override
# ---Override
def __init__(self):
self._clear()
@ -313,7 +341,7 @@ class Group:
def __len__(self):
return len(self.ordered)
#---Private
# ---Private
def _clear(self):
self._percentage = None
self._matches_for_ref = None
@ -328,7 +356,7 @@ class Group:
self._matches_for_ref = [match for match in self.matches if ref in match]
return self._matches_for_ref
#---Public
# ---Public
def add_match(self, match):
"""Adds ``match`` to internal match list and possibly add duplicates to the group.
@ -339,6 +367,7 @@ class Group:
:param tuple match: pair of :class:`~core.fs.File` to add
"""
def add_candidate(item, match):
matches = self.candidates[item]
matches.add(match)
@ -362,7 +391,11 @@ class Group:
You can call this after the duplicate scanning process to free a bit of memory.
"""
discarded = set(m for m in self.matches if not all(obj in self.unordered for obj in [m.first, m.second]))
discarded = set(
m
for m in self.matches
if not all(obj in self.unordered for obj in [m.first, m.second])
)
self.matches -= discarded
self.candidates = defaultdict(set)
return discarded
@ -409,7 +442,9 @@ class Group:
self.unordered.remove(item)
self._percentage = None
self._matches_for_ref = None
if (len(self) > 1) and any(not getattr(item, 'is_ref', False) for item in self):
if (len(self) > 1) and any(
not getattr(item, "is_ref", False) for item in self
):
if discard_matches:
self.matches = set(m for m in self.matches if item not in m)
else:
@ -438,7 +473,9 @@ class Group:
if self._percentage is None:
if self.dupes:
matches = self._get_matches_for_ref()
self._percentage = sum(match.percentage for match in matches) // len(matches)
self._percentage = sum(match.percentage for match in matches) // len(
matches
)
else:
self._percentage = 0
return self._percentage
@ -485,7 +522,7 @@ def get_groups(matches):
del dupe2group
del matches
# should free enough memory to continue
logging.warning('Memory Overflow. Groups: {0}'.format(len(groups)))
logging.warning("Memory Overflow. Groups: {0}".format(len(groups)))
# Now that we have groups, we have to discard each group's matches and see if there are any "orphan"
# matches, that is, matches that were candidates in a group but neither of whose files was
# accepted into the group. With these orphan matches, it's safe to build additional groups
@ -493,9 +530,12 @@ def get_groups(matches):
orphan_matches = []
for group in groups:
orphan_matches += {
m for m in group.discard_matches()
m
for m in group.discard_matches()
if not any(obj in matched_files for obj in [m.first, m.second])
}
if groups and orphan_matches:
groups += get_groups(orphan_matches) # no job, as it isn't supposed to take a long time
groups += get_groups(
orphan_matches
) # no job, as it isn't supposed to take a long time
return groups
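To make compare's scoring rule concrete, here is a minimal self-contained sketch of its unweighted case, mirroring the loop shown above (a standalone illustration, not an import of core.engine):

    def word_match_percentage(first, second):
        # Unweighted core.engine.compare: % of words shared between two word
        # lists, with an exact match demoted to 99 when the order differs.
        if not (first and second):
            return 0
        second = second[:]  # work on a copy; matched words are removed from it
        total_count = len(first) + len(second)
        match_count = 0
        in_order = True
        for word in first:
            if word in second:
                if second[0] != word:  # matched, but not the next expected word
                    in_order = False
                second.remove(word)
                match_count += 1
        result = round((match_count * 2) / total_count * 100)
        return 99 if result == 100 and not in_order else result

    print(word_match_percentage(["ace", "of", "base"], ["ace", "of", "spades"]))  # 67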

View File

@ -114,36 +114,42 @@ ROW_TEMPLATE = """
CELL_TEMPLATE = """<td>{value}</td>"""
def export_to_xhtml(colnames, rows):
# a row is a list of values with the first value being a flag indicating if the row should be indented
if rows:
assert len(rows[0]) == len(colnames) + 1 # + 1 is for the "indented" flag
colheaders = ''.join(COLHEADERS_TEMPLATE.format(name=name) for name in colnames)
assert len(rows[0]) == len(colnames) + 1 # + 1 is for the "indented" flag
colheaders = "".join(COLHEADERS_TEMPLATE.format(name=name) for name in colnames)
rendered_rows = []
previous_group_id = None
for row in rows:
# [2:] is to remove the indented flag + filename
if row[0] != previous_group_id:
# We've just changed dupe groups, which means that this dupe is a ref. We don't indent it.
indented = ''
indented = ""
else:
indented = 'indented'
indented = "indented"
filename = row[1]
cells = ''.join(CELL_TEMPLATE.format(value=value) for value in row[2:])
rendered_rows.append(ROW_TEMPLATE.format(indented=indented, filename=filename, cells=cells))
cells = "".join(CELL_TEMPLATE.format(value=value) for value in row[2:])
rendered_rows.append(
ROW_TEMPLATE.format(indented=indented, filename=filename, cells=cells)
)
previous_group_id = row[0]
rendered_rows = ''.join(rendered_rows)
rendered_rows = "".join(rendered_rows)
# The main template can't use format because the css code uses {}
content = MAIN_TEMPLATE.replace('$colheaders', colheaders).replace('$rows', rendered_rows)
content = MAIN_TEMPLATE.replace("$colheaders", colheaders).replace(
"$rows", rendered_rows
)
folder = mkdtemp()
destpath = op.join(folder, 'export.htm')
fp = open(destpath, 'wt', encoding='utf-8')
destpath = op.join(folder, "export.htm")
fp = open(destpath, "wt", encoding="utf-8")
fp.write(content)
fp.close()
return destpath
def export_to_csv(dest, colnames, rows):
writer = csv.writer(open(dest, 'wt', encoding='utf-8'))
writer = csv.writer(open(dest, "wt", encoding="utf-8"))
writer.writerow(["Group ID"] + colnames)
for row in rows:
writer.writerow(row)

View File

@ -17,19 +17,20 @@ import logging
from hscommon.util import nonone, get_file_ext
__all__ = [
'File',
'Folder',
'get_file',
'get_files',
'FSError',
'AlreadyExistsError',
'InvalidPath',
'InvalidDestinationError',
'OperationError',
"File",
"Folder",
"get_file",
"get_files",
"FSError",
"AlreadyExistsError",
"InvalidPath",
"InvalidDestinationError",
"OperationError",
]
NOT_SET = object()
class FSError(Exception):
cls_message = "An error has occured on '{name}' in '{parent}'"
@ -40,8 +41,8 @@ class FSError(Exception):
elif isinstance(fsobject, File):
name = fsobject.name
else:
name = ''
parentname = str(parent) if parent is not None else ''
name = ""
parentname = str(parent) if parent is not None else ""
Exception.__init__(self, message.format(name=name, parent=parentname))
@ -49,32 +50,39 @@ class AlreadyExistsError(FSError):
"The directory or file name we're trying to add already exists"
cls_message = "'{name}' already exists in '{parent}'"
class InvalidPath(FSError):
"The path of self is invalid, and cannot be worked with."
cls_message = "'{name}' is invalid."
class InvalidDestinationError(FSError):
"""A copy/move operation has been called, but the destination is invalid."""
cls_message = "'{name}' is an invalid destination for this operation."
class OperationError(FSError):
"""A copy/move/delete operation has been called, but the checkup after the
operation shows that it didn't work."""
cls_message = "Operation on '{name}' failed."
class File:
"""Represents a file and holds metadata to be used for scanning.
"""
INITIAL_INFO = {
'size': 0,
'mtime': 0,
'md5': '',
'md5partial': '',
"size": 0,
"mtime": 0,
"md5": "",
"md5partial": "",
}
# Slots for File save us quite a bit of memory. In a memory test I made with a lot of
# files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
# even greater when we take into account read attributes (70%!). Yeah, it's worth it.
__slots__ = ('path', 'is_ref', 'words') + tuple(INITIAL_INFO.keys())
__slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())
def __init__(self, path):
self.path = path
@ -90,25 +98,27 @@ class File:
try:
self._read_info(attrname)
except Exception as e:
logging.warning("An error '%s' was raised while decoding '%s'", e, repr(self.path))
logging.warning(
"An error '%s' was raised while decoding '%s'", e, repr(self.path)
)
result = object.__getattribute__(self, attrname)
if result is NOT_SET:
result = self.INITIAL_INFO[attrname]
return result
#This offset is where we should start reading the file to get a partial md5
#For audio file, it should be where audio data starts
# This offset is where we should start reading the file to get a partial md5
# For audio file, it should be where audio data starts
def _get_md5partial_offset_and_size(self):
return (0x4000, 0x4000) #16Kb
return (0x4000, 0x4000) # 16Kb
def _read_info(self, field):
if field in ('size', 'mtime'):
if field in ("size", "mtime"):
stats = self.path.stat()
self.size = nonone(stats.st_size, 0)
self.mtime = nonone(stats.st_mtime, 0)
elif field == 'md5partial':
elif field == "md5partial":
try:
fp = self.path.open('rb')
fp = self.path.open("rb")
offset, size = self._get_md5partial_offset_and_size()
fp.seek(offset)
partialdata = fp.read(size)
@ -117,14 +127,14 @@ class File:
fp.close()
except Exception:
pass
elif field == 'md5':
elif field == "md5":
try:
fp = self.path.open('rb')
fp = self.path.open("rb")
md5 = hashlib.md5()
# The goal here is to not run out of memory on really big files. However, the chunk
# size has to be large enough so that the python loop isn't too costly in terms of
# CPU.
CHUNK_SIZE = 1024 * 1024 # 1 mb
CHUNK_SIZE = 1024 * 1024 # 1 mb
filedata = fp.read(CHUNK_SIZE)
while filedata:
md5.update(filedata)
@ -144,7 +154,7 @@ class File:
for attrname in attrnames:
getattr(self, attrname)
#--- Public
# --- Public
@classmethod
def can_handle(cls, path):
"""Returns whether this file wrapper class can handle ``path``.
@ -170,7 +180,7 @@ class File:
"""
raise NotImplementedError()
#--- Properties
# --- Properties
@property
def extension(self):
return get_file_ext(self.name)
@ -189,7 +199,8 @@ class Folder(File):
It has the size/md5 info of a File, but its values are the sums of its subitems.
"""
__slots__ = File.__slots__ + ('_subfolders', )
__slots__ = File.__slots__ + ("_subfolders",)
def __init__(self, path):
File.__init__(self, path)
@ -201,12 +212,12 @@ class Folder(File):
return folders + files
def _read_info(self, field):
if field in {'size', 'mtime'}:
if field in {"size", "mtime"}:
size = sum((f.size for f in self._all_items()), 0)
self.size = size
stats = self.path.stat()
self.mtime = nonone(stats.st_mtime, 0)
elif field in {'md5', 'md5partial'}:
elif field in {"md5", "md5partial"}:
# What's sensitive here is that we must make sure that subfiles'
# md5 are always added up in the same order, but we also want a
# different md5 if a file gets moved to a different subdirectory.
@ -214,7 +225,7 @@ class Folder(File):
items = self._all_items()
items.sort(key=lambda f: f.path)
md5s = [getattr(f, field) for f in items]
return b''.join(md5s)
return b"".join(md5s)
md5 = hashlib.md5(get_dir_md5_concat())
digest = md5.digest()
@ -223,7 +234,9 @@ class Folder(File):
@property
def subfolders(self):
if self._subfolders is None:
subfolders = [p for p in self.path.listdir() if not p.islink() and p.isdir()]
subfolders = [
p for p in self.path.listdir() if not p.islink() and p.isdir()
]
self._subfolders = [self.__class__(p) for p in subfolders]
return self._subfolders
@ -244,6 +257,7 @@ def get_file(path, fileclasses=[File]):
if fileclass.can_handle(path):
return fileclass(path)
def get_files(path, fileclasses=[File]):
"""Returns a list of :class:`File` for each file contained in ``path``.

View File

@ -13,4 +13,3 @@ blue, which is supposed to be orange, does the sorting logic, holds selection, e
.. _cross-toolkit: http://www.hardcoded.net/articles/cross-toolkit-software
"""

View File

@ -8,6 +8,7 @@
from hscommon.notify import Listener
class DupeGuruGUIObject(Listener):
def __init__(self, app):
Listener.__init__(self, app)
@ -27,4 +28,3 @@ class DupeGuruGUIObject(Listener):
def results_changed_but_keep_selection(self):
pass

View File

@ -1,8 +1,8 @@
# Created On: 2012-05-30
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
import os
@ -10,42 +10,46 @@ import os
from hscommon.gui.base import GUIObject
from hscommon.trans import tr
class DeletionOptionsView:
"""Expected interface for :class:`DeletionOptions`'s view.
*Not actually used in the code. For documentation purposes only.*
Our view presents the user with an appropriate way (probably a mix of checkboxes and radio
buttons) to set the different flags in :class:`DeletionOptions`. Note that
:attr:`DeletionOptions.use_hardlinks` is only relevant if :attr:`DeletionOptions.link_deleted`
is true. This is why we toggle the "enabled" state of that flag.
We expect the view to set :attr:`DeletionOptions.link_deleted` immediately as the user changes
its value because it will toggle :meth:`set_hardlink_option_enabled`
Other than the flags, there's also a prompt message which has a dynamic content, defined by
:meth:`update_msg`.
"""
def update_msg(self, msg: str):
"""Update the dialog's prompt with ``str``.
"""
def show(self):
"""Show the dialog in a modal fashion.
Returns whether the dialog was "accepted" (the user pressed OK).
"""
def set_hardlink_option_enabled(self, is_enabled: bool):
"""Enable or disable the widget controlling :attr:`DeletionOptions.use_hardlinks`.
"""
class DeletionOptions(GUIObject):
"""Present the user with deletion options before proceeding.
When the user activates "Send to trash", we present them with a couple of options that change
the behavior of that deletion operation.
"""
def __init__(self):
GUIObject.__init__(self)
#: Whether symlinks or hardlinks are used when doing :attr:`link_deleted`.
@ -54,10 +58,10 @@ class DeletionOptions(GUIObject):
#: Delete dupes directly and don't send to trash.
#: *bool*. *get/set*
self.direct = False
def show(self, mark_count):
"""Prompt the user with a modal dialog offering our deletion options.
:param int mark_count: Number of dupes marked for deletion.
:rtype: bool
:returns: Whether the user accepted the dialog (we cancel deletion if false).
@ -69,7 +73,7 @@ class DeletionOptions(GUIObject):
msg = tr("You are sending {} file(s) to the Trash.").format(mark_count)
self.view.update_msg(msg)
return self.view.show()
def supports_links(self):
"""Returns whether our platform supports symlinks.
"""
@ -87,21 +91,19 @@ class DeletionOptions(GUIObject):
except TypeError:
# wrong number of arguments
return True
@property
def link_deleted(self):
"""Replace deleted dupes with symlinks (or hardlinks) to the dupe group reference.
*bool*. *get/set*
Whether the link is a symlink or hardlink is decided by :attr:`use_hardlinks`.
"""
return self._link_deleted
@link_deleted.setter
def link_deleted(self, value):
self._link_deleted = value
hardlinks_enabled = value and self.supports_links()
self.view.set_hardlink_option_enabled(hardlinks_enabled)
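The except TypeError branch visible in supports_links above hints at the probe it relies on: calling os.symlink() with no arguments. A sketch of the full check, assuming the parts hidden by this hunk follow that pattern:

    import os

    def supports_links():
        # If the platform has no symlink support, the bare call raises
        # NotImplementedError; if symlinks exist, Python complains about the
        # missing arguments instead, which is the TypeError handled above.
        try:
            os.symlink()
        except NotImplementedError:
            return False  # platform has no symlinks
        except TypeError:
            return True   # wrong number of arguments: os.symlink exists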

View File

@ -9,6 +9,7 @@
from hscommon.gui.base import GUIObject
from .base import DupeGuruGUIObject
class DetailsPanel(GUIObject, DupeGuruGUIObject):
def __init__(self, app):
GUIObject.__init__(self, multibind=True)
@ -19,7 +20,7 @@ class DetailsPanel(GUIObject, DupeGuruGUIObject):
self._refresh()
self.view.refresh()
#--- Private
# --- Private
def _refresh(self):
if self.app.selected_dupes:
dupe = self.app.selected_dupes[0]
@ -31,18 +32,19 @@ class DetailsPanel(GUIObject, DupeGuruGUIObject):
# we don't want the two sides of the table to display the stats for the same file
ref = group.ref if group is not None and group.ref is not dupe else None
data2 = self.app.get_display_info(ref, group, False)
columns = self.app.result_table.COLUMNS[1:] # first column is the 'marked' column
columns = self.app.result_table.COLUMNS[
1:
] # first column is the 'marked' column
self._table = [(c.display, data1[c.name], data2[c.name]) for c in columns]
#--- Public
# --- Public
def row_count(self):
return len(self._table)
def row(self, row_index):
return self._table[row_index]
#--- Event Handlers
# --- Event Handlers
def dupes_selected(self):
self._refresh()
self.view.refresh()

View File

@ -1,9 +1,9 @@
# Created By: Virgil Dupras
# Created On: 2010-02-06
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.gui.tree import Tree, Node
@ -13,6 +13,7 @@ from .base import DupeGuruGUIObject
STATE_ORDER = [DirectoryState.Normal, DirectoryState.Reference, DirectoryState.Excluded]
# Lazily loads children
class DirectoryNode(Node):
def __init__(self, tree, path, name):
@ -21,29 +22,31 @@ class DirectoryNode(Node):
self._directory_path = path
self._loaded = False
self._state = STATE_ORDER.index(self._tree.app.directories.get_state(path))
def __len__(self):
if not self._loaded:
self._load()
return Node.__len__(self)
def _load(self):
self.clear()
subpaths = self._tree.app.directories.get_subfolders(self._directory_path)
for path in subpaths:
self.append(DirectoryNode(self._tree, path, path.name))
self._loaded = True
def update_all_states(self):
self._state = STATE_ORDER.index(self._tree.app.directories.get_state(self._directory_path))
self._state = STATE_ORDER.index(
self._tree.app.directories.get_state(self._directory_path)
)
for node in self:
node.update_all_states()
# The state property is an index into the combobox
@property
def state(self):
return self._state
@state.setter
def state(self, value):
if value == self._state:
@ -52,29 +55,29 @@ class DirectoryNode(Node):
state = STATE_ORDER[value]
self._tree.app.directories.set_state(self._directory_path, state)
self._tree.update_all_states()
class DirectoryTree(Tree, DupeGuruGUIObject):
#--- model -> view calls:
# --- model -> view calls:
# refresh()
# refresh_states() # when only the state labels need to be refreshed
#
def __init__(self, app):
Tree.__init__(self)
DupeGuruGUIObject.__init__(self, app)
def _view_updated(self):
self._refresh()
self.view.refresh()
def _refresh(self):
self.clear()
for path in self.app.directories:
self.append(DirectoryNode(self, path, str(path)))
def add_directory(self, path):
self.app.add_directory(path)
def remove_selected(self):
selected_paths = self.selected_paths
if not selected_paths:
@ -90,18 +93,17 @@ class DirectoryTree(Tree, DupeGuruGUIObject):
newstate = DirectoryState.Normal
for node in nodes:
node.state = newstate
def select_all(self):
self.selected_nodes = list(self)
self.view.refresh()
def update_all_states(self):
for node in self:
node.update_all_states()
self.view.refresh_states()
#--- Event Handlers
# --- Event Handlers
def directories_changed(self):
self._refresh()
self.view.refresh()

View File

@ -8,8 +8,9 @@
from hscommon.trans import tr
from .ignore_list_table import IgnoreListTable
class IgnoreListDialog:
#--- View interface
# --- View interface
# show()
#
@ -21,7 +22,9 @@ class IgnoreListDialog:
def clear(self):
if not self.ignore_list:
return
msg = tr("Do you really want to remove all %d items from the ignore list?") % len(self.ignore_list)
msg = tr(
"Do you really want to remove all %d items from the ignore list?"
) % len(self.ignore_list)
if self.app.view.ask_yes_no(msg):
self.ignore_list.Clear()
self.refresh()
@ -36,4 +39,3 @@ class IgnoreListDialog:
def show(self):
self.view.show()

View File

@ -1,35 +1,36 @@
# Created By: Virgil Dupras
# Created On: 2012-03-13
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.gui.table import GUITable, Row
from hscommon.gui.column import Column, Columns
from hscommon.trans import trget
coltr = trget('columns')
coltr = trget("columns")
class IgnoreListTable(GUITable):
COLUMNS = [
# the str concat below saves us needless localization.
Column('path1', coltr("File Path") + " 1"),
Column('path2', coltr("File Path") + " 2"),
Column("path1", coltr("File Path") + " 1"),
Column("path2", coltr("File Path") + " 2"),
]
def __init__(self, ignore_list_dialog):
GUITable.__init__(self)
self.columns = Columns(self)
self.view = None
self.dialog = ignore_list_dialog
#--- Override
# --- Override
def _fill(self):
for path1, path2 in self.dialog.ignore_list:
self.append(IgnoreListRow(self, path1, path2))
class IgnoreListRow(Row):
def __init__(self, table, path1, path2):
@ -38,4 +39,3 @@ class IgnoreListRow(Row):
self.path2_original = path2
self.path1 = str(path1)
self.path2 = str(path2)

View File

@ -9,6 +9,7 @@
from hscommon.gui.base import GUIObject
from hscommon.gui.selectable_list import GUISelectableList
class CriterionCategoryList(GUISelectableList):
def __init__(self, dialog):
self.dialog = dialog
@ -18,6 +19,7 @@ class CriterionCategoryList(GUISelectableList):
self.dialog.select_category(self.dialog.categories[self.selected_index])
GUISelectableList._update_selection(self)
class PrioritizationList(GUISelectableList):
def __init__(self, dialog):
self.dialog = dialog
@ -41,6 +43,7 @@ class PrioritizationList(GUISelectableList):
del prilist[i]
self._refresh_contents()
class PrioritizeDialog(GUIObject):
def __init__(self, app):
GUIObject.__init__(self)
@ -52,15 +55,15 @@ class PrioritizeDialog(GUIObject):
self.prioritizations = []
self.prioritization_list = PrioritizationList(self)
#--- Override
# --- Override
def _view_updated(self):
self.category_list.select(0)
#--- Private
# --- Private
def _sort_key(self, dupe):
return tuple(crit.sort_key(dupe) for crit in self.prioritizations)
#--- Public
# --- Public
def select_category(self, category):
self.criteria = category.criteria_list()
self.criteria_list[:] = [c.display_value for c in self.criteria]

View File

@ -1,29 +1,29 @@
# Created By: Virgil Dupras
# Created On: 2010-04-12
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon import desktop
from .problem_table import ProblemTable
class ProblemDialog:
def __init__(self, app):
self.app = app
self._selected_dupe = None
self.problem_table = ProblemTable(self)
def refresh(self):
self._selected_dupe = None
self.problem_table.refresh()
def reveal_selected_dupe(self):
if self._selected_dupe is not None:
desktop.reveal_path(self._selected_dupe.path)
def select_dupe(self, dupe):
self._selected_dupe = dupe

View File

@ -1,39 +1,40 @@
# Created By: Virgil Dupras
# Created On: 2010-04-12
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.gui.table import GUITable, Row
from hscommon.gui.column import Column, Columns
from hscommon.trans import trget
coltr = trget('columns')
coltr = trget("columns")
class ProblemTable(GUITable):
COLUMNS = [
Column('path', coltr("File Path")),
Column('msg', coltr("Error Message")),
Column("path", coltr("File Path")),
Column("msg", coltr("Error Message")),
]
def __init__(self, problem_dialog):
GUITable.__init__(self)
self.columns = Columns(self)
self.dialog = problem_dialog
#--- Override
# --- Override
def _update_selection(self):
row = self.selected_row
dupe = row.dupe if row is not None else None
self.dialog.select_dupe(dupe)
def _fill(self):
problems = self.dialog.app.results.problems
for dupe, msg in problems:
self.append(ProblemRow(self, dupe, msg))
class ProblemRow(Row):
def __init__(self, table, dupe, msg):
@ -41,4 +42,3 @@ class ProblemRow(Row):
self.dupe = dupe
self.msg = msg
self.path = str(dupe.path)

View File

@ -1,9 +1,9 @@
# Created By: Virgil Dupras
# Created On: 2010-02-11
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from operator import attrgetter
@ -13,6 +13,7 @@ from hscommon.gui.column import Columns
from .base import DupeGuruGUIObject
class DupeRow(Row):
def __init__(self, table, group, dupe):
Row.__init__(self, table)
@ -22,14 +23,14 @@ class DupeRow(Row):
self._data = None
self._data_delta = None
self._delta_columns = None
def is_cell_delta(self, column_name):
"""Returns whether a cell is in delta mode (orange color).
If the result table is in delta mode, returns True if the column is one of the "delta
columns", that is, one of the columns that display a a differential value rather than an
absolute value.
If not, returns True if the dupe's value is different from its ref value.
"""
if not self.table.delta_values:
@ -42,62 +43,64 @@ class DupeRow(Row):
dupe_info = self.data
ref_info = self._group.ref.get_display_info(group=self._group, delta=False)
for key, value in dupe_info.items():
if (key not in self._delta_columns) and (ref_info[key].lower() != value.lower()):
if (key not in self._delta_columns) and (
ref_info[key].lower() != value.lower()
):
self._delta_columns.add(key)
return column_name in self._delta_columns
@property
def data(self):
if self._data is None:
self._data = self._app.get_display_info(self._dupe, self._group, False)
return self._data
@property
def data_delta(self):
if self._data_delta is None:
self._data_delta = self._app.get_display_info(self._dupe, self._group, True)
return self._data_delta
@property
def isref(self):
return self._dupe is self._group.ref
@property
def markable(self):
return self._app.results.is_markable(self._dupe)
@property
def marked(self):
return self._app.results.is_marked(self._dupe)
@marked.setter
def marked(self, value):
self._app.mark_dupe(self._dupe, value)
class ResultTable(GUITable, DupeGuruGUIObject):
def __init__(self, app):
GUITable.__init__(self)
DupeGuruGUIObject.__init__(self, app)
self.columns = Columns(self, prefaccess=app, savename='ResultTable')
self.columns = Columns(self, prefaccess=app, savename="ResultTable")
self._power_marker = False
self._delta_values = False
self._sort_descriptors = ('name', True)
#--- Override
self._sort_descriptors = ("name", True)
# --- Override
def _view_updated(self):
self._refresh_with_view()
def _restore_selection(self, previous_selection):
if self.app.selected_dupes:
to_find = set(self.app.selected_dupes)
indexes = [i for i, r in enumerate(self) if r._dupe in to_find]
self.selected_indexes = indexes
def _update_selection(self):
rows = self.selected_rows
self.app._select_dupes(list(map(attrgetter('_dupe'), rows)))
self.app._select_dupes(list(map(attrgetter("_dupe"), rows)))
def _fill(self):
if not self.power_marker:
for group in self.app.results.groups:
@ -108,22 +111,22 @@ class ResultTable(GUITable, DupeGuruGUIObject):
for dupe in self.app.results.dupes:
group = self.app.results.get_group_of_duplicate(dupe)
self.append(DupeRow(self, group, dupe))
def _refresh_with_view(self):
self.refresh()
self.view.show_selected_row()
#--- Public
# --- Public
def get_row_value(self, index, column):
try:
row = self[index]
except IndexError:
return '---'
return "---"
if self.delta_values:
return row.data_delta[column]
else:
return row.data[column]
def rename_selected(self, newname):
row = self.selected_row
if row is None:
@ -133,7 +136,7 @@ class ResultTable(GUITable, DupeGuruGUIObject):
row._data = None
row._data_delta = None
return self.app.rename_selected(newname)
def sort(self, key, asc):
if self.power_marker:
self.app.results.sort_dupes(key, asc, self.delta_values)
@ -141,12 +144,12 @@ class ResultTable(GUITable, DupeGuruGUIObject):
self.app.results.sort_groups(key, asc)
self._sort_descriptors = (key, asc)
self._refresh_with_view()
#--- Properties
# --- Properties
@property
def power_marker(self):
return self._power_marker
@power_marker.setter
def power_marker(self, value):
if value == self._power_marker:
@ -155,29 +158,29 @@ class ResultTable(GUITable, DupeGuruGUIObject):
key, asc = self._sort_descriptors
self.sort(key, asc)
# no need to refresh, it has happened in sort()
@property
def delta_values(self):
return self._delta_values
@delta_values.setter
def delta_values(self, value):
if value == self._delta_values:
return
self._delta_values = value
self.refresh()
@property
def selected_dupe_count(self):
return sum(1 for row in self.selected_rows if not row.isref)
#--- Event Handlers
# --- Event Handlers
def marking_changed(self):
self.view.invalidate_markings()
def results_changed(self):
self._refresh_with_view()
def results_changed_but_keep_selection(self):
# What we want to do here is that instead of restoring selected *dupes* after refresh, we
# restore selected *paths*.
@ -185,7 +188,6 @@ class ResultTable(GUITable, DupeGuruGUIObject):
self.refresh(refresh_view=False)
self.select(indexes)
self.view.refresh()
def save_session(self):
self.columns.save_columns()
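
Assuming the branch elided by the hunk above simply returns False when the table is not in delta mode, is_cell_delta() boils down to: a cell is "delta" (orange) if its column is a native delta column, or if its display value differs case-insensitively from the reference row's. A standalone sketch with hypothetical row data:

def is_cell_delta(column_name, delta_mode, delta_columns, dupe_info, ref_info):
    if not delta_mode:
        return False
    highlighted = set(delta_columns)
    # Any column whose display value differs from the ref's also gets highlighted.
    for key, value in dupe_info.items():
        if key not in highlighted and ref_info[key].lower() != value.lower():
            highlighted.add(key)
    return column_name in highlighted

# Hypothetical rows: "name" differs from the ref, "size" is a native delta column.
dupe = {"name": "IMG_001 copy", "size": "+12"}
ref = {"name": "IMG_001", "size": "3 MB"}
assert is_cell_delta("name", True, {"size"}, dupe, ref)
assert is_cell_delta("size", True, {"size"}, dupe, ref)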

View File

@ -1,21 +1,23 @@
# Created By: Virgil Dupras
# Created On: 2010-02-11
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from .base import DupeGuruGUIObject
class StatsLabel(DupeGuruGUIObject):
def _view_updated(self):
self.view.refresh()
@property
def display(self):
return self.app.stat_line
def results_changed(self):
self.view.refresh()
marking_changed = results_changed

View File

@ -10,13 +10,15 @@ from xml.etree import ElementTree as ET
from hscommon.util import FileOrPath
class IgnoreList:
"""An ignore list implementation that is iterable, filterable and exportable to XML.
Call Ignore to add an ignore list entry, and AreIgnored to check whether 2 items are in the list.
When iterated, 2-sized tuples are returned, each containing 2 items ignored together.
"""
#---Override
# ---Override
def __init__(self):
self._ignored = {}
self._count = 0
@ -29,7 +31,7 @@ class IgnoreList:
def __len__(self):
return self._count
#---Public
# ---Public
def AreIgnored(self, first, second):
def do_check(first, second):
try:
@ -99,14 +101,14 @@ class IgnoreList:
root = ET.parse(infile).getroot()
except Exception:
return
file_elems = (e for e in root if e.tag == 'file')
file_elems = (e for e in root if e.tag == "file")
for fn in file_elems:
file_path = fn.get('path')
file_path = fn.get("path")
if not file_path:
continue
subfile_elems = (e for e in fn if e.tag == 'file')
subfile_elems = (e for e in fn if e.tag == "file")
for sfn in subfile_elems:
subfile_path = sfn.get('path')
subfile_path = sfn.get("path")
if subfile_path:
self.Ignore(file_path, subfile_path)
@ -115,15 +117,13 @@ class IgnoreList:
outfile can be a file object or a filename.
"""
root = ET.Element('ignore_list')
root = ET.Element("ignore_list")
for filename, subfiles in self._ignored.items():
file_node = ET.SubElement(root, 'file')
file_node.set('path', filename)
file_node = ET.SubElement(root, "file")
file_node.set("path", filename)
for subfilename in subfiles:
subfile_node = ET.SubElement(file_node, 'file')
subfile_node.set('path', subfilename)
subfile_node = ET.SubElement(file_node, "file")
subfile_node.set("path", subfilename)
tree = ET.ElementTree(root)
with FileOrPath(outfile, 'wb') as fp:
tree.write(fp, encoding='utf-8')
with FileOrPath(outfile, "wb") as fp:
tree.write(fp, encoding="utf-8")

View File

@ -2,40 +2,41 @@
# Created On: 2006/02/23
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
class Markable:
def __init__(self):
self.__marked = set()
self.__inverted = False
#---Virtual
#About did_mark and did_unmark: They only happen when an object is actually added/removed
# ---Virtual
# About did_mark and did_unmark: They only happen when an object is actually added/removed
# in self.__marked, and are not affected by __inverted. Thus, self.mark while __inverted
#is True will launch _DidUnmark.
# is True will launch _DidUnmark.
def _did_mark(self, o):
pass
def _did_unmark(self, o):
pass
def _get_markable_count(self):
return 0
def _is_markable(self, o):
return True
#---Protected
# ---Protected
def _remove_mark_flag(self, o):
try:
self.__marked.remove(o)
self._did_unmark(o)
except KeyError:
pass
#---Public
pass
# ---Public
def is_marked(self, o):
if not self._is_markable(o):
return False
@ -43,31 +44,31 @@ class Markable:
if self.__inverted:
is_marked = not is_marked
return is_marked
def mark(self, o):
if self.is_marked(o):
return False
if not self._is_markable(o):
return False
return self.mark_toggle(o)
def mark_multiple(self, objects):
for o in objects:
self.mark(o)
def mark_all(self):
self.mark_none()
self.__inverted = True
def mark_invert(self):
self.__inverted = not self.__inverted
def mark_none(self):
for o in self.__marked:
self._did_unmark(o)
self.__marked = set()
self.__inverted = False
def mark_toggle(self, o):
try:
self.__marked.remove(o)
@ -78,32 +79,33 @@ class Markable:
self.__marked.add(o)
self._did_mark(o)
return True
def mark_toggle_multiple(self, objects):
for o in objects:
self.mark_toggle(o)
def unmark(self, o):
if not self.is_marked(o):
return False
return self.mark_toggle(o)
def unmark_multiple(self, objects):
for o in objects:
self.unmark(o)
#--- Properties
# --- Properties
@property
def mark_count(self):
if self.__inverted:
return self._get_markable_count() - len(self.__marked)
else:
return len(self.__marked)
@property
def mark_inverted(self):
return self.__inverted
class MarkableList(list, Markable):
def __init__(self):
list.__init__(self)

View File

@ -1 +1 @@
from . import fs, prioritize, result_table, scanner # noqa
from . import fs, prioritize, result_table, scanner # noqa

View File

@ -13,25 +13,37 @@ from core.util import format_timestamp, format_perc, format_words, format_dupe_c
from core import fs
TAG_FIELDS = {
'audiosize', 'duration', 'bitrate', 'samplerate', 'title', 'artist',
'album', 'genre', 'year', 'track', 'comment'
"audiosize",
"duration",
"bitrate",
"samplerate",
"title",
"artist",
"album",
"genre",
"year",
"track",
"comment",
}
class MusicFile(fs.File):
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
INITIAL_INFO.update({
'audiosize': 0,
'bitrate': 0,
'duration': 0,
'samplerate': 0,
'artist': '',
'album': '',
'title': '',
'genre': '',
'comment': '',
'year': '',
'track': 0,
})
INITIAL_INFO.update(
{
"audiosize": 0,
"bitrate": 0,
"duration": 0,
"samplerate": 0,
"artist": "",
"album": "",
"title": "",
"genre": "",
"comment": "",
"year": "",
"track": 0,
}
)
__slots__ = fs.File.__slots__ + tuple(INITIAL_INFO.keys())
@classmethod
@ -60,26 +72,26 @@ class MusicFile(fs.File):
else:
percentage = group.percentage
dupe_count = len(group.dupes)
dupe_folder_path = getattr(self, 'display_folder_path', self.folder_path)
dupe_folder_path = getattr(self, "display_folder_path", self.folder_path)
return {
'name': self.name,
'folder_path': str(dupe_folder_path),
'size': format_size(size, 2, 2, False),
'duration': format_time(duration, with_hours=False),
'bitrate': str(bitrate),
'samplerate': str(samplerate),
'extension': self.extension,
'mtime': format_timestamp(mtime, delta and m),
'title': self.title,
'artist': self.artist,
'album': self.album,
'genre': self.genre,
'year': self.year,
'track': str(self.track),
'comment': self.comment,
'percentage': format_perc(percentage),
'words': format_words(self.words) if hasattr(self, 'words') else '',
'dupe_count': format_dupe_count(dupe_count),
"name": self.name,
"folder_path": str(dupe_folder_path),
"size": format_size(size, 2, 2, False),
"duration": format_time(duration, with_hours=False),
"bitrate": str(bitrate),
"samplerate": str(samplerate),
"extension": self.extension,
"mtime": format_timestamp(mtime, delta and m),
"title": self.title,
"artist": self.artist,
"album": self.album,
"genre": self.genre,
"year": self.year,
"track": str(self.track),
"comment": self.comment,
"percentage": format_perc(percentage),
"words": format_words(self.words) if hasattr(self, "words") else "",
"dupe_count": format_dupe_count(dupe_count),
}
def _get_md5partial_offset_and_size(self):
@ -101,4 +113,3 @@ class MusicFile(fs.File):
self.comment = f.comment
self.year = f.year
self.track = f.track

View File

@ -8,11 +8,16 @@
from hscommon.trans import trget
from core.prioritize import (
KindCategory, FolderCategory, FilenameCategory, NumericalCategory,
SizeCategory, MtimeCategory
KindCategory,
FolderCategory,
FilenameCategory,
NumericalCategory,
SizeCategory,
MtimeCategory,
)
coltr = trget('columns')
coltr = trget("columns")
class DurationCategory(NumericalCategory):
NAME = coltr("Duration")
@ -20,21 +25,29 @@ class DurationCategory(NumericalCategory):
def extract_value(self, dupe):
return dupe.duration
class BitrateCategory(NumericalCategory):
NAME = coltr("Bitrate")
def extract_value(self, dupe):
return dupe.bitrate
class SamplerateCategory(NumericalCategory):
NAME = coltr("Samplerate")
def extract_value(self, dupe):
return dupe.samplerate
def all_categories():
return [
KindCategory, FolderCategory, FilenameCategory, SizeCategory, DurationCategory,
BitrateCategory, SamplerateCategory, MtimeCategory
KindCategory,
FolderCategory,
FilenameCategory,
SizeCategory,
DurationCategory,
BitrateCategory,
SamplerateCategory,
MtimeCategory,
]

View File

@ -1,8 +1,8 @@
# Created On: 2011-11-27
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.gui.column import Column
@ -10,28 +10,29 @@ from hscommon.trans import trget
from core.gui.result_table import ResultTable as ResultTableBase
coltr = trget('columns')
coltr = trget("columns")
class ResultTable(ResultTableBase):
COLUMNS = [
Column('marked', ''),
Column('name', coltr("Filename")),
Column('folder_path', coltr("Folder"), visible=False, optional=True),
Column('size', coltr("Size (MB)"), optional=True),
Column('duration', coltr("Time"), optional=True),
Column('bitrate', coltr("Bitrate"), optional=True),
Column('samplerate', coltr("Sample Rate"), visible=False, optional=True),
Column('extension', coltr("Kind"), optional=True),
Column('mtime', coltr("Modification"), visible=False, optional=True),
Column('title', coltr("Title"), visible=False, optional=True),
Column('artist', coltr("Artist"), visible=False, optional=True),
Column('album', coltr("Album"), visible=False, optional=True),
Column('genre', coltr("Genre"), visible=False, optional=True),
Column('year', coltr("Year"), visible=False, optional=True),
Column('track', coltr("Track Number"), visible=False, optional=True),
Column('comment', coltr("Comment"), visible=False, optional=True),
Column('percentage', coltr("Match %"), optional=True),
Column('words', coltr("Words Used"), visible=False, optional=True),
Column('dupe_count', coltr("Dupe Count"), visible=False, optional=True),
Column("marked", ""),
Column("name", coltr("Filename")),
Column("folder_path", coltr("Folder"), visible=False, optional=True),
Column("size", coltr("Size (MB)"), optional=True),
Column("duration", coltr("Time"), optional=True),
Column("bitrate", coltr("Bitrate"), optional=True),
Column("samplerate", coltr("Sample Rate"), visible=False, optional=True),
Column("extension", coltr("Kind"), optional=True),
Column("mtime", coltr("Modification"), visible=False, optional=True),
Column("title", coltr("Title"), visible=False, optional=True),
Column("artist", coltr("Artist"), visible=False, optional=True),
Column("album", coltr("Album"), visible=False, optional=True),
Column("genre", coltr("Genre"), visible=False, optional=True),
Column("year", coltr("Year"), visible=False, optional=True),
Column("track", coltr("Track Number"), visible=False, optional=True),
Column("comment", coltr("Comment"), visible=False, optional=True),
Column("percentage", coltr("Match %"), optional=True),
Column("words", coltr("Words Used"), visible=False, optional=True),
Column("dupe_count", coltr("Dupe Count"), visible=False, optional=True),
]
DELTA_COLUMNS = {'size', 'duration', 'bitrate', 'samplerate', 'mtime'}
DELTA_COLUMNS = {"size", "duration", "bitrate", "samplerate", "mtime"}

View File

@ -8,6 +8,7 @@ from hscommon.trans import tr
from core.scanner import Scanner as ScannerBase, ScanOption, ScanType
class ScannerME(ScannerBase):
@staticmethod
def _key_func(dupe):
@ -22,5 +23,3 @@ class ScannerME(ScannerBase):
ScanOption(ScanType.Tag, tr("Tags")),
ScanOption(ScanType.Contents, tr("Contents")),
]

View File

@ -1 +1,12 @@
from . import block, cache, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa
from . import ( # noqa
block,
cache,
exif,
iphoto_plist,
matchblock,
matchexif,
photo,
prioritize,
result_table,
scanner,
)

View File

@ -6,7 +6,7 @@
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from ._block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2 # NOQA
from ._block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2 # NOQA
# Converted to C
# def getblock(image):

View File

@ -4,7 +4,8 @@
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from ._cache import string_to_colors # noqa
from ._cache import string_to_colors # noqa
def colors_to_string(colors):
"""Transform the 3 sized tuples 'colors' into a hex string.
@ -12,7 +13,8 @@ def colors_to_string(colors):
[(0,100,255)] --> 0064ff
[(1,2,3),(4,5,6)] --> 010203040506
"""
return ''.join('%02x%02x%02x' % (r, g, b) for r, g, b in colors)
return "".join("%02x%02x%02x" % (r, g, b) for r, g, b in colors)
# This function is an important bottleneck of dupeGuru PE. It has been converted to C.
# def string_to_colors(s):
@ -23,4 +25,3 @@ def colors_to_string(colors):
# number = int(s[i:i+6], 16)
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
# return result
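
The commented-out reference implementation above is enough to reconstruct string_to_colors in pure Python for illustration (the shipped version is the C import at the top of the file), and a round trip shows the two directions are inverses:

def colors_to_string(colors):          # as defined above
    return "".join("%02x%02x%02x" % (r, g, b) for r, g, b in colors)

def py_string_to_colors(s):
    # Pure-Python mirror of the C string_to_colors, per the commented-out
    # reference above: 6 hex digits per color, 2 per channel.
    result = []
    for i in range(0, len(s), 6):
        number = int(s[i:i + 6], 16)
        result.append((number >> 16, (number >> 8) & 0xFF, number & 0xFF))
    return result

assert colors_to_string([(0, 100, 255)]) == "0064ff"
assert py_string_to_colors("010203040506") == [(1, 2, 3), (4, 5, 6)]
assert py_string_to_colors(colors_to_string([(1, 2, 3)])) == [(1, 2, 3)]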

View File

@ -12,29 +12,36 @@ from collections import namedtuple
from .cache import string_to_colors, colors_to_string
def wrap_path(path):
return 'path:{}'.format(path)
return "path:{}".format(path)
def unwrap_path(key):
return key[5:]
def wrap_id(path):
return 'id:{}'.format(path)
return "id:{}".format(path)
def unwrap_id(key):
return int(key[3:])
CacheRow = namedtuple('CacheRow', 'id path blocks mtime')
CacheRow = namedtuple("CacheRow", "id path blocks mtime")
class ShelveCache:
"""A class to cache picture blocks in a shelve backend.
"""
def __init__(self, db=None, readonly=False):
self.istmp = db is None
if self.istmp:
self.dtmp = tempfile.mkdtemp()
self.ftmp = db = op.join(self.dtmp, 'tmpdb')
flag = 'r' if readonly else 'c'
self.ftmp = db = op.join(self.dtmp, "tmpdb")
flag = "r" if readonly else "c"
self.shelve = shelve.open(db, flag)
self.maxid = self._compute_maxid()
@ -54,10 +61,10 @@ class ShelveCache:
return string_to_colors(self.shelve[skey].blocks)
def __iter__(self):
return (unwrap_path(k) for k in self.shelve if k.startswith('path:'))
return (unwrap_path(k) for k in self.shelve if k.startswith("path:"))
def __len__(self):
return sum(1 for k in self.shelve if k.startswith('path:'))
return sum(1 for k in self.shelve if k.startswith("path:"))
def __setitem__(self, path_str, blocks):
blocks = colors_to_string(blocks)
@ -74,7 +81,9 @@ class ShelveCache:
self.shelve[wrap_id(rowid)] = wrap_path(path_str)
def _compute_maxid(self):
return max((unwrap_id(k) for k in self.shelve if k.startswith('id:')), default=1)
return max(
(unwrap_id(k) for k in self.shelve if k.startswith("id:")), default=1
)
def _get_new_id(self):
self.maxid += 1
@ -133,4 +142,3 @@ class ShelveCache:
# #402 and #439. I don't think it hurts to silently ignore the error, so that's
# what we do
pass
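
shelve gives the cache a flat string keyspace, so ShelveCache multiplexes two logical tables into it: picture rows under path:<path> keys and an id-to-path index under id:<rowid> keys. The wrappers above are trivially invertible, and __iter__/__len__ recover just the path half by prefix, as this standalone snippet shows:

def wrap_path(path):
    return "path:{}".format(path)

def unwrap_path(key):
    return key[5:]                     # len("path:") == 5

def wrap_id(rowid):
    return "id:{}".format(rowid)

def unwrap_id(key):
    return int(key[3:])                # len("id:") == 3

assert unwrap_path(wrap_path("/photos/a.jpg")) == "/photos/a.jpg"
assert unwrap_id(wrap_id(42)) == 42

keys = ["path:/photos/a.jpg", "id:42"]
assert [unwrap_path(k) for k in keys if k.startswith("path:")] == ["/photos/a.jpg"]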

View File

@ -11,10 +11,12 @@ import sqlite3 as sqlite
from .cache import string_to_colors, colors_to_string
class SqliteCache:
"""A class to cache picture blocks in a sqlite backend.
"""
def __init__(self, db=':memory:', readonly=False):
def __init__(self, db=":memory:", readonly=False):
# readonly is not used in the sqlite version of the cache
self.dbname = db
self.con = None
@ -67,34 +69,40 @@ class SqliteCache:
try:
self.con.execute(sql, [blocks, mtime, path_str])
except sqlite.OperationalError:
logging.warning('Picture cache could not set value for key %r', path_str)
logging.warning("Picture cache could not set value for key %r", path_str)
except sqlite.DatabaseError as e:
logging.warning('DatabaseError while setting value for key %r: %s', path_str, str(e))
logging.warning(
"DatabaseError while setting value for key %r: %s", path_str, str(e)
)
def _create_con(self, second_try=False):
def create_tables():
logging.debug("Creating picture cache tables.")
self.con.execute("drop table if exists pictures")
self.con.execute("drop index if exists idx_path")
self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)")
self.con.execute(
"create table pictures(path TEXT, mtime INTEGER, blocks TEXT)"
)
self.con.execute("create index idx_path on pictures (path)")
self.con = sqlite.connect(self.dbname, isolation_level=None)
try:
self.con.execute("select path, mtime, blocks from pictures where 1=2")
except sqlite.OperationalError: # new db
except sqlite.OperationalError: # new db
create_tables()
except sqlite.DatabaseError as e: # corrupted db
except sqlite.DatabaseError as e: # corrupted db
if second_try:
raise # Something really strange is happening
logging.warning('Could not create picture cache because of an error: %s', str(e))
raise # Something really strange is happening
logging.warning(
"Could not create picture cache because of an error: %s", str(e)
)
self.con.close()
os.remove(self.dbname)
self._create_con(second_try=True)
def clear(self):
self.close()
if self.dbname != ':memory:':
if self.dbname != ":memory:":
os.remove(self.dbname)
self._create_con()
@ -117,7 +125,9 @@ class SqliteCache:
raise ValueError(path)
def get_multiple(self, rowids):
sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids))
sql = "select rowid, blocks from pictures where rowid in (%s)" % ",".join(
map(str, rowids)
)
cur = self.con.execute(sql)
return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
@ -138,6 +148,7 @@ class SqliteCache:
continue
todelete.append(rowid)
if todelete:
sql = "delete from pictures where rowid in (%s)" % ','.join(map(str, todelete))
sql = "delete from pictures where rowid in (%s)" % ",".join(
map(str, todelete)
)
self.con.execute(sql)
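
A note on the string-built IN (...) queries above: sqlite3 placeholders cannot parametrize a variable-length list, so the module interpolates the rowids directly, which stays injection-safe only because rowids are integers. For reference, a placeholder-based equivalent would look like this (a sketch, not how the module does it):

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)")
con.execute("insert into pictures(path, mtime, blocks) values ('a', 0, 'ff')")

rowids = [1, 2, 3]
sql = "select rowid, blocks from pictures where rowid in (%s)" % ",".join(
    "?" * len(rowids)                  # one "?" per rowid: "?,?,?"
)
cur = con.execute(sql, rowids)         # values passed separately
print(cur.fetchall())                  # [(1, 'ff')]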

View File

@ -83,17 +83,17 @@ EXIF_TAGS = {
0xA003: "PixelYDimension",
0xA004: "RelatedSoundFile",
0xA005: "InteroperabilityIFDPointer",
0xA20B: "FlashEnergy", # 0x920B in TIFF/EP
0xA20C: "SpatialFrequencyResponse", # 0x920C - -
0xA20E: "FocalPlaneXResolution", # 0x920E - -
0xA20F: "FocalPlaneYResolution", # 0x920F - -
0xA210: "FocalPlaneResolutionUnit", # 0x9210 - -
0xA214: "SubjectLocation", # 0x9214 - -
0xA215: "ExposureIndex", # 0x9215 - -
0xA217: "SensingMethod", # 0x9217 - -
0xA20B: "FlashEnergy", # 0x920B in TIFF/EP
0xA20C: "SpatialFrequencyResponse", # 0x920C - -
0xA20E: "FocalPlaneXResolution", # 0x920E - -
0xA20F: "FocalPlaneYResolution", # 0x920F - -
0xA210: "FocalPlaneResolutionUnit", # 0x9210 - -
0xA214: "SubjectLocation", # 0x9214 - -
0xA215: "ExposureIndex", # 0x9215 - -
0xA217: "SensingMethod", # 0x9217 - -
0xA300: "FileSource",
0xA301: "SceneType",
0xA302: "CFAPattern", # 0x828E in TIFF/EP
0xA302: "CFAPattern", # 0x828E in TIFF/EP
0xA401: "CustomRendered",
0xA402: "ExposureMode",
0xA403: "WhiteBalance",
@ -148,17 +148,18 @@ GPS_TA0GS = {
0x1B: "GPSProcessingMethod",
0x1C: "GPSAreaInformation",
0x1D: "GPSDateStamp",
0x1E: "GPSDifferential"
0x1E: "GPSDifferential",
}
INTEL_ENDIAN = ord('I')
MOTOROLA_ENDIAN = ord('M')
INTEL_ENDIAN = ord("I")
MOTOROLA_ENDIAN = ord("M")
# About MAX_COUNT: It's possible to have corrupted exif tags where the entry count is way too high
# and thus makes us loop, not endlessly, but for a heck of a long time for nothing. Therefore, we put
# an arbitrary limit on the entry count we'll allow ourselves to read and any IFD reporting more
# entries than that will be considered corrupt.
MAX_COUNT = 0xffff
MAX_COUNT = 0xFFFF
def s2n_motorola(bytes):
x = 0
@ -166,6 +167,7 @@ def s2n_motorola(bytes):
x = (x << 8) | c
return x
def s2n_intel(bytes):
x = 0
y = 0
@ -174,13 +176,14 @@ def s2n_intel(bytes):
y = y + 8
return x
class Fraction:
def __init__(self, num, den):
self.num = num
self.den = den
def __repr__(self):
return '%d/%d' % (self.num, self.den)
return "%d/%d" % (self.num, self.den)
class TIFF_file:
@ -190,16 +193,22 @@ class TIFF_file:
self.s2nfunc = s2n_intel if self.endian == INTEL_ENDIAN else s2n_motorola
def s2n(self, offset, length, signed=0, debug=False):
slice = self.data[offset:offset+length]
slice = self.data[offset : offset + length]
val = self.s2nfunc(slice)
# Sign extension ?
if signed:
msb = 1 << (8*length - 1)
msb = 1 << (8 * length - 1)
if val & msb:
val = val - (msb << 1)
if debug:
logging.debug(self.endian)
logging.debug("Slice for offset %d length %d: %r and value: %d", offset, length, slice, val)
logging.debug(
"Slice for offset %d length %d: %r and value: %d",
offset,
length,
slice,
val,
)
return val
def first_IFD(self):
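
The two s2n_* helpers above are plain big-endian (Motorola) and little-endian (Intel) byte-to-integer folds, and TIFF_file.s2n() adds optional two's-complement sign extension on top. Worked through on tiny inputs; s2n_intel's elided line is reconstructed here as x = x | (c << y), the usual form:

def s2n_motorola(data):                # big-endian, as above
    x = 0
    for c in data:
        x = (x << 8) | c
    return x

def s2n_intel(data):                   # little-endian, reconstructed
    x = 0
    y = 0
    for c in data:
        x = x | (c << y)
        y = y + 8
    return x

assert s2n_motorola(b"\x01\x02") == 0x0102   # 258
assert s2n_intel(b"\x01\x02") == 0x0201      # 513

# Sign extension as in TIFF_file.s2n(): for length=1, raw 0xFF means -1.
val, length = 0xFF, 1
msb = 1 << (8 * length - 1)
if val & msb:
    val = val - (msb << 1)
assert val == -1
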
@ -225,30 +234,31 @@ class TIFF_file:
return []
a = []
for i in range(entries):
entry = ifd + 2 + 12*i
entry = ifd + 2 + 12 * i
tag = self.s2n(entry, 2)
type = self.s2n(entry+2, 2)
type = self.s2n(entry + 2, 2)
if not 1 <= type <= 10:
continue # not handled
typelen = [1, 1, 2, 4, 8, 1, 1, 2, 4, 8][type-1]
count = self.s2n(entry+4, 4)
continue # not handled
typelen = [1, 1, 2, 4, 8, 1, 1, 2, 4, 8][type - 1]
count = self.s2n(entry + 4, 4)
if count > MAX_COUNT:
logging.debug("Probably corrupt. Aborting.")
return []
offset = entry+8
if count*typelen > 4:
offset = entry + 8
if count * typelen > 4:
offset = self.s2n(offset, 4)
if type == 2:
# Special case: nul-terminated ASCII string
values = str(self.data[offset:offset+count-1], encoding='latin-1')
values = str(self.data[offset : offset + count - 1], encoding="latin-1")
else:
values = []
signed = (type == 6 or type >= 8)
signed = type == 6 or type >= 8
for j in range(count):
if type in {5, 10}:
# The type is either 5 or 10
value_j = Fraction(self.s2n(offset, 4, signed),
self.s2n(offset+4, 4, signed))
value_j = Fraction(
self.s2n(offset, 4, signed), self.s2n(offset + 4, 4, signed)
)
else:
# Not a fraction
value_j = self.s2n(offset, typelen, signed)
@ -258,32 +268,37 @@ class TIFF_file:
a.append((tag, type, values))
return a
def read_exif_header(fp):
# If `fp`'s first bytes are not exif, we try to find the Exif header in the next 4kb
def isexif(data):
return data[0:4] == b'\377\330\377\341' and data[6:10] == b'Exif'
return data[0:4] == b"\377\330\377\341" and data[6:10] == b"Exif"
data = fp.read(12)
if isexif(data):
return data
# ok, not exif, try to find it
large_data = fp.read(4096)
try:
index = large_data.index(b'Exif')
data = large_data[index-6:index+6]
index = large_data.index(b"Exif")
data = large_data[index - 6 : index + 6]
# large_data omits the first 12 bytes, and the index is in the middle of the header, so we
# must seek index + 18
fp.seek(index+18)
fp.seek(index + 18)
return data
except ValueError:
raise ValueError("Not an Exif file")
def get_fields(fp):
data = read_exif_header(fp)
length = data[4] * 256 + data[5]
logging.debug("Exif header length: %d bytes", length)
data = fp.read(length-8)
data = fp.read(length - 8)
data_format = data[0]
logging.debug("%s format", {INTEL_ENDIAN: 'Intel', MOTOROLA_ENDIAN: 'Motorola'}[data_format])
logging.debug(
"%s format", {INTEL_ENDIAN: "Intel", MOTOROLA_ENDIAN: "Motorola"}[data_format]
)
T = TIFF_file(data)
# There may be more than one IFD per file, but we only read the first one because others are
# most likely thumbnails.
@ -294,9 +309,9 @@ def get_fields(fp):
try:
stag = EXIF_TAGS[tag]
except KeyError:
stag = '0x%04X' % tag
stag = "0x%04X" % tag
if stag in result:
return # don't overwrite data
return # don't overwrite data
result[stag] = values
logging.debug("IFD at offset %d", main_IFD_offset)

View File

@ -1,24 +1,26 @@
# Created By: Virgil Dupras
# Created On: 2014-03-15
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
import plistlib
class IPhotoPlistParser(plistlib._PlistParser):
"""A parser for iPhoto plists.
iPhoto plists tend to be malformed, so we have to subclass the built-in parser to be a bit more
lenient.
"""
def __init__(self):
plistlib._PlistParser.__init__(self, use_builtin_types=True, dict_type=dict)
# For debugging purposes, we remember the last bit of data to be analyzed so that we can
# log it in case of an exception
self.lastdata = ''
self.lastdata = ""
def get_data(self):
self.lastdata = plistlib._PlistParser.get_data(self)

View File

@ -48,14 +48,18 @@ except Exception:
logging.warning("Had problems to determine cpu count on launch.")
RESULTS_QUEUE_LIMIT = 8
def get_cache(cache_path, readonly=False):
if cache_path.endswith('shelve'):
if cache_path.endswith("shelve"):
from .cache_shelve import ShelveCache
return ShelveCache(cache_path, readonly=readonly)
else:
from .cache_sqlite import SqliteCache
return SqliteCache(cache_path, readonly=readonly)
def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
# The MemoryError handlers in there use logging without first caring about whether or not
# there is enough memory left to carry on the operation because it is assumed that the
@ -63,7 +67,7 @@ def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
# time that MemoryError is raised.
cache = get_cache(cache_path)
cache.purge_outdated()
prepared = [] # only pictures for which there was no error getting blocks
prepared = [] # only pictures for which there was no error getting blocks
try:
for picture in j.iter_with_progress(pictures, tr("Analyzed %d/%d pictures")):
if not picture.path:
@ -77,7 +81,7 @@ def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
picture.unicode_path = str(picture.path)
logging.debug("Analyzing picture at %s", picture.unicode_path)
if with_dimensions:
picture.dimensions # pre-read dimensions
picture.dimensions # pre-read dimensions
try:
if picture.unicode_path not in cache:
blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
@ -86,32 +90,45 @@ def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
except (IOError, ValueError) as e:
logging.warning(str(e))
except MemoryError:
logging.warning("Ran out of memory while reading %s of size %d", picture.unicode_path, picture.size)
if picture.size < 10 * 1024 * 1024: # We're really running out of memory
logging.warning(
"Ran out of memory while reading %s of size %d",
picture.unicode_path,
picture.size,
)
if (
picture.size < 10 * 1024 * 1024
): # We're really running out of memory
raise
except MemoryError:
logging.warning('Ran out of memory while preparing pictures')
logging.warning("Ran out of memory while preparing pictures")
cache.close()
return prepared
def get_chunks(pictures):
min_chunk_count = multiprocessing.cpu_count() * 2 # have enough chunks to feed all subprocesses
min_chunk_count = (
multiprocessing.cpu_count() * 2
) # have enough chunks to feed all subprocesses
chunk_count = len(pictures) // DEFAULT_CHUNK_SIZE
chunk_count = max(min_chunk_count, chunk_count)
chunk_size = (len(pictures) // chunk_count) + 1
chunk_size = max(MIN_CHUNK_SIZE, chunk_size)
logging.info(
"Creating %d chunks with a chunk size of %d for %d pictures", chunk_count,
chunk_size, len(pictures)
"Creating %d chunks with a chunk size of %d for %d pictures",
chunk_count,
chunk_size,
len(pictures),
)
chunks = [pictures[i:i+chunk_size] for i in range(0, len(pictures), chunk_size)]
chunks = [pictures[i : i + chunk_size] for i in range(0, len(pictures), chunk_size)]
return chunks
def get_match(first, second, percentage):
if percentage < 0:
percentage = 0
return Match(first, second, percentage)
def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
# The list of ids in ref_ids has to be compared to the list of ids in other_ids. other_ids
# can be None. In this case, ref_ids has to be compared with itself
@ -142,6 +159,7 @@ def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
cache.close()
return results
def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljob):
def get_picinfo(p):
if match_scaled:
@ -160,11 +178,16 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
async_results.remove(result)
comparison_count += 1
# About the NOQA below: I think there's a bug in pyflakes. To investigate...
progress_msg = tr("Performed %d/%d chunk matches") % (comparison_count, len(comparisons_to_do)) # NOQA
progress_msg = tr("Performed %d/%d chunk matches") % (
comparison_count,
len(comparisons_to_do),
) # NOQA
j.set_progress(comparison_count, progress_msg)
j = j.start_subjob([3, 7])
pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)
pictures = prepare_pictures(
pictures, cache_path, with_dimensions=not match_scaled, j=j
)
j = j.start_subjob([9, 1], tr("Preparing for matching"))
cache = get_cache(cache_path)
id2picture = {}
@ -175,7 +198,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
except ValueError:
pass
cache.close()
pictures = [p for p in pictures if hasattr(p, 'cache_id')]
pictures = [p for p in pictures if hasattr(p, "cache_id")]
pool = multiprocessing.Pool()
async_results = []
matches = []
@ -203,9 +226,17 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
# some wiggle room, log about the incident, and stop matching right here. We then process
# the matches we have. The rest of the process doesn't allocate much and we should be
# alright.
del comparisons_to_do, chunks, pictures # some wiggle room for the next statements
logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches))
del matches[-len(matches)//3:] # some wiggle room to ensure we don't run out of memory again.
del (
comparisons_to_do,
chunks,
pictures,
) # some wiggle room for the next statements
logging.warning(
"Ran out of memory when scanning! We had %d matches.", len(matches)
)
del matches[
-len(matches) // 3 :
] # some wiggle room to ensure we don't run out of memory again.
pool.close()
result = []
myiter = j.iter_with_progress(
@ -220,10 +251,10 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
if percentage == 100 and ref.md5 != other.md5:
percentage = 99
if percentage >= threshold:
ref.dimensions # pre-read dimensions for display in results
ref.dimensions # pre-read dimensions for display in results
other.dimensions
result.append(get_match(ref, other, percentage))
return result
multiprocessing.freeze_support()
multiprocessing.freeze_support()
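
get_chunks() above balances two constraints: at least cpu_count * 2 chunks so every subprocess stays fed, and no chunk smaller than MIN_CHUNK_SIZE. A worked sketch; the constant values here are hypothetical, the real ones are defined earlier in matchblock.py and not shown in this diff:

DEFAULT_CHUNK_SIZE = 250               # hypothetical value
MIN_CHUNK_SIZE = 100                   # hypothetical value
cpu_count = 4

def chunk_plan(n_pictures):
    min_chunk_count = cpu_count * 2                  # feed all subprocesses
    chunk_count = max(min_chunk_count, n_pictures // DEFAULT_CHUNK_SIZE)
    chunk_size = max(MIN_CHUNK_SIZE, (n_pictures // chunk_count) + 1)
    return chunk_count, chunk_size

# 10000 pictures -> 40 chunks of 251; 500 pictures -> the size floor kicks in.
assert chunk_plan(10000) == (40, 251)
assert chunk_plan(500) == (8, 100)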

View File

@ -13,14 +13,15 @@ from hscommon.trans import tr
from core.engine import Match
def getmatches(files, match_scaled, j):
timestamp2pic = defaultdict(set)
for picture in j.iter_with_progress(files, tr("Read EXIF of %d/%d pictures")):
timestamp = picture.exif_timestamp
if timestamp:
timestamp2pic[timestamp].add(picture)
if '0000:00:00 00:00:00' in timestamp2pic: # very likely false matches
del timestamp2pic['0000:00:00 00:00:00']
if "0000:00:00 00:00:00" in timestamp2pic: # very likely false matches
del timestamp2pic["0000:00:00 00:00:00"]
matches = []
for pictures in timestamp2pic.values():
for p1, p2 in combinations(pictures, 2):
@ -28,4 +29,3 @@ def getmatches(files, match_scaled, j):
continue
matches.append(Match(p1, p2, 100))
return matches
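
getmatches() above is a straight bucket-and-pair algorithm: group pictures by EXIF timestamp, drop the all-zeros bucket (cameras write 0000:00:00 00:00:00 when the clock was never set), then emit every in-bucket pair as a 100% match. Its core, runnable standalone; the real loop also has an elided guard that skips some pairs:

from collections import defaultdict
from itertools import combinations

def exif_matches(pics):                # pics: iterable of (name, timestamp)
    timestamp2pic = defaultdict(set)
    for name, ts in pics:
        if ts:
            timestamp2pic[ts].add(name)
    timestamp2pic.pop("0000:00:00 00:00:00", None)   # very likely false matches
    matches = []
    for names in timestamp2pic.values():
        for p1, p2 in combinations(sorted(names), 2):
            matches.append((p1, p2, 100))
    return matches

pics = [("a.jpg", "2015:06:01 12:00:00"), ("b.jpg", "2015:06:01 12:00:00"),
        ("c.jpg", "0000:00:00 00:00:00")]
assert exif_matches(pics) == [("a.jpg", "b.jpg", 100)]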

View File

@ -14,23 +14,22 @@ from . import exif
# This global value is set by the platform-specific subclasser of the Photo base class
PLAT_SPECIFIC_PHOTO_CLASS = None
def format_dimensions(dimensions):
return '%d x %d' % (dimensions[0], dimensions[1])
return "%d x %d" % (dimensions[0], dimensions[1])
def get_delta_dimensions(value, ref_value):
return (value[0]-ref_value[0], value[1]-ref_value[1])
return (value[0] - ref_value[0], value[1] - ref_value[1])
class Photo(fs.File):
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
INITIAL_INFO.update({
'dimensions': (0, 0),
'exif_timestamp': '',
})
INITIAL_INFO.update({"dimensions": (0, 0), "exif_timestamp": ""})
__slots__ = fs.File.__slots__ + tuple(INITIAL_INFO.keys())
# These extensions are supported on all platforms
HANDLED_EXTS = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'tif'}
HANDLED_EXTS = {"png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif"}
def _plat_get_dimensions(self):
raise NotImplementedError()
@ -39,25 +38,25 @@ class Photo(fs.File):
raise NotImplementedError()
def _get_orientation(self):
if not hasattr(self, '_cached_orientation'):
if not hasattr(self, "_cached_orientation"):
try:
with self.path.open('rb') as fp:
with self.path.open("rb") as fp:
exifdata = exif.get_fields(fp)
# the value is a list (probably one-sized) of ints
orientations = exifdata['Orientation']
orientations = exifdata["Orientation"]
self._cached_orientation = orientations[0]
except Exception: # Couldn't read EXIF data, no transforms
except Exception: # Couldn't read EXIF data, no transforms
self._cached_orientation = 0
return self._cached_orientation
def _get_exif_timestamp(self):
try:
with self.path.open('rb') as fp:
with self.path.open("rb") as fp:
exifdata = exif.get_fields(fp)
return exifdata['DateTimeOriginal']
return exifdata["DateTimeOriginal"]
except Exception:
logging.info("Couldn't read EXIF of picture: %s", self.path)
return ''
return ""
@classmethod
def can_handle(cls, path):
@ -79,28 +78,27 @@ class Photo(fs.File):
else:
percentage = group.percentage
dupe_count = len(group.dupes)
dupe_folder_path = getattr(self, 'display_folder_path', self.folder_path)
dupe_folder_path = getattr(self, "display_folder_path", self.folder_path)
return {
'name': self.name,
'folder_path': str(dupe_folder_path),
'size': format_size(size, 0, 1, False),
'extension': self.extension,
'dimensions': format_dimensions(dimensions),
'exif_timestamp': self.exif_timestamp,
'mtime': format_timestamp(mtime, delta and m),
'percentage': format_perc(percentage),
'dupe_count': format_dupe_count(dupe_count),
"name": self.name,
"folder_path": str(dupe_folder_path),
"size": format_size(size, 0, 1, False),
"extension": self.extension,
"dimensions": format_dimensions(dimensions),
"exif_timestamp": self.exif_timestamp,
"mtime": format_timestamp(mtime, delta and m),
"percentage": format_perc(percentage),
"dupe_count": format_dupe_count(dupe_count),
}
def _read_info(self, field):
fs.File._read_info(self, field)
if field == 'dimensions':
if field == "dimensions":
self.dimensions = self._plat_get_dimensions()
if self._get_orientation() in {5, 6, 7, 8}:
self.dimensions = (self.dimensions[1], self.dimensions[0])
elif field == 'exif_timestamp':
elif field == "exif_timestamp":
self.exif_timestamp = self._get_exif_timestamp()
def get_blocks(self, block_count_per_side):
return self._plat_get_blocks(block_count_per_side, self._get_orientation())
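
The dimensions swap in _read_info() above follows from EXIF semantics: orientation values 5 through 8 all encode a quarter-turn rotation (some combined with a flip), so the stored pixel width/height are transposed relative to the displayed image. As a sketch:

ROTATED_ORIENTATIONS = {5, 6, 7, 8}    # quarter-turn EXIF orientations

def display_dimensions(raw_dimensions, orientation):
    # Mirrors Photo._read_info: swap width/height when the EXIF orientation
    # says the image is rotated a quarter turn.
    if orientation in ROTATED_ORIENTATIONS:
        return (raw_dimensions[1], raw_dimensions[0])
    return raw_dimensions

assert display_dimensions((4000, 3000), 6) == (3000, 4000)   # portrait shot
assert display_dimensions((4000, 3000), 1) == (4000, 3000)   # normal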

View File

@ -8,11 +8,16 @@
from hscommon.trans import trget
from core.prioritize import (
KindCategory, FolderCategory, FilenameCategory, NumericalCategory,
SizeCategory, MtimeCategory
KindCategory,
FolderCategory,
FilenameCategory,
NumericalCategory,
SizeCategory,
MtimeCategory,
)
coltr = trget('columns')
coltr = trget("columns")
class DimensionsCategory(NumericalCategory):
NAME = coltr("Dimensions")
@ -24,8 +29,13 @@ class DimensionsCategory(NumericalCategory):
width, height = value
return (-width, -height)
def all_categories():
return [
KindCategory, FolderCategory, FilenameCategory, SizeCategory, DimensionsCategory,
MtimeCategory
KindCategory,
FolderCategory,
FilenameCategory,
SizeCategory,
DimensionsCategory,
MtimeCategory,
]

View File

@ -1,8 +1,8 @@
# Created On: 2011-11-27
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.gui.column import Column
@ -10,19 +10,20 @@ from hscommon.trans import trget
from core.gui.result_table import ResultTable as ResultTableBase
coltr = trget('columns')
coltr = trget("columns")
class ResultTable(ResultTableBase):
COLUMNS = [
Column('marked', ''),
Column('name', coltr("Filename")),
Column('folder_path', coltr("Folder"), optional=True),
Column('size', coltr("Size (KB)"), optional=True),
Column('extension', coltr("Kind"), visible=False, optional=True),
Column('dimensions', coltr("Dimensions"), optional=True),
Column('exif_timestamp', coltr("EXIF Timestamp"), visible=False, optional=True),
Column('mtime', coltr("Modification"), visible=False, optional=True),
Column('percentage', coltr("Match %"), optional=True),
Column('dupe_count', coltr("Dupe Count"), visible=False, optional=True),
Column("marked", ""),
Column("name", coltr("Filename")),
Column("folder_path", coltr("Folder"), optional=True),
Column("size", coltr("Size (KB)"), optional=True),
Column("extension", coltr("Kind"), visible=False, optional=True),
Column("dimensions", coltr("Dimensions"), optional=True),
Column("exif_timestamp", coltr("EXIF Timestamp"), visible=False, optional=True),
Column("mtime", coltr("Modification"), visible=False, optional=True),
Column("percentage", coltr("Match %"), optional=True),
Column("dupe_count", coltr("Dupe Count"), visible=False, optional=True),
]
DELTA_COLUMNS = {'size', 'dimensions', 'mtime'}
DELTA_COLUMNS = {"size", "dimensions", "mtime"}

View File

@ -10,6 +10,7 @@ from core.scanner import Scanner, ScanType, ScanOption
from . import matchblock, matchexif
class ScannerPE(Scanner):
cache_path = None
match_scaled = False
@ -28,10 +29,9 @@ class ScannerPE(Scanner):
cache_path=self.cache_path,
threshold=self.min_match_percentage,
match_scaled=self.match_scaled,
j=j
j=j,
)
elif self.scan_type == ScanType.ExifTimestamp:
return matchexif.getmatches(files, self.match_scaled, j)
else:
raise Exception("Invalid scan type")

View File

@ -1,48 +1,50 @@
# Created By: Virgil Dupras
# Created On: 2011/09/07
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.util import dedupe, flatten, rem_file_ext
from hscommon.trans import trget, tr
coltr = trget('columns')
coltr = trget("columns")
class CriterionCategory:
NAME = "Undefined"
def __init__(self, results):
self.results = results
#--- Virtual
# --- Virtual
def extract_value(self, dupe):
raise NotImplementedError()
def format_criterion_value(self, value):
return value
def sort_key(self, dupe, crit_value):
raise NotImplementedError()
def criteria_list(self):
raise NotImplementedError()
class Criterion:
def __init__(self, category, value):
self.category = category
self.value = value
self.display_value = category.format_criterion_value(value)
def sort_key(self, dupe):
return self.category.sort_key(dupe, self.value)
@property
def display(self):
return "{} ({})".format(self.category.NAME, self.display_value)
class ValueListCategory(CriterionCategory):
def sort_key(self, dupe, crit_value):
@ -52,45 +54,47 @@ class ValueListCategory(CriterionCategory):
return 0
else:
return 1
def criteria_list(self):
dupes = flatten(g[:] for g in self.results.groups)
values = sorted(dedupe(self.extract_value(d) for d in dupes))
return [Criterion(self, value) for value in values]
class KindCategory(ValueListCategory):
NAME = coltr("Kind")
def extract_value(self, dupe):
value = dupe.extension
if not value:
value = tr("None")
return value
class FolderCategory(ValueListCategory):
NAME = coltr("Folder")
def extract_value(self, dupe):
return dupe.folder_path
def format_criterion_value(self, value):
return str(value)
def sort_key(self, dupe, crit_value):
value = self.extract_value(dupe)
if value[:len(crit_value)] == crit_value:
if value[: len(crit_value)] == crit_value:
return 0
else:
return 1
class FilenameCategory(CriterionCategory):
NAME = coltr("Filename")
ENDS_WITH_NUMBER = 0
DOESNT_END_WITH_NUMBER = 1
LONGEST = 2
SHORTEST = 3
def format_criterion_value(self, value):
return {
self.ENDS_WITH_NUMBER: tr("Ends with number"),
@ -98,10 +102,10 @@ class FilenameCategory(CriterionCategory):
self.LONGEST: tr("Longest"),
self.SHORTEST: tr("Shortest"),
}[value]
def extract_value(self, dupe):
return rem_file_ext(dupe.name)
def sort_key(self, dupe, crit_value):
value = self.extract_value(dupe)
if crit_value in {self.ENDS_WITH_NUMBER, self.DOESNT_END_WITH_NUMBER}:
@ -113,50 +117,57 @@ class FilenameCategory(CriterionCategory):
else:
value = len(value)
if crit_value == self.LONGEST:
value *= -1 # We want the biggest values on top
value *= -1 # We want the biggest values on top
return value
def criteria_list(self):
return [Criterion(self, crit_value) for crit_value in [
self.ENDS_WITH_NUMBER,
self.DOESNT_END_WITH_NUMBER,
self.LONGEST,
self.SHORTEST,
]]
return [
Criterion(self, crit_value)
for crit_value in [
self.ENDS_WITH_NUMBER,
self.DOESNT_END_WITH_NUMBER,
self.LONGEST,
self.SHORTEST,
]
]
class NumericalCategory(CriterionCategory):
HIGHEST = 0
LOWEST = 1
def format_criterion_value(self, value):
return tr("Highest") if value == self.HIGHEST else tr("Lowest")
def invert_numerical_value(self, value): # Virtual
def invert_numerical_value(self, value): # Virtual
return value * -1
def sort_key(self, dupe, crit_value):
value = self.extract_value(dupe)
if crit_value == self.HIGHEST: # we want highest values on top
if crit_value == self.HIGHEST: # we want highest values on top
value = self.invert_numerical_value(value)
return value
def criteria_list(self):
return [Criterion(self, self.HIGHEST), Criterion(self, self.LOWEST)]
class SizeCategory(NumericalCategory):
NAME = coltr("Size")
def extract_value(self, dupe):
return dupe.size
class MtimeCategory(NumericalCategory):
NAME = coltr("Modification")
def extract_value(self, dupe):
return dupe.mtime
def format_criterion_value(self, value):
return tr("Newest") if value == self.HIGHEST else tr("Oldest")
def all_categories():
return [KindCategory, FolderCategory, FilenameCategory, SizeCategory, MtimeCategory]
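
Tying this back to PrioritizeDialog._sort_key() earlier in this commit: each chosen Criterion contributes one element to a sort-key tuple, and Python's lexicographic tuple ordering applies the criteria in priority order, with 0 meaning "preferred" per the ValueListCategory.sort_key convention above. A sketch with hypothetical (name, size) dupes:

dupes = [("IMG_1 copy", 100), ("IMG_1", 100), ("IMG_2", 50)]

def doesnt_end_with_number_key(dupe):
    # FilenameCategory.DOESNT_END_WITH_NUMBER: 0 when the name does not end
    # with a digit, following the 0-is-preferred convention.
    return 0 if not dupe[0][-1].isdigit() else 1

def size_highest_key(dupe):
    # NumericalCategory.HIGHEST: invert so the biggest values sort first.
    return -dupe[1]

prioritizations = [doesnt_end_with_number_key, size_highest_key]
dupes.sort(key=lambda d: tuple(crit(d) for crit in prioritizations))
assert dupes[0] == ("IMG_1 copy", 100)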

View File

@ -20,6 +20,7 @@ from hscommon.trans import tr
from . import engine
from .markable import Markable
class Results(Markable):
"""Manages a collection of duplicate :class:`~core.engine.Group`.
@ -34,21 +35,22 @@ class Results(Markable):
A list of all duplicates (:class:`~core.fs.File` instances), without ref, contained in the
currently managed :attr:`groups`.
"""
#---Override
# ---Override
def __init__(self, app):
Markable.__init__(self)
self.__groups = []
self.__group_of_duplicate = {}
self.__groups_sort_descriptor = None # This is a tuple (key, asc)
self.__groups_sort_descriptor = None # This is a tuple (key, asc)
self.__dupes = None
self.__dupes_sort_descriptor = None # This is a tuple (key, asc, delta)
self.__dupes_sort_descriptor = None # This is a tuple (key, asc, delta)
self.__filters = None
self.__filtered_dupes = None
self.__filtered_groups = None
self.__recalculate_stats()
self.__marked_size = 0
self.app = app
self.problems = [] # (dupe, error_msg)
self.problems = [] # (dupe, error_msg)
self.is_modified = False
def _did_mark(self, dupe):
@ -90,7 +92,7 @@ class Results(Markable):
else:
Markable.mark_none(self)
#---Private
# ---Private
def __get_dupe_list(self):
if self.__dupes is None:
self.__dupes = flatten(group.dupes for group in self.groups)
@ -98,10 +100,13 @@ class Results(Markable):
# This is debug logging to try to figure out #44
logging.warning(
"There is a None value in the Results' dupe list. dupes: %r groups: %r",
self.__dupes, self.groups
self.__dupes,
self.groups,
)
if self.__filtered_dupes:
self.__dupes = [dupe for dupe in self.__dupes if dupe in self.__filtered_dupes]
self.__dupes = [
dupe for dupe in self.__dupes if dupe in self.__filtered_dupes
]
sd = self.__dupes_sort_descriptor
if sd:
self.sort_dupes(sd[0], sd[1], sd[2])
@ -120,10 +125,18 @@ class Results(Markable):
total_count = self.__total_count
total_size = self.__total_size
else:
mark_count = len([dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)])
marked_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe))
total_count = len([dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)])
total_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe))
mark_count = len(
[dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)]
)
marked_size = sum(
dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe)
)
total_count = len(
[dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)]
)
total_size = sum(
dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe)
)
if self.mark_inverted:
marked_size = self.__total_size - marked_size
result = tr("%d / %d (%s / %s) duplicates marked.") % (
@ -133,7 +146,7 @@ class Results(Markable):
format_size(total_size, 2),
)
if self.__filters:
result += tr(" filter: %s") % ' --> '.join(self.__filters)
result += tr(" filter: %s") % " --> ".join(self.__filters)
return result
def __recalculate_stats(self):
@ -151,7 +164,7 @@ class Results(Markable):
for g in self.__groups:
for dupe in g:
self.__group_of_duplicate[dupe] = g
if not hasattr(dupe, 'is_ref'):
if not hasattr(dupe, "is_ref"):
dupe.is_ref = False
self.is_modified = bool(self.__groups)
old_filters = nonone(self.__filters, [])
@ -159,7 +172,7 @@ class Results(Markable):
for filter_str in old_filters:
self.apply_filter(filter_str)
#---Public
# ---Public
def apply_filter(self, filter_str):
"""Applies a filter ``filter_str`` to :attr:`groups`
@ -182,11 +195,15 @@ class Results(Markable):
try:
filter_re = re.compile(filter_str, re.IGNORECASE)
except re.error:
return # don't apply this filter.
return # don't apply this filter.
self.__filters.append(filter_str)
if self.__filtered_dupes is None:
self.__filtered_dupes = flatten(g[:] for g in self.groups)
self.__filtered_dupes = set(dupe for dupe in self.__filtered_dupes if filter_re.search(str(dupe.path)))
self.__filtered_dupes = set(
dupe
for dupe in self.__filtered_dupes
if filter_re.search(str(dupe.path))
)
filtered_groups = set()
for dupe in self.__filtered_dupes:
filtered_groups.add(self.get_group_of_duplicate(dupe))
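The chaining above means each apply_filter() call narrows the previously filtered set rather than starting over. A minimal standalone sketch of the same pattern (made-up paths for illustration; the real method filters dupe objects and then derives the affected groups):

import re

def apply_filters(paths, filter_strs):
    # Mirrors Results.apply_filter(): invalid patterns are skipped
    # silently, and each valid one narrows the surviving set further.
    current = set(paths)
    for filter_str in filter_strs:
        try:
            filter_re = re.compile(filter_str, re.IGNORECASE)
        except re.error:
            continue  # don't apply this filter
        current = {p for p in current if filter_re.search(p)}
    return current

apply_filters({"/a/foo.jpg", "/a/bar.png"}, ["foo", r"\.jpg$"])  # {"/a/foo.jpg"}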
@ -214,6 +231,7 @@ class Results(Markable):
:param get_file: a function f(path) returning a :class:`~core.fs.File` wrapping the path.
:param j: A :ref:`job progress instance <jobs>`.
"""
def do_match(ref_file, other_files, group):
if not other_files:
return
@ -223,31 +241,31 @@ class Results(Markable):
self.apply_filter(None)
root = ET.parse(infile).getroot()
group_elems = list(root.getiterator('group'))
group_elems = list(root.getiterator("group"))
groups = []
marked = set()
for group_elem in j.iter_with_progress(group_elems, every=100):
group = engine.Group()
dupes = []
for file_elem in group_elem.getiterator('file'):
path = file_elem.get('path')
words = file_elem.get('words', '')
for file_elem in group_elem.getiterator("file"):
path = file_elem.get("path")
words = file_elem.get("words", "")
if not path:
continue
file = get_file(path)
if file is None:
continue
file.words = words.split(',')
file.is_ref = file_elem.get('is_ref') == 'y'
file.words = words.split(",")
file.is_ref = file_elem.get("is_ref") == "y"
dupes.append(file)
if file_elem.get('marked') == 'y':
if file_elem.get("marked") == "y":
marked.add(file)
for match_elem in group_elem.getiterator('match'):
for match_elem in group_elem.getiterator("match"):
try:
attrs = match_elem.attrib
first_file = dupes[int(attrs['first'])]
second_file = dupes[int(attrs['second'])]
percentage = int(attrs['percentage'])
first_file = dupes[int(attrs["first"])]
second_file = dupes[int(attrs["second"])]
percentage = int(attrs["percentage"])
group.add_match(engine.Match(first_file, second_file, percentage))
except (IndexError, KeyError, ValueError):
# Covers missing attr, non-int values and indexes out of bounds
@ -339,9 +357,9 @@ class Results(Markable):
:param outfile: file object or path.
"""
self.apply_filter(None)
root = ET.Element('results')
root = ET.Element("results")
for g in self.groups:
group_elem = ET.SubElement(root, 'group')
group_elem = ET.SubElement(root, "group")
dupe2index = {}
for index, d in enumerate(g):
dupe2index[d] = index
@ -349,24 +367,24 @@ class Results(Markable):
words = engine.unpack_fields(d.words)
except AttributeError:
words = ()
file_elem = ET.SubElement(group_elem, 'file')
file_elem = ET.SubElement(group_elem, "file")
try:
file_elem.set('path', str(d.path))
file_elem.set('words', ','.join(words))
except ValueError: # If there's an invalid character, just skip the file
file_elem.set('path', '')
file_elem.set('is_ref', ('y' if d.is_ref else 'n'))
file_elem.set('marked', ('y' if self.is_marked(d) else 'n'))
file_elem.set("path", str(d.path))
file_elem.set("words", ",".join(words))
except ValueError: # If there's an invalid character, just skip the file
file_elem.set("path", "")
file_elem.set("is_ref", ("y" if d.is_ref else "n"))
file_elem.set("marked", ("y" if self.is_marked(d) else "n"))
for match in g.matches:
match_elem = ET.SubElement(group_elem, 'match')
match_elem.set('first', str(dupe2index[match.first]))
match_elem.set('second', str(dupe2index[match.second]))
match_elem.set('percentage', str(int(match.percentage)))
match_elem = ET.SubElement(group_elem, "match")
match_elem.set("first", str(dupe2index[match.first]))
match_elem.set("second", str(dupe2index[match.second]))
match_elem.set("percentage", str(int(match.percentage)))
tree = ET.ElementTree(root)
def do_write(outfile):
with FileOrPath(outfile, 'wb') as fp:
tree.write(fp, encoding='utf-8')
with FileOrPath(outfile, "wb") as fp:
tree.write(fp, encoding="utf-8")
try:
do_write(outfile)
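For readers skimming the reformatted save code, the XML it produces looks roughly like this; element and attribute names come from the diff above, while the concrete values here are invented:

import xml.etree.ElementTree as ET

root = ET.Element("results")
group_elem = ET.SubElement(root, "group")
file_elem = ET.SubElement(group_elem, "file")
file_elem.set("path", "/tmp/foo")   # str(d.path)
file_elem.set("words", "foo,bar")   # ",".join(words)
file_elem.set("is_ref", "n")
file_elem.set("marked", "y")
match_elem = ET.SubElement(group_elem, "match")
match_elem.set("first", "0")        # index of the match's first file in the group
match_elem.set("second", "1")
match_elem.set("percentage", "90")
ET.ElementTree(root).write("results.xml", encoding="utf-8")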
@ -392,7 +410,9 @@ class Results(Markable):
"""
if not self.__dupes:
self.__get_dupe_list()
keyfunc = lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta)
keyfunc = lambda d: self.app._get_dupe_sort_key(
d, lambda: self.get_group_of_duplicate(d), key, delta
)
self.__dupes.sort(key=keyfunc, reverse=not asc)
self.__dupes_sort_descriptor = (key, asc, delta)
@ -408,8 +428,7 @@ class Results(Markable):
self.groups.sort(key=keyfunc, reverse=not asc)
self.__groups_sort_descriptor = (key, asc)
#---Properties
# ---Properties
dupes = property(__get_dupe_list)
groups = property(__get_groups, __set_groups)
stat_line = property(__get_stat_line)

View File

@ -19,6 +19,7 @@ from . import engine
# there will be some nasty bugs popping up (ScanType is used in core when it should exclusively be
# used in core_*). One day I'll clean this up.
class ScanType:
Filename = 0
Fields = 1
@ -27,23 +28,26 @@ class ScanType:
Folders = 4
Contents = 5
#PE
# PE
FuzzyBlock = 10
ExifTimestamp = 11
ScanOption = namedtuple('ScanOption', 'scan_type label')
SCANNABLE_TAGS = ['track', 'artist', 'album', 'title', 'genre', 'year']
ScanOption = namedtuple("ScanOption", "scan_type label")
SCANNABLE_TAGS = ["track", "artist", "album", "title", "genre", "year"]
RE_DIGIT_ENDING = re.compile(r"\d+|\(\d+\)|\[\d+\]|{\d+}")
RE_DIGIT_ENDING = re.compile(r'\d+|\(\d+\)|\[\d+\]|{\d+}')
def is_same_with_digit(name, refname):
# Returns True if name is the same as refname, but with digits (with brackets or not) at the end
if not name.startswith(refname):
return False
end = name[len(refname):].strip()
end = name[len(refname) :].strip()
return RE_DIGIT_ENDING.match(end) is not None
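A few hypothetical calls make the regex's intent concrete (using the function defined just above):

# RE_DIGIT_ENDING matches trailing digits, bare or wrapped in (), [] or {}.
is_same_with_digit("photo (2)", "photo")   # True:  "(2)" is a digit ending
is_same_with_digit("photo [10]", "photo")  # True:  "[10]" is a digit ending
is_same_with_digit("photo copy", "photo")  # False: "copy" is not digits
is_same_with_digit("image (2)", "photo")   # False: doesn't start with refname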
def remove_dupe_paths(files):
# Returns files with duplicates-by-path removed. Files with the exact same path are considered
# duplicates and only the first file to have a path is kept. In certain cases, we have files
@ -57,25 +61,29 @@ def remove_dupe_paths(files):
if normalized in path2file:
try:
if op.samefile(normalized, str(path2file[normalized].path)):
continue # same file, it's a dupe
continue # same file, it's a dupe
else:
pass # We don't treat them as dupes
pass # We don't treat them as dupes
except OSError:
continue # File doesn't exist? Well, treat them as dupes
continue # File doesn't exist? Well, treat them as dupes
else:
path2file[normalized] = f
result.append(f)
return result
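In effect, the first file to claim a normalized path wins, and op.samefile() is consulted before discarding. A hypothetical run on a case-insensitive filesystem (fake file objects purely for illustration):

class FakeFile:
    def __init__(self, path):
        self.path = path

files = [FakeFile("/Pics/a.jpg"), FakeFile("/pics/A.JPG"), FakeFile("/pics/b.jpg")]
# The second path normalizes to the same key as the first; if
# op.samefile() confirms they share an inode, it is dropped:
# remove_dupe_paths(files) -> [files[0], files[2]]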
class Scanner:
def __init__(self):
self.discarded_file_count = 0
def _getmatches(self, files, j):
if self.size_threshold or self.scan_type in {ScanType.Contents, ScanType.Folders}:
if self.size_threshold or self.scan_type in {
ScanType.Contents,
ScanType.Folders,
}:
j = j.start_subjob([2, 8])
for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
f.size # pre-read; makes for smoother progress if read here (especially for bundles)
f.size  # pre-read; makes for smoother progress if read here (especially for bundles)
if self.size_threshold:
files = [f for f in files if f.size >= self.size_threshold]
if self.scan_type in {ScanType.Contents, ScanType.Folders}:
@ -83,12 +91,12 @@ class Scanner:
else:
j = j.start_subjob([2, 8])
kw = {}
kw['match_similar_words'] = self.match_similar_words
kw['weight_words'] = self.word_weighting
kw['min_match_percentage'] = self.min_match_percentage
kw["match_similar_words"] = self.match_similar_words
kw["weight_words"] = self.word_weighting
kw["min_match_percentage"] = self.min_match_percentage
if self.scan_type == ScanType.FieldsNoOrder:
self.scan_type = ScanType.Fields
kw['no_field_order'] = True
kw["no_field_order"] = True
func = {
ScanType.Filename: lambda f: engine.getwords(rem_file_ext(f.name)),
ScanType.Fields: lambda f: engine.getfields(rem_file_ext(f.name)),
@ -111,9 +119,9 @@ class Scanner:
def _tie_breaker(ref, dupe):
refname = rem_file_ext(ref.name).lower()
dupename = rem_file_ext(dupe.name).lower()
if 'copy' in dupename:
if "copy" in dupename:
return False
if 'copy' in refname:
if "copy" in refname:
return True
if is_same_with_digit(dupename, refname):
return False
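Taken together, the visible checks bias the reference choice away from names that look like copies; in table form (hypothetical names, and presumably a symmetric digit check follows the lines shown):

# ref name       dupe name       _tie_breaker -> promote dupe to ref?
# "photo"        "photo copy"    False ("copy" in dupe name)
# "photo copy"   "photo"         True  ("copy" in ref name)
# "photo"        "photo (2)"     False (dupe is refname + digits)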
@ -130,12 +138,12 @@ class Scanner:
raise NotImplementedError()
def get_dupe_groups(self, files, ignore_list=None, j=job.nulljob):
for f in (f for f in files if not hasattr(f, 'is_ref')):
for f in (f for f in files if not hasattr(f, "is_ref")):
f.is_ref = False
files = remove_dupe_paths(files)
logging.info("Getting matches. Scan type: %d", self.scan_type)
matches = self._getmatches(files, j)
logging.info('Found %d matches' % len(matches))
logging.info("Found %d matches" % len(matches))
j.set_progress(100, tr("Almost done! Fiddling with results..."))
# In removing what we call here "false matches", we first handle folder scans: if we scan by
# folders, we want to remove folder matches for which the parent is also in a match (they're
@ -153,20 +161,38 @@ class Scanner:
toremove.add(p)
else:
last_parent_path = p
matches = [m for m in matches if m.first.path not in toremove or m.second.path not in toremove]
matches = [
m
for m in matches
if m.first.path not in toremove or m.second.path not in toremove
]
if not self.mix_file_kind:
matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
matches = [m for m in matches if m.first.path.exists() and m.second.path.exists()]
matches = [
m
for m in matches
if get_file_ext(m.first.name) == get_file_ext(m.second.name)
]
matches = [
m for m in matches if m.first.path.exists() and m.second.path.exists()
]
matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
if ignore_list:
matches = [
m for m in matches
m
for m in matches
if not ignore_list.AreIgnored(str(m.first.path), str(m.second.path))
]
logging.info('Grouping matches')
logging.info("Grouping matches")
groups = engine.get_groups(matches)
if self.scan_type in {ScanType.Filename, ScanType.Fields, ScanType.FieldsNoOrder, ScanType.Tag}:
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
if self.scan_type in {
ScanType.Filename,
ScanType.Fields,
ScanType.FieldsNoOrder,
ScanType.Tag,
}:
matched_files = dedupe(
[m.first for m in matches] + [m.second for m in matches]
)
self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
else:
# Ticket #195
@ -181,7 +207,7 @@ class Scanner:
# reporting discarded matches.
self.discarded_file_count = 0
groups = [g for g in groups if any(not f.is_ref for f in g)]
logging.info('Created %d groups' % len(groups))
logging.info("Created %d groups" % len(groups))
for g in groups:
g.prioritize(self._key_func, self._tie_breaker)
return groups
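The comprehensions above amount to a small keep/drop pipeline over matches; condensed into one sketch (get_file_ext is the hscommon.util helper this module already imports; everything else is as in the diff):

def keep_match(m, mix_file_kind):
    # both sides must still exist on disk
    if not (m.first.path.exists() and m.second.path.exists()):
        return False
    # optionally require the same file extension on both sides
    if not mix_file_kind and get_file_ext(m.first.name) != get_file_ext(m.second.name):
        return False
    # a match between two reference files is useless
    if m.first.is_ref and m.second.is_ref:
        return False
    return True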
@ -190,7 +216,6 @@ class Scanner:
min_match_percentage = 80
mix_file_kind = True
scan_type = ScanType.Filename
scanned_tags = {'artist', 'title'}
scanned_tags = {"artist", "title"}
size_threshold = 0
word_weighting = False

View File

@ -1 +1 @@
from . import fs, result_table, scanner # noqa
from . import fs, result_table, scanner # noqa

View File

@ -11,6 +11,7 @@ from hscommon.util import format_size
from core import fs
from core.util import format_timestamp, format_perc, format_words, format_dupe_count
def get_display_info(dupe, group, delta):
size = dupe.size
mtime = dupe.mtime
@ -26,16 +27,17 @@ def get_display_info(dupe, group, delta):
percentage = group.percentage
dupe_count = len(group.dupes)
return {
'name': dupe.name,
'folder_path': str(dupe.folder_path),
'size': format_size(size, 0, 1, False),
'extension': dupe.extension,
'mtime': format_timestamp(mtime, delta and m),
'percentage': format_perc(percentage),
'words': format_words(dupe.words) if hasattr(dupe, 'words') else '',
'dupe_count': format_dupe_count(dupe_count),
"name": dupe.name,
"folder_path": str(dupe.folder_path),
"size": format_size(size, 0, 1, False),
"extension": dupe.extension,
"mtime": format_timestamp(mtime, delta and m),
"percentage": format_perc(percentage),
"words": format_words(dupe.words) if hasattr(dupe, "words") else "",
"dupe_count": format_dupe_count(dupe_count),
}
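The delta handling cut off at the top of this hunk follows the pattern used elsewhere in the codebase: in delta mode, numeric columns are shown relative to the group's reference file. Schematically (a sketch of the idea, not the full function):

def display_size(dupe, group, delta):
    size = dupe.size
    if delta:
        size -= group.ref.size  # show the difference against the reference
    return format_size(size, 0, 1, False)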
class File(fs.File):
def get_display_info(self, group, delta):
return get_display_info(self, group, delta)
@ -44,4 +46,3 @@ class File(fs.File):
class Folder(fs.Folder):
def get_display_info(self, group, delta):
return get_display_info(self, group, delta)

View File

@ -1,8 +1,8 @@
# Created On: 2011-11-27
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.gui.column import Column
@ -10,18 +10,19 @@ from hscommon.trans import trget
from core.gui.result_table import ResultTable as ResultTableBase
coltr = trget('columns')
coltr = trget("columns")
class ResultTable(ResultTableBase):
COLUMNS = [
Column('marked', ''),
Column('name', coltr("Filename")),
Column('folder_path', coltr("Folder"), optional=True),
Column('size', coltr("Size (KB)"), optional=True),
Column('extension', coltr("Kind"), visible=False, optional=True),
Column('mtime', coltr("Modification"), visible=False, optional=True),
Column('percentage', coltr("Match %"), optional=True),
Column('words', coltr("Words Used"), visible=False, optional=True),
Column('dupe_count', coltr("Dupe Count"), visible=False, optional=True),
Column("marked", ""),
Column("name", coltr("Filename")),
Column("folder_path", coltr("Folder"), optional=True),
Column("size", coltr("Size (KB)"), optional=True),
Column("extension", coltr("Kind"), visible=False, optional=True),
Column("mtime", coltr("Modification"), visible=False, optional=True),
Column("percentage", coltr("Match %"), optional=True),
Column("words", coltr("Words Used"), visible=False, optional=True),
Column("dupe_count", coltr("Dupe Count"), visible=False, optional=True),
]
DELTA_COLUMNS = {'size', 'mtime'}
DELTA_COLUMNS = {"size", "mtime"}

View File

@ -8,6 +8,7 @@ from hscommon.trans import tr
from core.scanner import Scanner as ScannerBase, ScanOption, ScanType
class ScannerSE(ScannerBase):
@staticmethod
def get_scan_options():
@ -16,4 +17,3 @@ class ScannerSE(ScannerBase):
ScanOption(ScanType.Contents, tr("Contents")),
ScanOption(ScanType.Folders, tr("Folders")),
]

View File

@ -20,93 +20,106 @@ from .results_test import GetTestGroups
from .. import app, fs, engine
from ..scanner import ScanType
def add_fake_files_to_directories(directories, files):
directories.get_files = lambda j=None: iter(files)
directories._dirs.append('this is just so Scan() doesnt return 3')
directories._dirs.append("this is just so Scan() doesnt return 3")
class TestCaseDupeGuru:
def test_apply_filter_calls_results_apply_filter(self, monkeypatch):
dgapp = TestApp().app
monkeypatch.setattr(dgapp.results, 'apply_filter', log_calls(dgapp.results.apply_filter))
dgapp.apply_filter('foo')
monkeypatch.setattr(
dgapp.results, "apply_filter", log_calls(dgapp.results.apply_filter)
)
dgapp.apply_filter("foo")
eq_(2, len(dgapp.results.apply_filter.calls))
call = dgapp.results.apply_filter.calls[0]
assert call['filter_str'] is None
assert call["filter_str"] is None
call = dgapp.results.apply_filter.calls[1]
eq_('foo', call['filter_str'])
eq_("foo", call["filter_str"])
def test_apply_filter_escapes_regexp(self, monkeypatch):
dgapp = TestApp().app
monkeypatch.setattr(dgapp.results, 'apply_filter', log_calls(dgapp.results.apply_filter))
dgapp.apply_filter('()[]\\.|+?^abc')
monkeypatch.setattr(
dgapp.results, "apply_filter", log_calls(dgapp.results.apply_filter)
)
dgapp.apply_filter("()[]\\.|+?^abc")
call = dgapp.results.apply_filter.calls[1]
eq_('\\(\\)\\[\\]\\\\\\.\\|\\+\\?\\^abc', call['filter_str'])
dgapp.apply_filter('(*)') # In "simple mode", we want the * to behave as a wildcard
eq_("\\(\\)\\[\\]\\\\\\.\\|\\+\\?\\^abc", call["filter_str"])
dgapp.apply_filter(
"(*)"
) # In "simple mode", we want the * to behave as a wilcard
call = dgapp.results.apply_filter.calls[3]
eq_(r'\(.*\)', call['filter_str'])
dgapp.options['escape_filter_regexp'] = False
dgapp.apply_filter('(abc)')
eq_(r"\(.*\)", call["filter_str"])
dgapp.options["escape_filter_regexp"] = False
dgapp.apply_filter("(abc)")
call = dgapp.results.apply_filter.calls[5]
eq_('(abc)', call['filter_str'])
eq_("(abc)", call["filter_str"])
def test_copy_or_move(self, tmpdir, monkeypatch):
# The goal here is just to have a test for a previous blowup I had. I know my test coverage
# for this unit is pathetic. What's done is done. My approach now is to add tests for
# every change I want to make. The blowup was caused by a missing import.
p = Path(str(tmpdir))
p['foo'].open('w').close()
monkeypatch.setattr(hscommon.conflict, 'smart_copy', log_calls(lambda source_path, dest_path: None))
p["foo"].open("w").close()
monkeypatch.setattr(
hscommon.conflict,
"smart_copy",
log_calls(lambda source_path, dest_path: None),
)
# XXX This monkeypatch is temporary. will be fixed in a better monkeypatcher.
monkeypatch.setattr(app, 'smart_copy', hscommon.conflict.smart_copy)
monkeypatch.setattr(os, 'makedirs', lambda path: None) # We don't want the test to create that fake directory
monkeypatch.setattr(app, "smart_copy", hscommon.conflict.smart_copy)
monkeypatch.setattr(
os, "makedirs", lambda path: None
) # We don't want the test to create that fake directory
dgapp = TestApp().app
dgapp.directories.add_path(p)
[f] = dgapp.directories.get_files()
dgapp.copy_or_move(f, True, 'some_destination', 0)
dgapp.copy_or_move(f, True, "some_destination", 0)
eq_(1, len(hscommon.conflict.smart_copy.calls))
call = hscommon.conflict.smart_copy.calls[0]
eq_(call['dest_path'], op.join('some_destination', 'foo'))
eq_(call['source_path'], f.path)
eq_(call["dest_path"], op.join("some_destination", "foo"))
eq_(call["source_path"], f.path)
def test_copy_or_move_clean_empty_dirs(self, tmpdir, monkeypatch):
tmppath = Path(str(tmpdir))
sourcepath = tmppath['source']
sourcepath = tmppath["source"]
sourcepath.mkdir()
sourcepath['myfile'].open('w')
sourcepath["myfile"].open("w")
app = TestApp().app
app.directories.add_path(tmppath)
[myfile] = app.directories.get_files()
monkeypatch.setattr(app, 'clean_empty_dirs', log_calls(lambda path: None))
app.copy_or_move(myfile, False, tmppath['dest'], 0)
monkeypatch.setattr(app, "clean_empty_dirs", log_calls(lambda path: None))
app.copy_or_move(myfile, False, tmppath["dest"], 0)
calls = app.clean_empty_dirs.calls
eq_(1, len(calls))
eq_(sourcepath, calls[0]['path'])
eq_(sourcepath, calls[0]["path"])
def test_Scan_with_objects_evaluating_to_false(self):
class FakeFile(fs.File):
def __bool__(self):
return False
# At some point, any() was used in a wrong way that made Scan() wrongly return 1
app = TestApp().app
f1, f2 = [FakeFile('foo') for i in range(2)]
f1, f2 = [FakeFile("foo") for i in range(2)]
f1.is_ref, f2.is_ref = (False, False)
assert not (bool(f1) and bool(f2))
add_fake_files_to_directories(app.directories, [f1, f2])
app.start_scanning() # no exception
app.start_scanning() # no exception
@mark.skipif("not hasattr(os, 'link')")
def test_ignore_hardlink_matches(self, tmpdir):
# If the ignore_hardlink_matches option is set, don't match files hardlinking to the same
# inode.
tmppath = Path(str(tmpdir))
tmppath['myfile'].open('w').write('foo')
os.link(str(tmppath['myfile']), str(tmppath['hardlink']))
tmppath["myfile"].open("w").write("foo")
os.link(str(tmppath["myfile"]), str(tmppath["hardlink"]))
app = TestApp().app
app.directories.add_path(tmppath)
app.options['scan_type'] = ScanType.Contents
app.options['ignore_hardlink_matches'] = True
app.options["scan_type"] = ScanType.Contents
app.options["ignore_hardlink_matches"] = True
app.start_scanning()
eq_(len(app.results.groups), 0)
@ -116,27 +129,32 @@ class TestCaseDupeGuru:
# making the selected row None. Don't crash when it happens.
dgapp = TestApp().app
# selected_row is None because there's no result.
assert not dgapp.result_table.rename_selected('foo') # no crash
assert not dgapp.result_table.rename_selected("foo") # no crash
class TestCaseDupeGuru_clean_empty_dirs:
def pytest_funcarg__do_setup(self, request):
monkeypatch = request.getfuncargvalue('monkeypatch')
monkeypatch.setattr(hscommon.util, 'delete_if_empty', log_calls(lambda path, files_to_delete=[]: None))
monkeypatch = request.getfuncargvalue("monkeypatch")
monkeypatch.setattr(
hscommon.util,
"delete_if_empty",
log_calls(lambda path, files_to_delete=[]: None),
)
# XXX This monkeypatch is temporary. will be fixed in a better monkeypatcher.
monkeypatch.setattr(app, 'delete_if_empty', hscommon.util.delete_if_empty)
monkeypatch.setattr(app, "delete_if_empty", hscommon.util.delete_if_empty)
self.app = TestApp().app
def test_option_off(self, do_setup):
self.app.clean_empty_dirs(Path('/foo/bar'))
self.app.clean_empty_dirs(Path("/foo/bar"))
eq_(0, len(hscommon.util.delete_if_empty.calls))
def test_option_on(self, do_setup):
self.app.options['clean_empty_dirs'] = True
self.app.clean_empty_dirs(Path('/foo/bar'))
self.app.options["clean_empty_dirs"] = True
self.app.clean_empty_dirs(Path("/foo/bar"))
calls = hscommon.util.delete_if_empty.calls
eq_(1, len(calls))
eq_(Path('/foo/bar'), calls[0]['path'])
eq_(['.DS_Store'], calls[0]['files_to_delete'])
eq_(Path("/foo/bar"), calls[0]["path"])
eq_([".DS_Store"], calls[0]["files_to_delete"])
def test_recurse_up(self, do_setup, monkeypatch):
# delete_if_empty must be recursively called up in the path until it returns False
@ -144,16 +162,16 @@ class TestCaseDupeGuru_clean_empty_dirs:
def mock_delete_if_empty(path, files_to_delete=[]):
return len(path) > 1
monkeypatch.setattr(hscommon.util, 'delete_if_empty', mock_delete_if_empty)
monkeypatch.setattr(hscommon.util, "delete_if_empty", mock_delete_if_empty)
# XXX This monkeypatch is temporary. will be fixed in a better monkeypatcher.
monkeypatch.setattr(app, 'delete_if_empty', mock_delete_if_empty)
self.app.options['clean_empty_dirs'] = True
self.app.clean_empty_dirs(Path('not-empty/empty/empty'))
monkeypatch.setattr(app, "delete_if_empty", mock_delete_if_empty)
self.app.options["clean_empty_dirs"] = True
self.app.clean_empty_dirs(Path("not-empty/empty/empty"))
calls = hscommon.util.delete_if_empty.calls
eq_(3, len(calls))
eq_(Path('not-empty/empty/empty'), calls[0]['path'])
eq_(Path('not-empty/empty'), calls[1]['path'])
eq_(Path('not-empty'), calls[2]['path'])
eq_(Path("not-empty/empty/empty"), calls[0]["path"])
eq_(Path("not-empty/empty"), calls[1]["path"])
eq_(Path("not-empty"), calls[2]["path"])
class TestCaseDupeGuruWithResults:
@ -166,10 +184,10 @@ class TestCaseDupeGuruWithResults:
self.dtree = app.dtree
self.rtable = app.rtable
self.rtable.refresh()
tmpdir = request.getfuncargvalue('tmpdir')
tmpdir = request.getfuncargvalue("tmpdir")
tmppath = Path(str(tmpdir))
tmppath['foo'].mkdir()
tmppath['bar'].mkdir()
tmppath["foo"].mkdir()
tmppath["bar"].mkdir()
self.app.directories.add_path(tmppath)
def test_GetObjects(self, do_setup):
@ -187,8 +205,8 @@ class TestCaseDupeGuruWithResults:
def test_GetObjects_after_sort(self, do_setup):
objects = self.objects
groups = self.groups[:] # we need an un-sorted reference
self.rtable.sort('name', False)
groups = self.groups[:] # we need an un-sorted reference
self.rtable.sort("name", False)
r = self.rtable[1]
assert r._group is groups[1]
assert r._dupe is objects[4]
@ -198,7 +216,7 @@ class TestCaseDupeGuruWithResults:
self.rtable.select([1, 2, 3])
self.app.remove_selected()
# The first 2 dupes have been removed. The 3rd one is a ref. it stays there, in first pos.
eq_(self.rtable.selected_indexes, [1]) # no exception
eq_(self.rtable.selected_indexes, [1]) # no exception
def test_selectResultNodePaths(self, do_setup):
app = self.app
@ -220,9 +238,9 @@ class TestCaseDupeGuruWithResults:
def test_selectResultNodePaths_after_sort(self, do_setup):
app = self.app
objects = self.objects
groups = self.groups[:] #To keep the old order in memory
self.rtable.sort('name', False) #0
#Now, the group order is supposed to be reversed
groups = self.groups[:] # To keep the old order in memory
self.rtable.sort("name", False) # 0
# Now, the group order is supposed to be reversed
self.rtable.select([1, 2, 3])
eq_(len(app.selected_dupes), 3)
assert app.selected_dupes[0] is objects[4]
@ -242,13 +260,13 @@ class TestCaseDupeGuruWithResults:
self.rtable.power_marker = True
self.rtable.select([0, 1, 2])
app.remove_selected()
eq_(self.rtable.selected_indexes, []) # no exception
eq_(self.rtable.selected_indexes, []) # no exception
def test_selectPowerMarkerRows_after_sort(self, do_setup):
app = self.app
objects = self.objects
self.rtable.power_marker = True
self.rtable.sort('name', False)
self.rtable.sort("name", False)
self.rtable.select([0, 1, 2])
eq_(len(app.selected_dupes), 3)
assert app.selected_dupes[0] is objects[4]
@ -285,11 +303,11 @@ class TestCaseDupeGuruWithResults:
def test_refreshDetailsWithSelected(self, do_setup):
self.rtable.select([1, 4])
eq_(self.dpanel.row(0), ('Filename', 'bar bleh', 'foo bar'))
self.dpanel.view.check_gui_calls(['refresh'])
eq_(self.dpanel.row(0), ("Filename", "bar bleh", "foo bar"))
self.dpanel.view.check_gui_calls(["refresh"])
self.rtable.select([])
eq_(self.dpanel.row(0), ('Filename', '---', '---'))
self.dpanel.view.check_gui_calls(['refresh'])
eq_(self.dpanel.row(0), ("Filename", "---", "---"))
self.dpanel.view.check_gui_calls(["refresh"])
def test_makeSelectedReference(self, do_setup):
app = self.app
@ -300,12 +318,14 @@ class TestCaseDupeGuruWithResults:
assert groups[0].ref is objects[1]
assert groups[1].ref is objects[4]
def test_makeSelectedReference_by_selecting_two_dupes_in_the_same_group(self, do_setup):
def test_makeSelectedReference_by_selecting_two_dupes_in_the_same_group(
self, do_setup
):
app = self.app
objects = self.objects
groups = self.groups
self.rtable.select([1, 2, 4])
#Only [0, 0] and [1, 0] must go ref, not [0, 1] because it is a part of the same group
# Only [0, 0] and [1, 0] must go ref, not [0, 1] because it is a part of the same group
app.make_selected_reference()
assert groups[0].ref is objects[1]
assert groups[1].ref is objects[4]
@ -314,7 +334,7 @@ class TestCaseDupeGuruWithResults:
app = self.app
self.rtable.select([1, 4])
app.remove_selected()
eq_(len(app.results.dupes), 1) # the first path is now selected
eq_(len(app.results.dupes), 1) # the first path is now selected
app.remove_selected()
eq_(len(app.results.dupes), 0)
@ -336,27 +356,27 @@ class TestCaseDupeGuruWithResults:
def test_addDirectory_does_not_exist(self, do_setup):
app = self.app
app.add_directory('/does_not_exist')
app.add_directory("/does_not_exist")
eq_(len(app.view.messages), 1)
assert "exist" in app.view.messages[0]
def test_ignore(self, do_setup):
app = self.app
self.rtable.select([4]) #The dupe of the second, 2 sized group
self.rtable.select([4]) # The dupe of the second, 2 sized group
app.add_selected_to_ignore_list()
eq_(len(app.ignore_list), 1)
self.rtable.select([1]) #first dupe of the 3 dupes group
self.rtable.select([1]) # first dupe of the 3 dupes group
app.add_selected_to_ignore_list()
#BOTH the ref and the other dupe should have been added
# BOTH the ref and the other dupe should have been added
eq_(len(app.ignore_list), 3)
def test_purgeIgnoreList(self, do_setup, tmpdir):
app = self.app
p1 = str(tmpdir.join('file1'))
p2 = str(tmpdir.join('file2'))
open(p1, 'w').close()
open(p2, 'w').close()
dne = '/does_not_exist'
p1 = str(tmpdir.join("file1"))
p2 = str(tmpdir.join("file2"))
open(p1, "w").close()
open(p2, "w").close()
dne = "/does_not_exist"
app.ignore_list.Ignore(dne, p1)
app.ignore_list.Ignore(p2, dne)
app.ignore_list.Ignore(p1, p2)
@ -381,9 +401,11 @@ class TestCaseDupeGuruWithResults:
# When doing a scan with results being present prior to the scan, correctly invalidate the
# results table.
app = self.app
app.JOB = Job(1, lambda *args, **kw: False) # Cancels the task
add_fake_files_to_directories(app.directories, self.objects) # We want the scan to at least start
app.start_scanning() # will be cancelled immediately
app.JOB = Job(1, lambda *args, **kw: False) # Cancels the task
add_fake_files_to_directories(
app.directories, self.objects
) # We want the scan to at least start
app.start_scanning() # will be cancelled immediately
eq_(len(app.result_table), 0)
def test_selected_dupes_after_removal(self, do_setup):
@ -401,21 +423,21 @@ class TestCaseDupeGuruWithResults:
# Ref #238
self.rtable.delta_values = True
self.rtable.power_marker = True
self.rtable.sort('dupe_count', False)
self.rtable.sort("dupe_count", False)
# don't crash
self.rtable.sort('percentage', False)
self.rtable.sort("percentage", False)
# don't crash
class TestCaseDupeGuru_renameSelected:
def pytest_funcarg__do_setup(self, request):
tmpdir = request.getfuncargvalue('tmpdir')
tmpdir = request.getfuncargvalue("tmpdir")
p = Path(str(tmpdir))
fp = open(str(p['foo bar 1']), mode='w')
fp = open(str(p["foo bar 1"]), mode="w")
fp.close()
fp = open(str(p['foo bar 2']), mode='w')
fp = open(str(p["foo bar 2"]), mode="w")
fp.close()
fp = open(str(p['foo bar 3']), mode='w')
fp = open(str(p["foo bar 3"]), mode="w")
fp.close()
files = fs.get_files(p)
for f in files:
@ -437,46 +459,46 @@ class TestCaseDupeGuru_renameSelected:
app = self.app
g = self.groups[0]
self.rtable.select([1])
assert app.rename_selected('renamed')
assert app.rename_selected("renamed")
names = [p.name for p in self.p.listdir()]
assert 'renamed' in names
assert 'foo bar 2' not in names
eq_(g.dupes[0].name, 'renamed')
assert "renamed" in names
assert "foo bar 2" not in names
eq_(g.dupes[0].name, "renamed")
def test_none_selected(self, do_setup, monkeypatch):
app = self.app
g = self.groups[0]
self.rtable.select([])
monkeypatch.setattr(logging, 'warning', log_calls(lambda msg: None))
assert not app.rename_selected('renamed')
msg = logging.warning.calls[0]['msg']
eq_('dupeGuru Warning: list index out of range', msg)
monkeypatch.setattr(logging, "warning", log_calls(lambda msg: None))
assert not app.rename_selected("renamed")
msg = logging.warning.calls[0]["msg"]
eq_("dupeGuru Warning: list index out of range", msg)
names = [p.name for p in self.p.listdir()]
assert 'renamed' not in names
assert 'foo bar 2' in names
eq_(g.dupes[0].name, 'foo bar 2')
assert "renamed" not in names
assert "foo bar 2" in names
eq_(g.dupes[0].name, "foo bar 2")
def test_name_already_exists(self, do_setup, monkeypatch):
app = self.app
g = self.groups[0]
self.rtable.select([1])
monkeypatch.setattr(logging, 'warning', log_calls(lambda msg: None))
assert not app.rename_selected('foo bar 1')
msg = logging.warning.calls[0]['msg']
assert msg.startswith('dupeGuru Warning: \'foo bar 1\' already exists in')
monkeypatch.setattr(logging, "warning", log_calls(lambda msg: None))
assert not app.rename_selected("foo bar 1")
msg = logging.warning.calls[0]["msg"]
assert msg.startswith("dupeGuru Warning: 'foo bar 1' already exists in")
names = [p.name for p in self.p.listdir()]
assert 'foo bar 1' in names
assert 'foo bar 2' in names
eq_(g.dupes[0].name, 'foo bar 2')
assert "foo bar 1" in names
assert "foo bar 2" in names
eq_(g.dupes[0].name, "foo bar 2")
class TestAppWithDirectoriesInTree:
def pytest_funcarg__do_setup(self, request):
tmpdir = request.getfuncargvalue('tmpdir')
tmpdir = request.getfuncargvalue("tmpdir")
p = Path(str(tmpdir))
p['sub1'].mkdir()
p['sub2'].mkdir()
p['sub3'].mkdir()
p["sub1"].mkdir()
p["sub2"].mkdir()
p["sub3"].mkdir()
app = TestApp()
self.app = app.app
self.dtree = app.dtree
@ -487,12 +509,11 @@ class TestAppWithDirectoriesInTree:
# Setting a node state to something also affect subnodes. These subnodes must be correctly
# refreshed.
node = self.dtree[0]
eq_(len(node), 3) # a len() call is required for subnodes to be loaded
eq_(len(node), 3) # a len() call is required for subnodes to be loaded
subnode = node[0]
node.state = 1 # the state property is a state index
node.state = 1 # the state property is a state index
node = self.dtree[0]
eq_(len(node), 3)
subnode = node[0]
eq_(subnode.state, 1)
self.dtree.view.check_gui_calls(['refresh_states'])
self.dtree.view.check_gui_calls(["refresh_states"])

View File

@ -4,7 +4,7 @@
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.testutil import TestApp as TestAppBase, CallLogger, eq_, with_app # noqa
from hscommon.testutil import TestApp as TestAppBase, CallLogger, eq_, with_app # noqa
from hscommon.path import Path
from hscommon.util import get_file_ext, format_size
from hscommon.gui.column import Column
@ -17,6 +17,7 @@ from ..app import DupeGuru as DupeGuruBase
from ..gui.result_table import ResultTable as ResultTableBase
from ..gui.prioritize_dialog import PrioritizeDialog
class DupeGuruView:
JOB = nulljob
@ -39,28 +40,32 @@ class DupeGuruView:
self.messages.append(msg)
def ask_yes_no(self, prompt):
return True # always answer yes
return True # always answer yes
def create_results_window(self):
pass
class ResultTable(ResultTableBase):
COLUMNS = [
Column('marked', ''),
Column('name', 'Filename'),
Column('folder_path', 'Directory'),
Column('size', 'Size (KB)'),
Column('extension', 'Kind'),
Column("marked", ""),
Column("name", "Filename"),
Column("folder_path", "Directory"),
Column("size", "Size (KB)"),
Column("extension", "Kind"),
]
DELTA_COLUMNS = {'size', }
DELTA_COLUMNS = {
"size",
}
class DupeGuru(DupeGuruBase):
NAME = 'dupeGuru'
METADATA_TO_READ = ['size']
NAME = "dupeGuru"
METADATA_TO_READ = ["size"]
def __init__(self):
DupeGuruBase.__init__(self, DupeGuruView())
self.appdata = '/tmp'
self.appdata = "/tmp"
self._recreate_result_table()
def _prioritization_categories(self):
@ -78,7 +83,7 @@ class NamedObject:
def __init__(self, name="foobar", with_words=False, size=1, folder=None):
self.name = name
if folder is None:
folder = 'basepath'
folder = "basepath"
self._folder = Path(folder)
self.size = size
self.md5partial = name
@ -88,7 +93,7 @@ class NamedObject:
self.is_ref = False
def __bool__(self):
return False #Make sure that operations behave correctly when the bool value of files is false.
return False  # Make sure that operations behave correctly when the bool value of files is false.
def get_display_info(self, group, delta):
size = self.size
@ -97,10 +102,10 @@ class NamedObject:
r = group.ref
size -= r.size
return {
'name': self.name,
'folder_path': str(self.folder_path),
'size': format_size(size, 0, 1, False),
'extension': self.extension if hasattr(self, 'extension') else '---',
"name": self.name,
"folder_path": str(self.folder_path),
"size": format_size(size, 0, 1, False),
"extension": self.extension if hasattr(self, "extension") else "---",
}
@property
@ -115,6 +120,7 @@ class NamedObject:
def extension(self):
return get_file_ext(self.name)
# Returns a group set that looks like this:
# "foo bar" (1)
# "bar bleh" (1024)
@ -127,21 +133,24 @@ def GetTestGroups():
NamedObject("bar bleh"),
NamedObject("foo bleh"),
NamedObject("ibabtu"),
NamedObject("ibabtu")
NamedObject("ibabtu"),
]
objects[1].size = 1024
matches = engine.getmatches(objects) #we should have 5 matches
groups = engine.get_groups(matches) #We should have 2 groups
matches = engine.getmatches(objects) # we should have 5 matches
groups = engine.get_groups(matches) # We should have 2 groups
for g in groups:
g.prioritize(lambda x: objects.index(x)) #We want the dupes to be in the same order as the list is
groups.sort(key=len, reverse=True) # We want the group with 3 members to be first.
g.prioritize(
lambda x: objects.index(x)
) # We want the dupes to be in the same order as the list is
groups.sort(key=len, reverse=True) # We want the group with 3 members to be first.
return (objects, matches, groups)
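Consumers of this fixture can rely on the resulting shape; for instance (len() works on groups, as the sort above shows):

objects, matches, groups = GetTestGroups()
len(groups)     # 2
len(groups[0])  # 3: "foo bar", "bar bleh", "foo bleh"
len(groups[1])  # 2: the two "ibabtu" files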
class TestApp(TestAppBase):
def __init__(self):
def link_gui(gui):
gui.view = self.make_logger()
if hasattr(gui, 'columns'): # tables
if hasattr(gui, "columns"): # tables
gui.columns.view = self.make_logger()
return gui
@ -166,7 +175,7 @@ class TestApp(TestAppBase):
# rtable is a property because its instance can be replaced during execution
return self.app.result_table
#--- Helpers
# --- Helpers
def select_pri_criterion(self, name):
# Select a main prioritize criterion by name instead of by index. Makes tests more
# maintainable.

View File

@ -13,13 +13,18 @@ try:
except ImportError:
skip("Can't import the block module, probably hasn't been compiled.")
def my_avgdiff(first, second, limit=768, min_iter=3): # this is so I don't have to re-write every call
def my_avgdiff(
first, second, limit=768, min_iter=3
): # this is so I don't have to re-write every call
return avgdiff(first, second, limit, min_iter)
BLACK = (0, 0, 0)
RED = (0xff, 0, 0)
GREEN = (0, 0xff, 0)
BLUE = (0, 0, 0xff)
RED = (0xFF, 0, 0)
GREEN = (0, 0xFF, 0)
BLUE = (0, 0, 0xFF)
class FakeImage:
def __init__(self, size, data):
@ -37,16 +42,20 @@ class FakeImage:
pixels.append(pixel)
return FakeImage((box[2] - box[0], box[3] - box[1]), pixels)
def empty():
return FakeImage((0, 0), [])
def single_pixel(): #one red pixel
return FakeImage((1, 1), [(0xff, 0, 0)])
def single_pixel(): # one red pixel
return FakeImage((1, 1), [(0xFF, 0, 0)])
def four_pixels():
pixels = [RED, (0, 0x80, 0xff), (0x80, 0, 0), (0, 0x40, 0x80)]
pixels = [RED, (0, 0x80, 0xFF), (0x80, 0, 0), (0, 0x40, 0x80)]
return FakeImage((2, 2), pixels)
class TestCasegetblock:
def test_single_pixel(self):
im = single_pixel()
@ -60,9 +69,9 @@ class TestCasegetblock:
def test_four_pixels(self):
im = four_pixels()
[b] = getblocks2(im, 1)
meanred = (0xff + 0x80) // 4
meanred = (0xFF + 0x80) // 4
meangreen = (0x80 + 0x40) // 4
meanblue = (0xff + 0x80) // 4
meanblue = (0xFF + 0x80) // 4
eq_((meanred, meangreen, meanblue), b)
@ -158,6 +167,7 @@ class TestCasegetblock:
# eq_(BLACK, blocks[3])
#
class TestCasegetblocks2:
def test_empty_image(self):
im = empty()
@ -169,9 +179,9 @@ class TestCasegetblocks2:
blocks = getblocks2(im, 1)
eq_(1, len(blocks))
block = blocks[0]
meanred = (0xff + 0x80) // 4
meanred = (0xFF + 0x80) // 4
meangreen = (0x80 + 0x40) // 4
meanblue = (0xff + 0x80) // 4
meanblue = (0xFF + 0x80) // 4
eq_((meanred, meangreen, meanblue), block)
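The expected means are just per-channel averages over the four pixels with integer division; spelled out with the same values as four_pixels():

pixels = [(0xFF, 0, 0), (0, 0x80, 0xFF), (0x80, 0, 0), (0, 0x40, 0x80)]
meanred = sum(p[0] for p in pixels) // len(pixels)    # (0xFF + 0x80) // 4 == 95
meangreen = sum(p[1] for p in pixels) // len(pixels)  # (0x80 + 0x40) // 4 == 48
meanblue = sum(p[2] for p in pixels) // len(pixels)   # (0xFF + 0x80) // 4 == 95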
def test_four_blocks_all_black(self):
@ -225,25 +235,25 @@ class TestCaseavgdiff:
my_avgdiff([b, b], [b])
def test_first_arg_is_empty_but_not_second(self):
#Don't return 0 (as when the 2 lists are empty), raise!
# Don't return 0 (as when the 2 lists are empty), raise!
b = (0, 0, 0)
with raises(DifferentBlockCountError):
my_avgdiff([], [b])
def test_limit(self):
ref = (0, 0, 0)
b1 = (10, 10, 10) #avg 30
b2 = (20, 20, 20) #avg 45
b3 = (30, 30, 30) #avg 60
b1 = (10, 10, 10) # avg 30
b2 = (20, 20, 20) # avg 45
b3 = (30, 30, 30) # avg 60
blocks1 = [ref, ref, ref]
blocks2 = [b1, b2, b3]
eq_(45, my_avgdiff(blocks1, blocks2, 44))
def test_min_iterations(self):
ref = (0, 0, 0)
b1 = (10, 10, 10) #avg 30
b2 = (20, 20, 20) #avg 45
b3 = (10, 10, 10) #avg 40
b1 = (10, 10, 10) # avg 30
b2 = (20, 20, 20) # avg 45
b3 = (10, 10, 10) # avg 40
blocks1 = [ref, ref, ref]
blocks2 = [b1, b2, b3]
eq_(40, my_avgdiff(blocks1, blocks2, 45 - 1, 3))

View File

@ -16,34 +16,35 @@ try:
except ImportError:
skip("Can't import the cache module, probably hasn't been compiled.")
class TestCasecolors_to_string:
def test_no_color(self):
eq_('', colors_to_string([]))
eq_("", colors_to_string([]))
def test_single_color(self):
eq_('000000', colors_to_string([(0, 0, 0)]))
eq_('010101', colors_to_string([(1, 1, 1)]))
eq_('0a141e', colors_to_string([(10, 20, 30)]))
eq_("000000", colors_to_string([(0, 0, 0)]))
eq_("010101", colors_to_string([(1, 1, 1)]))
eq_("0a141e", colors_to_string([(10, 20, 30)]))
def test_two_colors(self):
eq_('000102030405', colors_to_string([(0, 1, 2), (3, 4, 5)]))
eq_("000102030405", colors_to_string([(0, 1, 2), (3, 4, 5)]))
class TestCasestring_to_colors:
def test_empty(self):
eq_([], string_to_colors(''))
eq_([], string_to_colors(""))
def test_single_color(self):
eq_([(0, 0, 0)], string_to_colors('000000'))
eq_([(2, 3, 4)], string_to_colors('020304'))
eq_([(10, 20, 30)], string_to_colors('0a141e'))
eq_([(0, 0, 0)], string_to_colors("000000"))
eq_([(2, 3, 4)], string_to_colors("020304"))
eq_([(10, 20, 30)], string_to_colors("0a141e"))
def test_two_colors(self):
eq_([(10, 20, 30), (40, 50, 60)], string_to_colors('0a141e28323c'))
eq_([(10, 20, 30), (40, 50, 60)], string_to_colors("0a141e28323c"))
def test_incomplete_color(self):
# don't return anything if it's not a complete color
eq_([], string_to_colors('102'))
eq_([], string_to_colors("102"))
class BaseTestCaseCache:
@ -54,58 +55,58 @@ class BaseTestCaseCache:
c = self.get_cache()
eq_(0, len(c))
with raises(KeyError):
c['foo']
c["foo"]
def test_set_then_retrieve_blocks(self):
c = self.get_cache()
b = [(0, 0, 0), (1, 2, 3)]
c['foo'] = b
eq_(b, c['foo'])
c["foo"] = b
eq_(b, c["foo"])
def test_delitem(self):
c = self.get_cache()
c['foo'] = ''
del c['foo']
assert 'foo' not in c
c["foo"] = ""
del c["foo"]
assert "foo" not in c
with raises(KeyError):
del c['foo']
del c["foo"]
def test_persistance(self, tmpdir):
DBNAME = tmpdir.join('hstest.db')
DBNAME = tmpdir.join("hstest.db")
c = self.get_cache(str(DBNAME))
c['foo'] = [(1, 2, 3)]
c["foo"] = [(1, 2, 3)]
del c
c = self.get_cache(str(DBNAME))
eq_([(1, 2, 3)], c['foo'])
eq_([(1, 2, 3)], c["foo"])
def test_filter(self):
c = self.get_cache()
c['foo'] = ''
c['bar'] = ''
c['baz'] = ''
c.filter(lambda p: p != 'bar') #only 'bar' is removed
c["foo"] = ""
c["bar"] = ""
c["baz"] = ""
c.filter(lambda p: p != "bar") # only 'bar' is removed
eq_(2, len(c))
assert 'foo' in c
assert 'baz' in c
assert 'bar' not in c
assert "foo" in c
assert "baz" in c
assert "bar" not in c
def test_clear(self):
c = self.get_cache()
c['foo'] = ''
c['bar'] = ''
c['baz'] = ''
c["foo"] = ""
c["bar"] = ""
c["baz"] = ""
c.clear()
eq_(0, len(c))
assert 'foo' not in c
assert 'baz' not in c
assert 'bar' not in c
assert "foo" not in c
assert "baz" not in c
assert "bar" not in c
def test_by_id(self):
# it's possible to use the cache by referring to the files by their row_id
c = self.get_cache()
b = [(0, 0, 0), (1, 2, 3)]
c['foo'] = b
foo_id = c.get_id('foo')
c["foo"] = b
foo_id = c.get_id("foo")
eq_(c[foo_id], b)
@ -120,16 +121,16 @@ class TestCaseSqliteCache(BaseTestCaseCache):
# If we don't do this monkeypatching, we get a weird exception about trying to flush a
# closed file. I've tried setting logging level and stuff, but nothing worked. So, there we
# go, a dirty monkeypatch.
monkeypatch.setattr(logging, 'warning', lambda *args, **kw: None)
dbname = str(tmpdir.join('foo.db'))
fp = open(dbname, 'w')
fp.write('invalid sqlite content')
monkeypatch.setattr(logging, "warning", lambda *args, **kw: None)
dbname = str(tmpdir.join("foo.db"))
fp = open(dbname, "w")
fp.write("invalid sqlite content")
fp.close()
c = self.get_cache(dbname) # should not raise a DatabaseError
c['foo'] = [(1, 2, 3)]
c = self.get_cache(dbname) # should not raise a DatabaseError
c["foo"] = [(1, 2, 3)]
del c
c = self.get_cache(dbname)
eq_(c['foo'], [(1, 2, 3)])
eq_(c["foo"], [(1, 2, 3)])
class TestCaseShelveCache(BaseTestCaseCache):
@ -161,4 +162,3 @@ class TestCaseCacheSQLEscape:
del c["foo'bar"]
except KeyError:
assert False

View File

@ -1 +1 @@
from hscommon.testutil import pytest_funcarg__app # noqa
from hscommon.testutil import pytest_funcarg__app # noqa

View File

@ -14,91 +14,105 @@ from hscommon.path import Path
from hscommon.testutil import eq_
from ..fs import File
from ..directories import Directories, DirectoryState, AlreadyThereError, InvalidPathError
from ..directories import (
Directories,
DirectoryState,
AlreadyThereError,
InvalidPathError,
)
def create_fake_fs(rootpath):
# We have it as a separate function because other units are using it.
rootpath = rootpath['fs']
rootpath = rootpath["fs"]
rootpath.mkdir()
rootpath['dir1'].mkdir()
rootpath['dir2'].mkdir()
rootpath['dir3'].mkdir()
fp = rootpath['file1.test'].open('w')
fp.write('1')
rootpath["dir1"].mkdir()
rootpath["dir2"].mkdir()
rootpath["dir3"].mkdir()
fp = rootpath["file1.test"].open("w")
fp.write("1")
fp.close()
fp = rootpath['file2.test'].open('w')
fp.write('12')
fp = rootpath["file2.test"].open("w")
fp.write("12")
fp.close()
fp = rootpath['file3.test'].open('w')
fp.write('123')
fp = rootpath["file3.test"].open("w")
fp.write("123")
fp.close()
fp = rootpath['dir1']['file1.test'].open('w')
fp.write('1')
fp = rootpath["dir1"]["file1.test"].open("w")
fp.write("1")
fp.close()
fp = rootpath['dir2']['file2.test'].open('w')
fp.write('12')
fp = rootpath["dir2"]["file2.test"].open("w")
fp.write("12")
fp.close()
fp = rootpath['dir3']['file3.test'].open('w')
fp.write('123')
fp = rootpath["dir3"]["file3.test"].open("w")
fp.write("123")
fp.close()
return rootpath
testpath = None
def setup_module(module):
# In this unit, we have tests depending on two directory structures: one with only one file in it
# and another with a more complex structure.
testpath = Path(tempfile.mkdtemp())
module.testpath = testpath
rootpath = testpath['onefile']
rootpath = testpath["onefile"]
rootpath.mkdir()
fp = rootpath['test.txt'].open('w')
fp.write('test_data')
fp = rootpath["test.txt"].open("w")
fp.write("test_data")
fp.close()
create_fake_fs(testpath)
def teardown_module(module):
shutil.rmtree(str(module.testpath))
def test_empty():
d = Directories()
eq_(len(d), 0)
assert 'foobar' not in d
assert "foobar" not in d
def test_add_path():
d = Directories()
p = testpath['onefile']
p = testpath["onefile"]
d.add_path(p)
eq_(1, len(d))
assert p in d
assert (p['foobar']) in d
assert (p["foobar"]) in d
assert p.parent() not in d
p = testpath['fs']
p = testpath["fs"]
d.add_path(p)
eq_(2, len(d))
assert p in d
def test_AddPath_when_path_is_already_there():
d = Directories()
p = testpath['onefile']
p = testpath["onefile"]
d.add_path(p)
with raises(AlreadyThereError):
d.add_path(p)
with raises(AlreadyThereError):
d.add_path(p['foobar'])
d.add_path(p["foobar"])
eq_(1, len(d))
def test_add_path_containing_paths_already_there():
d = Directories()
d.add_path(testpath['onefile'])
d.add_path(testpath["onefile"])
eq_(1, len(d))
d.add_path(testpath)
eq_(len(d), 1)
eq_(d[0], testpath)
def test_AddPath_non_latin(tmpdir):
p = Path(str(tmpdir))
to_add = p['unicode\u201a']
to_add = p["unicode\u201a"]
os.mkdir(str(to_add))
d = Directories()
try:
@ -106,63 +120,69 @@ def test_AddPath_non_latin(tmpdir):
except UnicodeDecodeError:
assert False
def test_del():
d = Directories()
d.add_path(testpath['onefile'])
d.add_path(testpath["onefile"])
try:
del d[1]
assert False
except IndexError:
pass
d.add_path(testpath['fs'])
d.add_path(testpath["fs"])
del d[1]
eq_(1, len(d))
def test_states():
d = Directories()
p = testpath['onefile']
p = testpath["onefile"]
d.add_path(p)
eq_(DirectoryState.Normal, d.get_state(p))
d.set_state(p, DirectoryState.Reference)
eq_(DirectoryState.Reference, d.get_state(p))
eq_(DirectoryState.Reference, d.get_state(p['dir1']))
eq_(DirectoryState.Reference, d.get_state(p["dir1"]))
eq_(1, len(d.states))
eq_(p, list(d.states.keys())[0])
eq_(DirectoryState.Reference, d.states[p])
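The assertion on p["dir1"] shows that states are inherited: only the parent is in d.states, yet the child reports Reference. A minimal sketch of such a lookup, walking up the path (an assumption about the mechanism; the real Directories class may implement it differently):

import os.path as op

def get_state(states, path, default="normal"):
    # Exact entry wins; otherwise inherit from the nearest configured ancestor.
    while True:
        if path in states:
            return states[path]
        parent = op.dirname(path)
        if parent == path:  # reached the filesystem root
            return default
        path = parent

states = {"/photos": "reference"}
get_state(states, "/photos/dir1")  # -> "reference", inherited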
def test_get_state_with_path_not_there():
# When the path's not there, just return DirectoryState.Normal
d = Directories()
d.add_path(testpath['onefile'])
d.add_path(testpath["onefile"])
eq_(d.get_state(testpath), DirectoryState.Normal)
def test_states_overwritten_when_larger_directory_eat_smaller_ones():
# ref #248
# When setting the state of a folder, we overwrite previously set states for subfolders.
d = Directories()
p = testpath['onefile']
p = testpath["onefile"]
d.add_path(p)
d.set_state(p, DirectoryState.Excluded)
d.add_path(testpath)
d.set_state(testpath, DirectoryState.Reference)
eq_(d.get_state(p), DirectoryState.Reference)
eq_(d.get_state(p['dir1']), DirectoryState.Reference)
eq_(d.get_state(p["dir1"]), DirectoryState.Reference)
eq_(d.get_state(testpath), DirectoryState.Reference)
def test_get_files():
d = Directories()
p = testpath['fs']
p = testpath["fs"]
d.add_path(p)
d.set_state(p['dir1'], DirectoryState.Reference)
d.set_state(p['dir2'], DirectoryState.Excluded)
d.set_state(p["dir1"], DirectoryState.Reference)
d.set_state(p["dir2"], DirectoryState.Excluded)
files = list(d.get_files())
eq_(5, len(files))
for f in files:
if f.path.parent() == p['dir1']:
if f.path.parent() == p["dir1"]:
assert f.is_ref
else:
assert not f.is_ref
def test_get_files_with_folders():
# When fileclasses handle folders, return them and stop recursing!
class FakeFile(File):
@ -171,106 +191,115 @@ def test_get_files_with_folders():
return True
d = Directories()
p = testpath['fs']
p = testpath["fs"]
d.add_path(p)
files = list(d.get_files(fileclasses=[FakeFile]))
# We have the 3 root files and the 3 root dirs
eq_(6, len(files))
def test_get_folders():
d = Directories()
p = testpath['fs']
p = testpath["fs"]
d.add_path(p)
d.set_state(p['dir1'], DirectoryState.Reference)
d.set_state(p['dir2'], DirectoryState.Excluded)
d.set_state(p["dir1"], DirectoryState.Reference)
d.set_state(p["dir2"], DirectoryState.Excluded)
folders = list(d.get_folders())
eq_(len(folders), 3)
ref = [f for f in folders if f.is_ref]
not_ref = [f for f in folders if not f.is_ref]
eq_(len(ref), 1)
eq_(ref[0].path, p['dir1'])
eq_(ref[0].path, p["dir1"])
eq_(len(not_ref), 2)
eq_(ref[0].size, 1)
def test_get_files_with_inherited_exclusion():
d = Directories()
p = testpath['onefile']
p = testpath["onefile"]
d.add_path(p)
d.set_state(p, DirectoryState.Excluded)
eq_([], list(d.get_files()))
def test_save_and_load(tmpdir):
d1 = Directories()
d2 = Directories()
p1 = Path(str(tmpdir.join('p1')))
p1 = Path(str(tmpdir.join("p1")))
p1.mkdir()
p2 = Path(str(tmpdir.join('p2')))
p2 = Path(str(tmpdir.join("p2")))
p2.mkdir()
d1.add_path(p1)
d1.add_path(p2)
d1.set_state(p1, DirectoryState.Reference)
d1.set_state(p1['dir1'], DirectoryState.Excluded)
tmpxml = str(tmpdir.join('directories_testunit.xml'))
d1.set_state(p1["dir1"], DirectoryState.Excluded)
tmpxml = str(tmpdir.join("directories_testunit.xml"))
d1.save_to_file(tmpxml)
d2.load_from_file(tmpxml)
eq_(2, len(d2))
eq_(DirectoryState.Reference, d2.get_state(p1))
eq_(DirectoryState.Excluded, d2.get_state(p1['dir1']))
eq_(DirectoryState.Excluded, d2.get_state(p1["dir1"]))
def test_invalid_path():
d = Directories()
p = Path('does_not_exist')
p = Path("does_not_exist")
with raises(InvalidPathError):
d.add_path(p)
eq_(0, len(d))
def test_set_state_on_invalid_path():
d = Directories()
try:
d.set_state(Path('foobar',), DirectoryState.Normal)
d.set_state(Path("foobar",), DirectoryState.Normal)
except LookupError:
assert False
def test_load_from_file_with_invalid_path(tmpdir):
#This test simulates a load from file resulting in an
#InvalidPath raise. Other directories must be loaded.
# This test simulates a load from file resulting in an
# InvalidPath raise. Other directories must be loaded.
d1 = Directories()
d1.add_path(testpath['onefile'])
#Will raise InvalidPath upon loading
p = Path(str(tmpdir.join('toremove')))
d1.add_path(testpath["onefile"])
# Will raise InvalidPath upon loading
p = Path(str(tmpdir.join("toremove")))
p.mkdir()
d1.add_path(p)
p.rmdir()
tmpxml = str(tmpdir.join('directories_testunit.xml'))
tmpxml = str(tmpdir.join("directories_testunit.xml"))
d1.save_to_file(tmpxml)
d2 = Directories()
d2.load_from_file(tmpxml)
eq_(1, len(d2))
def test_unicode_save(tmpdir):
d = Directories()
p1 = Path(str(tmpdir))['hello\xe9']
p1 = Path(str(tmpdir))["hello\xe9"]
p1.mkdir()
p1['foo\xe9'].mkdir()
p1["foo\xe9"].mkdir()
d.add_path(p1)
d.set_state(p1['foo\xe9'], DirectoryState.Excluded)
tmpxml = str(tmpdir.join('directories_testunit.xml'))
d.set_state(p1["foo\xe9"], DirectoryState.Excluded)
tmpxml = str(tmpdir.join("directories_testunit.xml"))
try:
d.save_to_file(tmpxml)
except UnicodeDecodeError:
assert False
def test_get_files_refreshes_its_directories():
d = Directories()
p = testpath['fs']
p = testpath["fs"]
d.add_path(p)
files = d.get_files()
eq_(6, len(list(files)))
time.sleep(1)
os.remove(str(p['dir1']['file1.test']))
os.remove(str(p["dir1"]["file1.test"]))
files = d.get_files()
eq_(5, len(list(files)))
def test_get_files_does_not_choke_on_non_existing_directories(tmpdir):
d = Directories()
p = Path(str(tmpdir))
@ -278,36 +307,37 @@ def test_get_files_does_not_choke_on_non_existing_directories(tmpdir):
p.rmtree()
eq_([], list(d.get_files()))
def test_get_state_returns_excluded_by_default_for_hidden_directories(tmpdir):
d = Directories()
p = Path(str(tmpdir))
hidden_dir_path = p['.foo']
p['.foo'].mkdir()
hidden_dir_path = p[".foo"]
p[".foo"].mkdir()
d.add_path(p)
eq_(d.get_state(hidden_dir_path), DirectoryState.Excluded)
# But it can be overridden
d.set_state(hidden_dir_path, DirectoryState.Normal)
eq_(d.get_state(hidden_dir_path), DirectoryState.Normal)
def test_default_path_state_override(tmpdir):
# It's possible for a subclass to override the default state of a path
class MyDirectories(Directories):
def _default_state_for_path(self, path):
if 'foobar' in path:
if "foobar" in path:
return DirectoryState.Excluded
d = MyDirectories()
p1 = Path(str(tmpdir))
p1['foobar'].mkdir()
p1['foobar/somefile'].open('w').close()
p1['foobaz'].mkdir()
p1['foobaz/somefile'].open('w').close()
p1["foobar"].mkdir()
p1["foobar/somefile"].open("w").close()
p1["foobaz"].mkdir()
p1["foobaz/somefile"].open("w").close()
d.add_path(p1)
eq_(d.get_state(p1['foobaz']), DirectoryState.Normal)
eq_(d.get_state(p1['foobar']), DirectoryState.Excluded)
eq_(len(list(d.get_files())), 1) # only the 'foobaz' file is there
eq_(d.get_state(p1["foobaz"]), DirectoryState.Normal)
eq_(d.get_state(p1["foobar"]), DirectoryState.Excluded)
eq_(len(list(d.get_files())), 1) # only the 'foobaz' file is there
# However, the default state can be changed
d.set_state(p1['foobar'], DirectoryState.Normal)
eq_(d.get_state(p1['foobar']), DirectoryState.Normal)
d.set_state(p1["foobar"], DirectoryState.Normal)
eq_(d.get_state(p1["foobar"]), DirectoryState.Normal)
eq_(len(list(d.get_files())), 2)

View File

@ -13,13 +13,28 @@ from hscommon.testutil import eq_, log_calls
from .base import NamedObject
from .. import engine
from ..engine import (
get_match, getwords, Group, getfields, unpack_fields, compare_fields, compare, WEIGHT_WORDS,
MATCH_SIMILAR_WORDS, NO_FIELD_ORDER, build_word_dict, get_groups, getmatches, Match,
getmatches_by_contents, merge_similar_words, reduce_common_words
get_match,
getwords,
Group,
getfields,
unpack_fields,
compare_fields,
compare,
WEIGHT_WORDS,
MATCH_SIMILAR_WORDS,
NO_FIELD_ORDER,
build_word_dict,
get_groups,
getmatches,
Match,
getmatches_by_contents,
merge_similar_words,
reduce_common_words,
)
no = NamedObject
def get_match_triangle():
o1 = NamedObject(with_words=True)
o2 = NamedObject(with_words=True)
@ -29,6 +44,7 @@ def get_match_triangle():
m3 = get_match(o2, o3)
return [m1, m2, m3]
def get_test_group():
m1, m2, m3 = get_match_triangle()
result = Group()
@ -37,6 +53,7 @@ def get_test_group():
result.add_match(m3)
return result
def assert_match(m, name1, name2):
# When testing matches, whether objects are in first or second position very often doesn't
# matter. This function makes this test more convenient.
@ -46,53 +63,54 @@ def assert_match(m, name1, name2):
eq_(m.first.name, name2)
eq_(m.second.name, name1)
class TestCasegetwords:
def test_spaces(self):
eq_(['a', 'b', 'c', 'd'], getwords("a b c d"))
eq_(['a', 'b', 'c', 'd'], getwords(" a b c d "))
eq_(["a", "b", "c", "d"], getwords("a b c d"))
eq_(["a", "b", "c", "d"], getwords(" a b c d "))
def test_splitter_chars(self):
eq_(
[chr(i) for i in range(ord('a'), ord('z')+1)],
getwords("a-b_c&d+e(f)g;h\\i[j]k{l}m:n.o,p<q>r/s?t~u!v@w#x$y*z")
[chr(i) for i in range(ord("a"), ord("z") + 1)],
getwords("a-b_c&d+e(f)g;h\\i[j]k{l}m:n.o,p<q>r/s?t~u!v@w#x$y*z"),
)
def test_joiner_chars(self):
eq_(["aec"], getwords("a'e\u0301c"))
def test_empty(self):
eq_([], getwords(''))
eq_([], getwords(""))
def test_returns_lowercase(self):
eq_(['foo', 'bar'], getwords('FOO BAR'))
eq_(["foo", "bar"], getwords("FOO BAR"))
def test_decompose_unicode(self):
eq_(getwords('foo\xe9bar'), ['fooebar'])
eq_(getwords("foo\xe9bar"), ["fooebar"])
class TestCasegetfields:
def test_simple(self):
eq_([['a', 'b'], ['c', 'd', 'e']], getfields('a b - c d e'))
eq_([["a", "b"], ["c", "d", "e"]], getfields("a b - c d e"))
def test_empty(self):
eq_([], getfields(''))
eq_([], getfields(""))
def test_cleans_empty_fields(self):
expected = [['a', 'bc', 'def']]
actual = getfields(' - a bc def')
expected = [["a", "bc", "def"]]
actual = getfields(" - a bc def")
eq_(expected, actual)
expected = [['bc', 'def']]
expected = [["bc", "def"]]
class TestCaseunpack_fields:
def test_with_fields(self):
expected = ['a', 'b', 'c', 'd', 'e', 'f']
actual = unpack_fields([['a'], ['b', 'c'], ['d', 'e', 'f']])
expected = ["a", "b", "c", "d", "e", "f"]
actual = unpack_fields([["a"], ["b", "c"], ["d", "e", "f"]])
eq_(expected, actual)
def test_without_fields(self):
expected = ['a', 'b', 'c', 'd', 'e', 'f']
actual = unpack_fields(['a', 'b', 'c', 'd', 'e', 'f'])
expected = ["a", "b", "c", "d", "e", "f"]
actual = unpack_fields(["a", "b", "c", "d", "e", "f"])
eq_(expected, actual)
def test_empty(self):
@ -101,134 +119,151 @@ class TestCaseunpack_fields:
class TestCaseWordCompare:
def test_list(self):
eq_(100, compare(['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']))
eq_(86, compare(['a', 'b', 'c', 'd'], ['a', 'b', 'c']))
eq_(100, compare(["a", "b", "c", "d"], ["a", "b", "c", "d"]))
eq_(86, compare(["a", "b", "c", "d"], ["a", "b", "c"]))
def test_unordered(self):
#Sometimes, users don't want fuzzy matching too much When they set the slider
#to 100, they don't expect a filename with the same words, but not the same order, to match.
#Thus, we want to return 99 in that case.
eq_(99, compare(['a', 'b', 'c', 'd'], ['d', 'b', 'c', 'a']))
# Sometimes, users don't want fuzzy matching too much. When they set the slider
# to 100, they don't expect a filename with the same words, but not the same order, to match.
# Thus, we want to return 99 in that case.
eq_(99, compare(["a", "b", "c", "d"], ["d", "b", "c", "a"]))
def test_word_occurs_twice(self):
#if a word occurs twice in first, but once in second, we want the word to be only counted once
eq_(89, compare(['a', 'b', 'c', 'd', 'a'], ['d', 'b', 'c', 'a']))
# If a word occurs twice in first but once in second, we want the word to be counted only once
eq_(89, compare(["a", "b", "c", "d", "a"], ["d", "b", "c", "a"]))
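The asserted values fit a Dice-style ratio, round(2 * matched / (len(first) + len(second)) * 100), with a word repeated on one side counted once: round(2 * 3 / (4 + 3) * 100) = 86 for the three-of-four case above, and round(2 * 4 / (5 + 4) * 100) = 89 here. The formula is inferred from the assertions, not quoted from the implementation.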
def test_uses_copy_of_lists(self):
first = ['foo', 'bar']
second = ['bar', 'bleh']
first = ["foo", "bar"]
second = ["bar", "bleh"]
compare(first, second)
eq_(['foo', 'bar'], first)
eq_(['bar', 'bleh'], second)
eq_(["foo", "bar"], first)
eq_(["bar", "bleh"], second)
def test_word_weight(self):
eq_(int((6.0 / 13.0) * 100), compare(['foo', 'bar'], ['bar', 'bleh'], (WEIGHT_WORDS, )))
eq_(
int((6.0 / 13.0) * 100),
compare(["foo", "bar"], ["bar", "bleh"], (WEIGHT_WORDS,)),
)
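The 6/13 is plausibly letter weighting: the shared word "bar" contributes its length on both sides (3 + 3 = 6), while the total letter count across both lists is 3 + 3 + 3 + 4 = 13, hence int(6.0 / 13.0 * 100) = 46.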
def test_similar_words(self):
eq_(100, compare(['the', 'white', 'stripes'], ['the', 'whites', 'stripe'], (MATCH_SIMILAR_WORDS, )))
eq_(
100,
compare(
["the", "white", "stripes"],
["the", "whites", "stripe"],
(MATCH_SIMILAR_WORDS,),
),
)
def test_empty(self):
eq_(0, compare([], []))
def test_with_fields(self):
eq_(67, compare([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]))
eq_(67, compare([["a", "b"], ["c", "d", "e"]], [["a", "b"], ["c", "d", "f"]]))
def test_propagate_flags_with_fields(self, monkeypatch):
def mock_compare(first, second, flags):
eq_((0, 1, 2, 3, 5), flags)
monkeypatch.setattr(engine, 'compare_fields', mock_compare)
compare([['a']], [['a']], (0, 1, 2, 3, 5))
monkeypatch.setattr(engine, "compare_fields", mock_compare)
compare([["a"]], [["a"]], (0, 1, 2, 3, 5))
class TestCaseWordCompareWithFields:
def test_simple(self):
eq_(67, compare_fields([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]))
eq_(
67,
compare_fields(
[["a", "b"], ["c", "d", "e"]], [["a", "b"], ["c", "d", "f"]]
),
)
def test_empty(self):
eq_(0, compare_fields([], []))
def test_different_length(self):
eq_(0, compare_fields([['a'], ['b']], [['a'], ['b'], ['c']]))
eq_(0, compare_fields([["a"], ["b"]], [["a"], ["b"], ["c"]]))
def test_propagates_flags(self, monkeypatch):
def mock_compare(first, second, flags):
eq_((0, 1, 2, 3, 5), flags)
monkeypatch.setattr(engine, 'compare_fields', mock_compare)
compare_fields([['a']], [['a']], (0, 1, 2, 3, 5))
monkeypatch.setattr(engine, "compare_fields", mock_compare)
compare_fields([["a"]], [["a"]], (0, 1, 2, 3, 5))
def test_order(self):
first = [['a', 'b'], ['c', 'd', 'e']]
second = [['c', 'd', 'f'], ['a', 'b']]
first = [["a", "b"], ["c", "d", "e"]]
second = [["c", "d", "f"], ["a", "b"]]
eq_(0, compare_fields(first, second))
def test_no_order(self):
first = [['a', 'b'], ['c', 'd', 'e']]
second = [['c', 'd', 'f'], ['a', 'b']]
eq_(67, compare_fields(first, second, (NO_FIELD_ORDER, )))
first = [['a', 'b'], ['a', 'b']] #a field can only be matched once.
second = [['c', 'd', 'f'], ['a', 'b']]
eq_(0, compare_fields(first, second, (NO_FIELD_ORDER, )))
first = [['a', 'b'], ['a', 'b', 'c']]
second = [['c', 'd', 'f'], ['a', 'b']]
eq_(33, compare_fields(first, second, (NO_FIELD_ORDER, )))
first = [["a", "b"], ["c", "d", "e"]]
second = [["c", "d", "f"], ["a", "b"]]
eq_(67, compare_fields(first, second, (NO_FIELD_ORDER,)))
first = [["a", "b"], ["a", "b"]] # a field can only be matched once.
second = [["c", "d", "f"], ["a", "b"]]
eq_(0, compare_fields(first, second, (NO_FIELD_ORDER,)))
first = [["a", "b"], ["a", "b", "c"]]
second = [["c", "d", "f"], ["a", "b"]]
eq_(33, compare_fields(first, second, (NO_FIELD_ORDER,)))
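All of the assertions above fit one reading, inferred from the tests rather than the source: compare_fields scores each field pair separately and returns the minimum, and with NO_FIELD_ORDER every field of the second argument can be consumed by at most one field of the first. In the 33 case, ["a", "b"] consumes ["a", "b"] at 100, leaving ["a", "b", "c"] to score round(2 * 1 / (3 + 3) * 100) = 33 against ["c", "d", "f"], and the minimum is 33.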
def test_compare_fields_without_order_doesnt_alter_fields(self):
#The NO_ORDER comp type altered the fields!
first = [['a', 'b'], ['c', 'd', 'e']]
second = [['c', 'd', 'f'], ['a', 'b']]
eq_(67, compare_fields(first, second, (NO_FIELD_ORDER, )))
eq_([['a', 'b'], ['c', 'd', 'e']], first)
eq_([['c', 'd', 'f'], ['a', 'b']], second)
# The NO_FIELD_ORDER comparison type used to alter the fields!
first = [["a", "b"], ["c", "d", "e"]]
second = [["c", "d", "f"], ["a", "b"]]
eq_(67, compare_fields(first, second, (NO_FIELD_ORDER,)))
eq_([["a", "b"], ["c", "d", "e"]], first)
eq_([["c", "d", "f"], ["a", "b"]], second)
class TestCasebuild_word_dict:
def test_with_standard_words(self):
l = [NamedObject('foo bar', True)]
l.append(NamedObject('bar baz', True))
l.append(NamedObject('baz bleh foo', True))
d = build_word_dict(l)
itemList = [NamedObject("foo bar", True)]
itemList.append(NamedObject("bar baz", True))
itemList.append(NamedObject("baz bleh foo", True))
d = build_word_dict(itemList)
eq_(4, len(d))
eq_(2, len(d['foo']))
assert l[0] in d['foo']
assert l[2] in d['foo']
eq_(2, len(d['bar']))
assert l[0] in d['bar']
assert l[1] in d['bar']
eq_(2, len(d['baz']))
assert l[1] in d['baz']
assert l[2] in d['baz']
eq_(1, len(d['bleh']))
assert l[2] in d['bleh']
eq_(2, len(d["foo"]))
assert itemList[0] in d["foo"]
assert itemList[2] in d["foo"]
eq_(2, len(d["bar"]))
assert itemList[0] in d["bar"]
assert itemList[1] in d["bar"]
eq_(2, len(d["baz"]))
assert itemList[1] in d["baz"]
assert itemList[2] in d["baz"]
eq_(1, len(d["bleh"]))
assert itemList[2] in d["bleh"]
def test_unpack_fields(self):
o = NamedObject('')
o.words = [['foo', 'bar'], ['baz']]
o = NamedObject("")
o.words = [["foo", "bar"], ["baz"]]
d = build_word_dict([o])
eq_(3, len(d))
eq_(1, len(d['foo']))
eq_(1, len(d["foo"]))
def test_words_are_unaltered(self):
o = NamedObject('')
o.words = [['foo', 'bar'], ['baz']]
o = NamedObject("")
o.words = [["foo", "bar"], ["baz"]]
build_word_dict([o])
eq_([['foo', 'bar'], ['baz']], o.words)
eq_([["foo", "bar"], ["baz"]], o.words)
def test_object_instances_can_only_be_once_in_words_object_list(self):
o = NamedObject('foo foo', True)
o = NamedObject("foo foo", True)
d = build_word_dict([o])
eq_(1, len(d['foo']))
eq_(1, len(d["foo"]))
def test_job(self):
def do_progress(p, d=''):
def do_progress(p, d=""):
self.log.append(p)
return True
j = job.Job(1, do_progress)
self.log = []
s = "foo bar"
build_word_dict([NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j)
build_word_dict(
[NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j
)
# We don't have intermediate log entries because iter_with_progress is called with every > 1
eq_(0, self.log[0])
eq_(100, self.log[1])
@ -237,51 +272,56 @@ class TestCasebuild_word_dict:
class TestCasemerge_similar_words:
def test_some_similar_words(self):
d = {
'foobar': set([1]),
'foobar1': set([2]),
'foobar2': set([3]),
"foobar": set([1]),
"foobar1": set([2]),
"foobar2": set([3]),
}
merge_similar_words(d)
eq_(1, len(d))
eq_(3, len(d['foobar']))
eq_(3, len(d["foobar"]))
class TestCasereduce_common_words:
def test_typical(self):
d = {
'foo': set([NamedObject('foo bar', True) for i in range(50)]),
'bar': set([NamedObject('foo bar', True) for i in range(49)])
"foo": set([NamedObject("foo bar", True) for i in range(50)]),
"bar": set([NamedObject("foo bar", True) for i in range(49)]),
}
reduce_common_words(d, 50)
assert 'foo' not in d
eq_(49, len(d['bar']))
assert "foo" not in d
eq_(49, len(d["bar"]))
def test_dont_remove_objects_with_only_common_words(self):
d = {
'common': set([NamedObject("common uncommon", True) for i in range(50)] + [NamedObject("common", True)]),
'uncommon': set([NamedObject("common uncommon", True)])
"common": set(
[NamedObject("common uncommon", True) for i in range(50)]
+ [NamedObject("common", True)]
),
"uncommon": set([NamedObject("common uncommon", True)]),
}
reduce_common_words(d, 50)
eq_(1, len(d['common']))
eq_(1, len(d['uncommon']))
eq_(1, len(d["common"]))
eq_(1, len(d["uncommon"]))
def test_values_still_are_set_instances(self):
d = {
'common': set([NamedObject("common uncommon", True) for i in range(50)] + [NamedObject("common", True)]),
'uncommon': set([NamedObject("common uncommon", True)])
"common": set(
[NamedObject("common uncommon", True) for i in range(50)]
+ [NamedObject("common", True)]
),
"uncommon": set([NamedObject("common uncommon", True)]),
}
reduce_common_words(d, 50)
assert isinstance(d['common'], set)
assert isinstance(d['uncommon'], set)
assert isinstance(d["common"], set)
assert isinstance(d["uncommon"], set)
def test_dont_raise_KeyError_when_a_word_has_been_removed(self):
#If a word has been removed by the reduce, an object in a subsequent common word that
#contains the word that has been removed would cause a KeyError.
# If a word has been removed by the reduction, an object under a subsequent common word
# that contains the removed word would cause a KeyError.
d = {
'foo': set([NamedObject('foo bar baz', True) for i in range(50)]),
'bar': set([NamedObject('foo bar baz', True) for i in range(50)]),
'baz': set([NamedObject('foo bar baz', True) for i in range(49)])
"foo": set([NamedObject("foo bar baz", True) for i in range(50)]),
"bar": set([NamedObject("foo bar baz", True) for i in range(50)]),
"baz": set([NamedObject("foo bar baz", True) for i in range(49)]),
}
try:
reduce_common_words(d, 50)
@ -289,35 +329,37 @@ class TestCasereduce_common_words:
self.fail()
def test_unpack_fields(self):
#object.words may be fields.
# object.words may be fields.
def create_it():
o = NamedObject('')
o.words = [['foo', 'bar'], ['baz']]
o = NamedObject("")
o.words = [["foo", "bar"], ["baz"]]
return o
d = {
'foo': set([create_it() for i in range(50)])
}
d = {"foo": set([create_it() for i in range(50)])}
try:
reduce_common_words(d, 50)
except TypeError:
self.fail("must support fields.")
def test_consider_a_reduced_common_word_common_even_after_reduction(self):
#There was a bug in the code that causeda word that has already been reduced not to
#be counted as a common word for subsequent words. For example, if 'foo' is processed
#as a common word, keeping a "foo bar" file in it, and the 'bar' is processed, "foo bar"
#would not stay in 'bar' because 'foo' is not a common word anymore.
only_common = NamedObject('foo bar', True)
# There was a bug in the code that caused a word that has already been reduced not to
# be counted as a common word for subsequent words. For example, if 'foo' is processed
# as a common word, keeping a "foo bar" file in it, and the 'bar' is processed, "foo bar"
# would not stay in 'bar' because 'foo' is not a common word anymore.
only_common = NamedObject("foo bar", True)
d = {
'foo': set([NamedObject('foo bar baz', True) for i in range(49)] + [only_common]),
'bar': set([NamedObject('foo bar baz', True) for i in range(49)] + [only_common]),
'baz': set([NamedObject('foo bar baz', True) for i in range(49)])
"foo": set(
[NamedObject("foo bar baz", True) for i in range(49)] + [only_common]
),
"bar": set(
[NamedObject("foo bar baz", True) for i in range(49)] + [only_common]
),
"baz": set([NamedObject("foo bar baz", True) for i in range(49)]),
}
reduce_common_words(d, 50)
eq_(1, len(d['foo']))
eq_(1, len(d['bar']))
eq_(49, len(d['baz']))
eq_(1, len(d["foo"]))
eq_(1, len(d["bar"]))
eq_(49, len(d["baz"]))
class TestCaseget_match:
@ -326,8 +368,8 @@ class TestCaseget_match:
o2 = NamedObject("bar bleh", True)
m = get_match(o1, o2)
eq_(50, m.percentage)
eq_(['foo', 'bar'], m.first.words)
eq_(['bar', 'bleh'], m.second.words)
eq_(["foo", "bar"], m.first.words)
eq_(["bar", "bleh"], m.second.words)
assert m.first is o1
assert m.second is o2
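The 50 follows the same ratio as the word-compare tests, one shared word ("bar") out of two on each side, 2 * 1 / (2 + 2) * 100 = 50, and the identity asserts show that a Match keeps references to the original objects rather than copies.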
@ -340,7 +382,9 @@ class TestCaseget_match:
assert object() not in m
def test_word_weight(self):
m = get_match(NamedObject("foo bar", True), NamedObject("bar bleh", True), (WEIGHT_WORDS, ))
m = get_match(
NamedObject("foo bar", True), NamedObject("bar bleh", True), (WEIGHT_WORDS,)
)
eq_(m.percentage, int((6.0 / 13.0) * 100))
@ -349,54 +393,59 @@ class TestCaseGetMatches:
eq_(getmatches([]), [])
def test_simple(self):
l = [NamedObject("foo bar"), NamedObject("bar bleh"), NamedObject("a b c foo")]
r = getmatches(l)
itemList = [NamedObject("foo bar"), NamedObject("bar bleh"), NamedObject("a b c foo")]
r = getmatches(itemList)
eq_(2, len(r))
m = first(m for m in r if m.percentage == 50) #"foo bar" and "bar bleh"
assert_match(m, 'foo bar', 'bar bleh')
m = first(m for m in r if m.percentage == 33) #"foo bar" and "a b c foo"
assert_match(m, 'foo bar', 'a b c foo')
m = first(m for m in r if m.percentage == 50) # "foo bar" and "bar bleh"
assert_match(m, "foo bar", "bar bleh")
m = first(m for m in r if m.percentage == 33) # "foo bar" and "a b c foo"
assert_match(m, "foo bar", "a b c foo")
def test_null_and_unrelated_objects(self):
l = [NamedObject("foo bar"), NamedObject("bar bleh"), NamedObject(""), NamedObject("unrelated object")]
r = getmatches(l)
itemList = [
NamedObject("foo bar"),
NamedObject("bar bleh"),
NamedObject(""),
NamedObject("unrelated object"),
]
r = getmatches(itemList)
eq_(len(r), 1)
m = r[0]
eq_(m.percentage, 50)
assert_match(m, 'foo bar', 'bar bleh')
assert_match(m, "foo bar", "bar bleh")
def test_twice_the_same_word(self):
l = [NamedObject("foo foo bar"), NamedObject("bar bleh")]
r = getmatches(l)
itemList = [NamedObject("foo foo bar"), NamedObject("bar bleh")]
r = getmatches(itemList)
eq_(1, len(r))
def test_twice_the_same_word_when_preworded(self):
l = [NamedObject("foo foo bar", True), NamedObject("bar bleh", True)]
r = getmatches(l)
itemList = [NamedObject("foo foo bar", True), NamedObject("bar bleh", True)]
r = getmatches(itemList)
eq_(1, len(r))
def test_two_words_match(self):
l = [NamedObject("foo bar"), NamedObject("foo bar bleh")]
r = getmatches(l)
itemList = [NamedObject("foo bar"), NamedObject("foo bar bleh")]
r = getmatches(itemList)
eq_(1, len(r))
def test_match_files_with_only_common_words(self):
#If a word occurs more than 50 times, it is excluded from the matching process
#The problem with the common_word_threshold is that the files containing only common
#words will never be matched together. We *should* match them.
# If a word occurs more than 50 times, it is excluded from the matching process.
# The problem with the common_word_threshold is that the files containing only common
# words will never be matched together. We *should* match them.
# This test assumes that the common word threshold const is 50
l = [NamedObject("foo") for i in range(50)]
r = getmatches(l)
itemList = [NamedObject("foo") for i in range(50)]
r = getmatches(itemList)
eq_(1225, len(r))
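1225 is every possible pairing of the 50 identical files, C(50, 2) = 50 * 49 / 2 = 1225, which is the point of the test: files containing only common words still match each other.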
def test_use_words_already_there_if_there(self):
o1 = NamedObject('foo')
o2 = NamedObject('bar')
o2.words = ['foo']
o1 = NamedObject("foo")
o2 = NamedObject("bar")
o2.words = ["foo"]
eq_(1, len(getmatches([o1, o2])))
def test_job(self):
def do_progress(p, d=''):
def do_progress(p, d=""):
self.log.append(p)
return True
@ -409,28 +458,28 @@ class TestCaseGetMatches:
eq_(100, self.log[-1])
def test_weight_words(self):
l = [NamedObject("foo bar"), NamedObject("bar bleh")]
m = getmatches(l, weight_words=True)[0]
itemList = [NamedObject("foo bar"), NamedObject("bar bleh")]
m = getmatches(itemList, weight_words=True)[0]
eq_(int((6.0 / 13.0) * 100), m.percentage)
def test_similar_word(self):
l = [NamedObject("foobar"), NamedObject("foobars")]
eq_(len(getmatches(l, match_similar_words=True)), 1)
eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
l = [NamedObject("foobar"), NamedObject("foo")]
eq_(len(getmatches(l, match_similar_words=True)), 0) #too far
l = [NamedObject("bizkit"), NamedObject("bizket")]
eq_(len(getmatches(l, match_similar_words=True)), 1)
l = [NamedObject("foobar"), NamedObject("foosbar")]
eq_(len(getmatches(l, match_similar_words=True)), 1)
itemList = [NamedObject("foobar"), NamedObject("foobars")]
eq_(len(getmatches(itemList, match_similar_words=True)), 1)
eq_(getmatches(itemList, match_similar_words=True)[0].percentage, 100)
itemList = [NamedObject("foobar"), NamedObject("foo")]
eq_(len(getmatches(itemList, match_similar_words=True)), 0) # too far
itemList = [NamedObject("bizkit"), NamedObject("bizket")]
eq_(len(getmatches(itemList, match_similar_words=True)), 1)
itemList = [NamedObject("foobar"), NamedObject("foosbar")]
eq_(len(getmatches(itemList, match_similar_words=True)), 1)
def test_single_object_with_similar_words(self):
l = [NamedObject("foo foos")]
eq_(len(getmatches(l, match_similar_words=True)), 0)
itemList = [NamedObject("foo foos")]
eq_(len(getmatches(itemList, match_similar_words=True)), 0)
def test_double_words_get_counted_only_once(self):
l = [NamedObject("foo bar foo bleh"), NamedObject("foo bar bleh bar")]
m = getmatches(l)[0]
itemList = [NamedObject("foo bar foo bleh"), NamedObject("foo bar bleh bar")]
m = getmatches(itemList)[0]
eq_(75, m.percentage)
def test_with_fields(self):
@ -450,13 +499,13 @@ class TestCaseGetMatches:
eq_(m.percentage, 50)
def test_only_match_similar_when_the_option_is_set(self):
l = [NamedObject("foobar"), NamedObject("foobars")]
eq_(len(getmatches(l, match_similar_words=False)), 0)
itemList = [NamedObject("foobar"), NamedObject("foobars")]
eq_(len(getmatches(itemList, match_similar_words=False)), 0)
def test_dont_recurse_do_match(self):
# With nosetests, the stack is deeper; the number has to be high enough not to fail falsely
sys.setrecursionlimit(200)
files = [NamedObject('foo bar') for i in range(201)]
files = [NamedObject("foo bar") for i in range(201)]
try:
getmatches(files)
except RuntimeError:
@ -465,9 +514,9 @@ class TestCaseGetMatches:
sys.setrecursionlimit(1000)