From c58a4817cade417926b41fe17ff0bf8f3e958fa0 Mon Sep 17 00:00:00 2001 From: Virgil Dupras Date: Tue, 15 Nov 2016 19:58:18 -0500 Subject: [PATCH] Add shelve-based picture cache implementation Hopefully, this will fix #394 for real this time, that is, without the need for a messy python executable shipped in the app. --- cocoa/inter/app.py | 2 + core/app.py | 12 ++-- core/pe/__init__.py | 2 +- core/pe/cache_shelve.py | 131 +++++++++++++++++++++++++++++++++++++++ core/pe/cache_sqlite.py | 5 +- core/pe/matchblock.py | 15 +++-- core/tests/cache_test.py | 6 ++ 7 files changed, 162 insertions(+), 11 deletions(-) create mode 100644 core/pe/cache_shelve.py diff --git a/cocoa/inter/app.py b/cocoa/inter/app.py index f19a852f..6bc3e4e6 100644 --- a/cocoa/inter/app.py +++ b/cocoa/inter/app.py @@ -10,6 +10,8 @@ from .directories import Directories, Bundle from .photo import Photo class DupeGuru(DupeGuruBase): + PICTURE_CACHE_TYPE = 'shelve' + def __init__(self, view): DupeGuruBase.__init__(self, view) self.directories = Directories() diff --git a/core/app.py b/core/app.py index c5728923..32eef418 100644 --- a/core/app.py +++ b/core/app.py @@ -116,6 +116,8 @@ class DupeGuru(Broadcaster): NAME = PROMPT_NAME = "dupeGuru" + PICTURE_CACHE_TYPE = 'sqlite' # set to 'shelve' for a ShelveCache + def __init__(self, view): if view.get_default(DEBUG_MODE_PREFERENCE): logging.getLogger().setLevel(logging.DEBUG) @@ -133,12 +135,13 @@ class DupeGuru(Broadcaster): # In addition to "app-level" options, this dictionary also holds options that will be # sent to the scanner. They don't have default values because those defaults values are # defined in the scanner class. 
+ picture_cache_name = 'cached_pictures.shelve' if self.PICTURE_CACHE_TYPE == 'shelve' else 'cached_pictures.db' self.options = { 'escape_filter_regexp': True, 'clean_empty_dirs': False, 'ignore_hardlink_matches': False, 'copymove_dest_type': DestType.Relative, - 'cache_path': op.join(self.appdata, 'cached_pictures.db'), + 'cache_path': op.join(self.appdata, picture_cache_name), } self.selected_dupes = [] self.details_panel = DetailsPanel(self) @@ -405,9 +408,10 @@ class DupeGuru(Broadcaster): path = path.parent() def clear_picture_cache(self): - cache = pe.cache_sqlite.SqliteCache(self.options['cache_path']) - cache.clear() - cache.close() + try: + os.remove(self.options['cache_path']) + except FileNotFoundError: + pass # we don't care def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType): source_path = dupe.path diff --git a/core/pe/__init__.py b/core/pe/__init__.py index be09829f..9cac7a5f 100644 --- a/core/pe/__init__.py +++ b/core/pe/__init__.py @@ -1 +1 @@ -from . import block, cache, cache_sqlite, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa +from . import block, cache, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa diff --git a/core/pe/cache_shelve.py b/core/pe/cache_shelve.py new file mode 100644 index 00000000..cc4a39c0 --- /dev/null +++ b/core/pe/cache_shelve.py @@ -0,0 +1,131 @@ +# Copyright 2016 Virgil Dupras +# +# This software is licensed under the "GPLv3" License as described in the "LICENSE" file, +# which should be included with this package. 
The terms are also available at +# http://www.gnu.org/licenses/gpl-3.0.html + +import os +import os.path as op +import shelve +import tempfile +from collections import namedtuple + +from .cache import string_to_colors, colors_to_string + +def wrap_path(path): + return 'path:{}'.format(path) + +def unwrap_path(key): + return key[5:] + +def wrap_id(path): + return 'id:{}'.format(path) + +def unwrap_id(key): + return int(key[3:]) + +CacheRow = namedtuple('CacheRow', 'id path blocks mtime') + +class ShelveCache: + """A class to cache picture blocks in a shelve backend. + """ + def __init__(self, db=None, readonly=False): + self.istmp = db is None + if self.istmp: + self.dtmp = tempfile.mkdtemp() + self.ftmp = db = op.join(self.dtmp, 'tmpdb') + flag = 'r' if readonly else 'c' + self.shelve = shelve.open(db, flag) + self.maxid = self._compute_maxid() + + def __contains__(self, key): + return wrap_path(key) in self.shelve + + def __delitem__(self, key): + row = self.shelve[wrap_path(key)] + del self.shelve[wrap_path(key)] + del self.shelve[wrap_id(row.id)] + + def __getitem__(self, key): + if isinstance(key, int): + skey = self.shelve[wrap_id(key)] + else: + skey = wrap_path(key) + return string_to_colors(self.shelve[skey].blocks) + + def __iter__(self): + return (unwrap_path(k) for k in self.shelve if k.startswith('path:')) + + def __len__(self): + return sum(1 for k in self.shelve if k.startswith('path:')) + + def __setitem__(self, path_str, blocks): + blocks = colors_to_string(blocks) + if op.exists(path_str): + mtime = int(os.stat(path_str).st_mtime) + else: + mtime = 0 + if path_str in self: + rowid = self.shelve[wrap_path(path_str)].id + else: + rowid = self._get_new_id() + row = CacheRow(rowid, path_str, blocks, mtime) + self.shelve[wrap_path(path_str)] = row + self.shelve[wrap_id(rowid)] = wrap_path(path_str) + + def _compute_maxid(self): + return max((unwrap_id(k) for k in self.shelve if k.startswith('id:')), default=1) + + def _get_new_id(self): + self.maxid += 
1 + return self.maxid + + def clear(self): + self.shelve.clear() + + def close(self): + if self.shelve is not None: + self.shelve.close() + if self.istmp: + os.remove(self.ftmp) + os.rmdir(self.dtmp) + self.shelve = None + + def filter(self, func): + to_delete = [key for key in self if not func(key)] + for key in to_delete: + del self[key] + + def get_id(self, path): + if path in self: + return self.shelve[wrap_path(path)].id + else: + raise ValueError(path) + + def get_multiple(self, rowids): + for rowid in rowids: + try: + skey = self.shelve[wrap_id(rowid)] + except KeyError: + continue + yield (rowid, string_to_colors(self.shelve[skey].blocks)) + + def purge_outdated(self): + """Go through the cache and purge outdated records. + + A record is outdated if the picture doesn't exist or if its mtime is greater than the one in + the db. + """ + todelete = [] + for path in self: + row = self.shelve[wrap_path(path)] + if row.mtime and op.exists(path): + picture_mtime = os.stat(path).st_mtime + if int(picture_mtime) <= row.mtime: + # not outdated + continue + todelete.append(path) + for path in todelete: + del self[path] + + diff --git a/core/pe/cache_sqlite.py b/core/pe/cache_sqlite.py index 07c2a033..1e5dca15 100644 --- a/core/pe/cache_sqlite.py +++ b/core/pe/cache_sqlite.py @@ -12,9 +12,10 @@ import sqlite3 as sqlite from .cache import string_to_colors, colors_to_string class SqliteCache: - """A class to cache picture blocks. + """A class to cache picture blocks in a sqlite backend. 
""" - def __init__(self, db=':memory:'): + def __init__(self, db=':memory:', readonly=False): + # readonly is not used in the sqlite version of the cache self.dbname = db self.con = None self._create_con() diff --git a/core/pe/matchblock.py b/core/pe/matchblock.py index 825a6242..d866189a 100644 --- a/core/pe/matchblock.py +++ b/core/pe/matchblock.py @@ -16,7 +16,6 @@ from hscommon.jobprogress import job from core.engine import Match from .block import avgdiff, DifferentBlockCountError, NoBlocksError -from .cache_sqlite import SqliteCache # OPTIMIZATION NOTES: # The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another @@ -49,12 +48,20 @@ except Exception: logging.warning("Had problems to determine cpu count on launch.") RESULTS_QUEUE_LIMIT = 8 +def get_cache(cache_path, readonly=False): + if cache_path.endswith('shelve'): + from .cache_shelve import ShelveCache + return ShelveCache(cache_path, readonly=readonly) + else: + from .cache_sqlite import SqliteCache + return SqliteCache(cache_path, readonly=readonly) + def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob): # The MemoryError handlers in there use logging without first caring about whether or not # there is enough memory left to carry on the operation because it is assumed that the # MemoryError happens when trying to read an image file, which is freed from memory by the # time that MemoryError is raised. - cache = SqliteCache(cache_path) + cache = get_cache(cache_path) cache.purge_outdated() prepared = [] # only pictures for which there was no error getting blocks try: @@ -109,7 +116,7 @@ def async_compare(ref_ids, other_ids, dbname, threshold, picinfo): # The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids # can be None. 
In this case, ref_ids has to be compared with itself # picinfo is a dictionary {pic_id: (dimensions, is_ref)} - cache = SqliteCache(dbname) + cache = get_cache(dbname, readonly=True) limit = 100 - threshold ref_pairs = list(cache.get_multiple(ref_ids)) if other_ids is not None: @@ -159,7 +166,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo j = j.start_subjob([3, 7]) pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j) j = j.start_subjob([9, 1], tr("Preparing for matching")) - cache = SqliteCache(cache_path) + cache = get_cache(cache_path) id2picture = {} for picture in pictures: try: diff --git a/core/tests/cache_test.py b/core/tests/cache_test.py index 515cb14c..7073b381 100644 --- a/core/tests/cache_test.py +++ b/core/tests/cache_test.py @@ -12,6 +12,7 @@ from hscommon.testutil import eq_ try: from ..pe.cache import colors_to_string, string_to_colors from ..pe.cache_sqlite import SqliteCache + from ..pe.cache_shelve import ShelveCache except ImportError: skip("Can't import the cache module, probably hasn't been compiled.") @@ -131,6 +132,11 @@ class TestCaseSqliteCache(BaseTestCaseCache): eq_(c['foo'], [(1, 2, 3)]) +class TestCaseShelveCache(BaseTestCaseCache): + def get_cache(self, dbname=None): + return ShelveCache(dbname) + + class TestCaseCacheSQLEscape: def get_cache(self): return SqliteCache()