
Add shelve-based picture cache implementation

Hopefully, this will fix #394 for real this time, that is, without the
need to ship a messy Python executable in the app.
commit c58a4817ca
parent f7adb5f11e
Author: Virgil Dupras
Date: 2016-11-15 19:58:18 -05:00
7 changed files with 162 additions and 11 deletions

core/pe/__init__.py

@@ -1 +1 @@
-from . import block, cache, cache_sqlite, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa
+from . import block, cache, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa

core/pe/cache_shelve.py (new file, 131 lines)

@@ -0,0 +1,131 @@
# Copyright 2016 Virgil Dupras
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

import os
import os.path as op
import shelve
import tempfile
from collections import namedtuple

from .cache import string_to_colors, colors_to_string

# Rows live under two key namespaces in the same shelf: 'path:<path>' maps to a
# CacheRow, and 'id:<rowid>' maps back to the corresponding 'path:<path>' key.

def wrap_path(path):
    return 'path:{}'.format(path)

def unwrap_path(key):
    return key[5:]

def wrap_id(path):
    return 'id:{}'.format(path)

def unwrap_id(key):
    return int(key[3:])

CacheRow = namedtuple('CacheRow', 'id path blocks mtime')

class ShelveCache:
    """A class to cache picture blocks in a shelve backend.
    """
    def __init__(self, db=None, readonly=False):
        self.istmp = db is None
        if self.istmp:
            self.dtmp = tempfile.mkdtemp()
            self.ftmp = db = op.join(self.dtmp, 'tmpdb')
        flag = 'r' if readonly else 'c'
        self.shelve = shelve.open(db, flag)
        self.maxid = self._compute_maxid()

    def __contains__(self, key):
        return wrap_path(key) in self.shelve

    def __delitem__(self, key):
        row = self.shelve[wrap_path(key)]
        del self.shelve[wrap_path(key)]
        del self.shelve[wrap_id(row.id)]

    def __getitem__(self, key):
        # Integer keys are rowids; they resolve to the wrapped path key first.
        if isinstance(key, int):
            skey = self.shelve[wrap_id(key)]
        else:
            skey = wrap_path(key)
        return string_to_colors(self.shelve[skey].blocks)

    def __iter__(self):
        return (unwrap_path(k) for k in self.shelve if k.startswith('path:'))

    def __len__(self):
        return sum(1 for k in self.shelve if k.startswith('path:'))

    def __setitem__(self, path_str, blocks):
        blocks = colors_to_string(blocks)
        if op.exists(path_str):
            mtime = int(os.stat(path_str).st_mtime)
        else:
            mtime = 0
        if path_str in self:
            rowid = self.shelve[wrap_path(path_str)].id
        else:
            rowid = self._get_new_id()
        row = CacheRow(rowid, path_str, blocks, mtime)
        self.shelve[wrap_path(path_str)] = row
        self.shelve[wrap_id(rowid)] = wrap_path(path_str)

    def _compute_maxid(self):
        return max((unwrap_id(k) for k in self.shelve if k.startswith('id:')), default=1)

    def _get_new_id(self):
        self.maxid += 1
        return self.maxid

    def clear(self):
        self.shelve.clear()

    def close(self):
        if self.shelve is not None:
            self.shelve.close()
            if self.istmp:
                os.remove(self.ftmp)
                os.rmdir(self.dtmp)
        self.shelve = None

    def filter(self, func):
        to_delete = [key for key in self if not func(key)]
        for key in to_delete:
            del self[key]

    def get_id(self, path):
        if path in self:
            return self.shelve[wrap_path(path)].id
        else:
            raise ValueError(path)

    def get_multiple(self, rowids):
        for rowid in rowids:
            try:
                skey = self.shelve[wrap_id(rowid)]
            except KeyError:
                continue
            yield (rowid, string_to_colors(self.shelve[skey].blocks))

    def purge_outdated(self):
        """Go through the cache and purge outdated records.

        A record is outdated if the picture doesn't exist or if its mtime is greater than the one in
        the db.
        """
        todelete = []
        for path in self:
            row = self.shelve[wrap_path(path)]
            if row.mtime and op.exists(path):
                picture_mtime = os.stat(path).st_mtime
                if int(picture_mtime) <= row.mtime:
                    # not outdated
                    continue
            todelete.append(path)
        for path in todelete:
            del self[path]
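
For illustration only (not part of the commit), a minimal usage sketch of ShelveCache, assuming RGB block tuples like those produced by core.pe.block; the picture paths are hypothetical:

# Hypothetical usage sketch; not part of the commit.
from core.pe.cache_shelve import ShelveCache

cache = ShelveCache()  # no db path given: backed by a temporary directory
cache['/pics/a.jpg'] = [(12, 34, 56), (78, 90, 12)]  # store averaged color blocks
rowid = cache.get_id('/pics/a.jpg')  # integer rowid, usable as a compact key
assert cache[rowid] == cache['/pics/a.jpg']  # lookup works by rowid or by path
cache.purge_outdated()  # drops rows for missing or modified pictures
cache.close()  # closes the shelf and removes the temporary db

Note that a path that does not exist on disk is stored with mtime 0, so purge_outdated() removes it on the next pass.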

core/pe/cache_sqlite.py

@@ -12,9 +12,10 @@ import sqlite3 as sqlite
 from .cache import string_to_colors, colors_to_string

 class SqliteCache:
-    """A class to cache picture blocks.
+    """A class to cache picture blocks in a sqlite backend.
     """
-    def __init__(self, db=':memory:'):
+    def __init__(self, db=':memory:', readonly=False):
+        # readonly is not used in the sqlite version of the cache
         self.dbname = db
         self.con = None
         self._create_con()
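
Since both constructors now share the signature (db, readonly=False), callers can treat the two backends interchangeably. Below is a sketch of the implicit, duck-typed interface; the commit defines no formal ABC, and the Protocol class shown uses modern typing purely for illustration:

# Hypothetical Protocol describing what both cache backends provide.
from typing import Iterator, List, Protocol, Tuple

class PictureCache(Protocol):
    def __setitem__(self, path_str: str, blocks: List[Tuple[int, int, int]]) -> None: ...
    def __getitem__(self, key) -> List[Tuple[int, int, int]]: ...  # by path or by rowid
    def get_id(self, path: str) -> int: ...
    def get_multiple(self, rowids) -> Iterator[Tuple[int, List[Tuple[int, int, int]]]]: ...
    def purge_outdated(self) -> None: ...
    def close(self) -> None: ...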

core/pe/matchblock.py

@@ -16,7 +16,6 @@ from hscommon.jobprogress import job
 from core.engine import Match

 from .block import avgdiff, DifferentBlockCountError, NoBlocksError
-from .cache_sqlite import SqliteCache

 # OPTIMIZATION NOTES:
 # The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another
@@ -49,12 +48,20 @@ except Exception:
     logging.warning("Had problems to determine cpu count on launch.")

 RESULTS_QUEUE_LIMIT = 8

+def get_cache(cache_path, readonly=False):
+    if cache_path.endswith('shelve'):
+        from .cache_shelve import ShelveCache
+        return ShelveCache(cache_path, readonly=readonly)
+    else:
+        from .cache_sqlite import SqliteCache
+        return SqliteCache(cache_path, readonly=readonly)
+
 def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
     # The MemoryError handlers in there use logging without first caring about whether or not
     # there is enough memory left to carry on the operation because it is assumed that the
     # MemoryError happens when trying to read an image file, which is freed from memory by the
     # time that MemoryError is raised.
-    cache = SqliteCache(cache_path)
+    cache = get_cache(cache_path)
     cache.purge_outdated()
     prepared = []  # only pictures for which there was no error getting blocks
     try:
@@ -109,7 +116,7 @@ def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
     # The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids
     # can be None. In this case, ref_ids has to be compared with itself
     # picinfo is a dictionary {pic_id: (dimensions, is_ref)}
-    cache = SqliteCache(dbname)
+    cache = get_cache(dbname, readonly=True)
     limit = 100 - threshold
     ref_pairs = list(cache.get_multiple(ref_ids))
     if other_ids is not None:
@@ -159,7 +166,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljob):
     j = j.start_subjob([3, 7])
     pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)
     j = j.start_subjob([9, 1], tr("Preparing for matching"))
-    cache = SqliteCache(cache_path)
+    cache = get_cache(cache_path)
     id2picture = {}
     for picture in pictures:
         try:
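
The suffix check in get_cache means the backend is selected simply by how the cache file is named; a hypothetical illustration (file names invented):

# Hypothetical: backend selection by cache file suffix.
cache = get_cache('/tmp/pictures.shelve')  # ends with 'shelve' -> ShelveCache
cache = get_cache('/tmp/pictures.db')  # anything else -> SqliteCache
cache = get_cache('/tmp/pictures.db', readonly=True)  # both backends accept readonly

Only the shelve backend honors readonly (it opens the shelf with flag 'r'); SqliteCache accepts the flag but ignores it, as noted in its constructor.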