
Add shelve-based picture cache implementation

Hopefully, this will fix #394 for real this time, that is, without the
need to ship a messy Python executable in the app.
commit c58a4817ca
parent f7adb5f11e
Author: Virgil Dupras
Date: 2016-11-15 19:58:18 -05:00
7 changed files with 162 additions and 11 deletions

core/pe/__init__.py

@@ -1 +1 @@
-from . import block, cache, cache_sqlite, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa
+from . import block, cache, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa

core/pe/cache_shelve.py (new file, 131 lines)

@@ -0,0 +1,131 @@
# Copyright 2016 Virgil Dupras
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

import os
import os.path as op
import shelve
import tempfile
from collections import namedtuple

from .cache import string_to_colors, colors_to_string

# Rows live under two key namespaces in the same shelf: 'path:<path>' maps to a
# CacheRow, and 'id:<rowid>' maps back to the corresponding 'path:<path>' key.

def wrap_path(path):
    return 'path:{}'.format(path)

def unwrap_path(key):
    return key[5:]

def wrap_id(path):
    return 'id:{}'.format(path)

def unwrap_id(key):
    return int(key[3:])

CacheRow = namedtuple('CacheRow', 'id path blocks mtime')

class ShelveCache:
    """A class to cache picture blocks in a shelve backend.
    """
    def __init__(self, db=None, readonly=False):
        self.istmp = db is None
        if self.istmp:
            self.dtmp = tempfile.mkdtemp()
            self.ftmp = db = op.join(self.dtmp, 'tmpdb')
        flag = 'r' if readonly else 'c'
        self.shelve = shelve.open(db, flag)
        self.maxid = self._compute_maxid()

    def __contains__(self, key):
        return wrap_path(key) in self.shelve

    def __delitem__(self, key):
        row = self.shelve[wrap_path(key)]
        del self.shelve[wrap_path(key)]
        del self.shelve[wrap_id(row.id)]

    def __getitem__(self, key):
        # Integer keys are rowids; they resolve to the wrapped path key first.
        if isinstance(key, int):
            skey = self.shelve[wrap_id(key)]
        else:
            skey = wrap_path(key)
        return string_to_colors(self.shelve[skey].blocks)

    def __iter__(self):
        return (unwrap_path(k) for k in self.shelve if k.startswith('path:'))

    def __len__(self):
        return sum(1 for k in self.shelve if k.startswith('path:'))

    def __setitem__(self, path_str, blocks):
        blocks = colors_to_string(blocks)
        if op.exists(path_str):
            mtime = int(os.stat(path_str).st_mtime)
        else:
            mtime = 0
        if path_str in self:
            rowid = self.shelve[wrap_path(path_str)].id
        else:
            rowid = self._get_new_id()
        row = CacheRow(rowid, path_str, blocks, mtime)
        self.shelve[wrap_path(path_str)] = row
        self.shelve[wrap_id(rowid)] = wrap_path(path_str)

    def _compute_maxid(self):
        return max((unwrap_id(k) for k in self.shelve if k.startswith('id:')), default=1)

    def _get_new_id(self):
        self.maxid += 1
        return self.maxid

    def clear(self):
        self.shelve.clear()

    def close(self):
        if self.shelve is not None:
            self.shelve.close()
            if self.istmp:
                os.remove(self.ftmp)
                os.rmdir(self.dtmp)
        self.shelve = None

    def filter(self, func):
        to_delete = [key for key in self if not func(key)]
        for key in to_delete:
            del self[key]

    def get_id(self, path):
        if path in self:
            return self.shelve[wrap_path(path)].id
        else:
            raise ValueError(path)

    def get_multiple(self, rowids):
        for rowid in rowids:
            try:
                skey = self.shelve[wrap_id(rowid)]
            except KeyError:
                continue
            yield (rowid, string_to_colors(self.shelve[skey].blocks))

    def purge_outdated(self):
        """Go through the cache and purge outdated records.

        A record is outdated if the picture doesn't exist or if its mtime is greater than the one in
        the db.
        """
        todelete = []
        for path in self:
            row = self.shelve[wrap_path(path)]
            if row.mtime and op.exists(path):
                picture_mtime = os.stat(path).st_mtime
                if int(picture_mtime) <= row.mtime:
                    # not outdated
                    continue
            todelete.append(path)
        for path in todelete:
            del self[path]
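
For illustration only (not part of the commit), a minimal usage sketch of ShelveCache, assuming RGB block tuples like those produced by core.pe.block; the picture paths are hypothetical:

# Hypothetical usage sketch; not part of the commit.
from core.pe.cache_shelve import ShelveCache

cache = ShelveCache()  # no db path given: backed by a temporary directory
cache['/pics/a.jpg'] = [(12, 34, 56), (78, 90, 12)]  # store averaged color blocks
rowid = cache.get_id('/pics/a.jpg')  # integer rowid, usable as a compact key
assert cache[rowid] == cache['/pics/a.jpg']  # lookup works by rowid or by path
cache.purge_outdated()  # drops rows for missing or modified pictures
cache.close()  # closes the shelf and removes the temporary db

Note that a path that does not exist on disk is stored with mtime 0, so purge_outdated() removes it on the next pass.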

core/pe/cache_sqlite.py

@@ -12,9 +12,10 @@ import sqlite3 as sqlite
 from .cache import string_to_colors, colors_to_string

 class SqliteCache:
-    """A class to cache picture blocks.
+    """A class to cache picture blocks in a sqlite backend.
     """
-    def __init__(self, db=':memory:'):
+    def __init__(self, db=':memory:', readonly=False):
+        # readonly is not used in the sqlite version of the cache
         self.dbname = db
         self.con = None
         self._create_con()
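
Since both constructors now share the signature (db, readonly=False), callers can treat the two backends interchangeably. Below is a sketch of the implicit, duck-typed interface; the commit defines no formal ABC, and the Protocol class shown uses modern typing purely for illustration:

# Hypothetical Protocol describing what both cache backends provide.
from typing import Iterator, List, Protocol, Tuple

class PictureCache(Protocol):
    def __setitem__(self, path_str: str, blocks: List[Tuple[int, int, int]]) -> None: ...
    def __getitem__(self, key) -> List[Tuple[int, int, int]]: ...  # by path or by rowid
    def get_id(self, path: str) -> int: ...
    def get_multiple(self, rowids) -> Iterator[Tuple[int, List[Tuple[int, int, int]]]]: ...
    def purge_outdated(self) -> None: ...
    def close(self) -> None: ...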

core/pe/matchblock.py

@@ -16,7 +16,6 @@ from hscommon.jobprogress import job
 from core.engine import Match

 from .block import avgdiff, DifferentBlockCountError, NoBlocksError
-from .cache_sqlite import SqliteCache

 # OPTIMIZATION NOTES:
 # The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another
@@ -49,12 +48,20 @@ except Exception:
     logging.warning("Had problems to determine cpu count on launch.")

 RESULTS_QUEUE_LIMIT = 8

+def get_cache(cache_path, readonly=False):
+    if cache_path.endswith('shelve'):
+        from .cache_shelve import ShelveCache
+        return ShelveCache(cache_path, readonly=readonly)
+    else:
+        from .cache_sqlite import SqliteCache
+        return SqliteCache(cache_path, readonly=readonly)
+
 def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
     # The MemoryError handlers in there use logging without first caring about whether or not
     # there is enough memory left to carry on the operation because it is assumed that the
     # MemoryError happens when trying to read an image file, which is freed from memory by the
     # time that MemoryError is raised.
-    cache = SqliteCache(cache_path)
+    cache = get_cache(cache_path)
     cache.purge_outdated()
     prepared = []  # only pictures for which there was no error getting blocks
     try:
@@ -109,7 +116,7 @@ def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
     # The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids
     # can be None. In this case, ref_ids has to be compared with itself
     # picinfo is a dictionary {pic_id: (dimensions, is_ref)}
-    cache = SqliteCache(dbname)
+    cache = get_cache(dbname, readonly=True)
     limit = 100 - threshold
     ref_pairs = list(cache.get_multiple(ref_ids))
     if other_ids is not None:
@@ -159,7 +166,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljob):
     j = j.start_subjob([3, 7])
     pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)
     j = j.start_subjob([9, 1], tr("Preparing for matching"))
-    cache = SqliteCache(cache_path)
+    cache = get_cache(cache_path)
     id2picture = {}
     for picture in pictures:
         try:
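
The suffix check in get_cache means the backend is selected simply by how the cache file is named; a hypothetical illustration (file names invented):

# Hypothetical: backend selection by cache file suffix.
cache = get_cache('/tmp/pictures.shelve')  # ends with 'shelve' -> ShelveCache
cache = get_cache('/tmp/pictures.db')  # anything else -> SqliteCache
cache = get_cache('/tmp/pictures.db', readonly=True)  # both backends accept readonly

Only the shelve backend honors readonly (it opens the shelf with flag 'r'); SqliteCache accepts the flag but ignores it, as noted in its constructor.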