1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2026-01-22 14:41:39 +00:00

Move pe.cache.Cache into its own unit, cache_sqlite

This prepares us for an upcoming alternative cache implementation.
This commit is contained in:
Virgil Dupras
2016-11-13 17:01:20 -05:00
parent 1c20e5c770
commit cc01e8eb09
6 changed files with 190 additions and 169 deletions

View File

@@ -1 +1 @@
from . import block, cache, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa
from . import block, cache, cache_sqlite, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa

View File

@@ -1,17 +1,10 @@
# Created By: Virgil Dupras
# Created On: 2006/09/14
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
# Copyright 2016 Virgil Dupras
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
import os
import os.path as op
import logging
import sqlite3 as sqlite
from ._cache import string_to_colors
from ._cache import string_to_colors # noqa
def colors_to_string(colors):
"""Transform the 3 sized tuples 'colors' into a hex string.
@@ -19,7 +12,7 @@ def colors_to_string(colors):
[(0,100,255)] --> 0064ff
[(1,2,3),(4,5,6)] --> 010203040506
"""
return ''.join(['%02x%02x%02x' % (r, g, b) for r, g, b in colors])
return ''.join('%02x%02x%02x' % (r, g, b) for r, g, b in colors)
# This function is an important bottleneck of dupeGuru PE. It has been converted to C.
# def string_to_colors(s):
@@ -31,132 +24,3 @@ def colors_to_string(colors):
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
# return result
class Cache:
"""A class to cache picture blocks.
"""
def __init__(self, db=':memory:'):
self.dbname = db
self.con = None
self._create_con()
def __contains__(self, key):
sql = "select count(*) from pictures where path = ?"
result = self.con.execute(sql, [key]).fetchall()
return result[0][0] > 0
def __delitem__(self, key):
if key not in self:
raise KeyError(key)
sql = "delete from pictures where path = ?"
self.con.execute(sql, [key])
# Optimized
def __getitem__(self, key):
if isinstance(key, int):
sql = "select blocks from pictures where rowid = ?"
else:
sql = "select blocks from pictures where path = ?"
result = self.con.execute(sql, [key]).fetchone()
if result:
result = string_to_colors(result[0])
return result
else:
raise KeyError(key)
def __iter__(self):
sql = "select path from pictures"
result = self.con.execute(sql)
return (row[0] for row in result)
def __len__(self):
sql = "select count(*) from pictures"
result = self.con.execute(sql).fetchall()
return result[0][0]
def __setitem__(self, path_str, blocks):
blocks = colors_to_string(blocks)
if op.exists(path_str):
mtime = int(os.stat(path_str).st_mtime)
else:
mtime = 0
if path_str in self:
sql = "update pictures set blocks = ?, mtime = ? where path = ?"
else:
sql = "insert into pictures(blocks,mtime,path) values(?,?,?)"
try:
self.con.execute(sql, [blocks, mtime, path_str])
except sqlite.OperationalError:
logging.warning('Picture cache could not set value for key %r', path_str)
except sqlite.DatabaseError as e:
logging.warning('DatabaseError while setting value for key %r: %s', path_str, str(e))
def _create_con(self, second_try=False):
def create_tables():
logging.debug("Creating picture cache tables.")
self.con.execute("drop table if exists pictures")
self.con.execute("drop index if exists idx_path")
self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)")
self.con.execute("create index idx_path on pictures (path)")
self.con = sqlite.connect(self.dbname, isolation_level=None)
try:
self.con.execute("select path, mtime, blocks from pictures where 1=2")
except sqlite.OperationalError: # new db
create_tables()
except sqlite.DatabaseError as e: # corrupted db
if second_try:
raise # Something really strange is happening
logging.warning('Could not create picture cache because of an error: %s', str(e))
self.con.close()
os.remove(self.dbname)
self._create_con(second_try=True)
def clear(self):
self.close()
if self.dbname != ':memory:':
os.remove(self.dbname)
self._create_con()
def close(self):
if self.con is not None:
self.con.close()
self.con = None
def filter(self, func):
to_delete = [key for key in self if not func(key)]
for key in to_delete:
del self[key]
def get_id(self, path):
sql = "select rowid from pictures where path = ?"
result = self.con.execute(sql, [path]).fetchone()
if result:
return result[0]
else:
raise ValueError(path)
def get_multiple(self, rowids):
sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids))
cur = self.con.execute(sql)
return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
def purge_outdated(self):
"""Go through the cache and purge outdated records.
A record is outdated if the picture doesn't exist or if its mtime is greater than the one in
the db.
"""
todelete = []
sql = "select rowid, path, mtime from pictures"
cur = self.con.execute(sql)
for rowid, path_str, mtime in cur:
if mtime and op.exists(path_str):
picture_mtime = os.stat(path_str).st_mtime
if int(picture_mtime) <= mtime:
# not outdated
continue
todelete.append(rowid)
if todelete:
sql = "delete from pictures where rowid in (%s)" % ','.join(map(str, todelete))
self.con.execute(sql)

142
core/pe/cache_sqlite.py Normal file
View File

@@ -0,0 +1,142 @@
# Copyright 2016 Virgil Dupras
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
import os
import os.path as op
import logging
import sqlite3 as sqlite
from .cache import string_to_colors, colors_to_string
class SqliteCache:
"""A class to cache picture blocks.
"""
def __init__(self, db=':memory:'):
self.dbname = db
self.con = None
self._create_con()
def __contains__(self, key):
sql = "select count(*) from pictures where path = ?"
result = self.con.execute(sql, [key]).fetchall()
return result[0][0] > 0
def __delitem__(self, key):
if key not in self:
raise KeyError(key)
sql = "delete from pictures where path = ?"
self.con.execute(sql, [key])
# Optimized
def __getitem__(self, key):
if isinstance(key, int):
sql = "select blocks from pictures where rowid = ?"
else:
sql = "select blocks from pictures where path = ?"
result = self.con.execute(sql, [key]).fetchone()
if result:
result = string_to_colors(result[0])
return result
else:
raise KeyError(key)
def __iter__(self):
sql = "select path from pictures"
result = self.con.execute(sql)
return (row[0] for row in result)
def __len__(self):
sql = "select count(*) from pictures"
result = self.con.execute(sql).fetchall()
return result[0][0]
def __setitem__(self, path_str, blocks):
blocks = colors_to_string(blocks)
if op.exists(path_str):
mtime = int(os.stat(path_str).st_mtime)
else:
mtime = 0
if path_str in self:
sql = "update pictures set blocks = ?, mtime = ? where path = ?"
else:
sql = "insert into pictures(blocks,mtime,path) values(?,?,?)"
try:
self.con.execute(sql, [blocks, mtime, path_str])
except sqlite.OperationalError:
logging.warning('Picture cache could not set value for key %r', path_str)
except sqlite.DatabaseError as e:
logging.warning('DatabaseError while setting value for key %r: %s', path_str, str(e))
def _create_con(self, second_try=False):
def create_tables():
logging.debug("Creating picture cache tables.")
self.con.execute("drop table if exists pictures")
self.con.execute("drop index if exists idx_path")
self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)")
self.con.execute("create index idx_path on pictures (path)")
self.con = sqlite.connect(self.dbname, isolation_level=None)
try:
self.con.execute("select path, mtime, blocks from pictures where 1=2")
except sqlite.OperationalError: # new db
create_tables()
except sqlite.DatabaseError as e: # corrupted db
if second_try:
raise # Something really strange is happening
logging.warning('Could not create picture cache because of an error: %s', str(e))
self.con.close()
os.remove(self.dbname)
self._create_con(second_try=True)
def clear(self):
self.close()
if self.dbname != ':memory:':
os.remove(self.dbname)
self._create_con()
def close(self):
if self.con is not None:
self.con.close()
self.con = None
def filter(self, func):
to_delete = [key for key in self if not func(key)]
for key in to_delete:
del self[key]
def get_id(self, path):
sql = "select rowid from pictures where path = ?"
result = self.con.execute(sql, [path]).fetchone()
if result:
return result[0]
else:
raise ValueError(path)
def get_multiple(self, rowids):
sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids))
cur = self.con.execute(sql)
return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
def purge_outdated(self):
"""Go through the cache and purge outdated records.
A record is outdated if the picture doesn't exist or if its mtime is greater than the one in
the db.
"""
todelete = []
sql = "select rowid, path, mtime from pictures"
cur = self.con.execute(sql)
for rowid, path_str, mtime in cur:
if mtime and op.exists(path_str):
picture_mtime = os.stat(path_str).st_mtime
if int(picture_mtime) <= mtime:
# not outdated
continue
todelete.append(rowid)
if todelete:
sql = "delete from pictures where rowid in (%s)" % ','.join(map(str, todelete))
self.con.execute(sql)

View File

@@ -16,7 +16,7 @@ from hscommon.jobprogress import job
from core.engine import Match
from .block import avgdiff, DifferentBlockCountError, NoBlocksError
from .cache import Cache
from .cache_sqlite import SqliteCache
# OPTIMIZATION NOTES:
# The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another
@@ -54,7 +54,7 @@ def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
# there is enough memory left to carry on the operation because it is assumed that the
# MemoryError happens when trying to read an image file, which is freed from memory by the
# time that MemoryError is raised.
cache = Cache(cache_path)
cache = SqliteCache(cache_path)
cache.purge_outdated()
prepared = [] # only pictures for which there was no error getting blocks
try:
@@ -109,7 +109,7 @@ def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
# The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids
# can be None. In this case, ref_ids has to be compared with itself
# picinfo is a dictionary {pic_id: (dimensions, is_ref)}
cache = Cache(dbname)
cache = SqliteCache(dbname)
limit = 100 - threshold
ref_pairs = list(cache.get_multiple(ref_ids))
if other_ids is not None:
@@ -159,7 +159,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
j = j.start_subjob([3, 7])
pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)
j = j.start_subjob([9, 1], tr("Preparing for matching"))
cache = Cache(cache_path)
cache = SqliteCache(cache_path)
id2picture = {}
for picture in pictures:
try: