From cc01e8eb09be2a64a6a6c51bf91aa37803c929ea Mon Sep 17 00:00:00 2001 From: Virgil Dupras Date: Sun, 13 Nov 2016 17:01:20 -0500 Subject: [PATCH] Move pe.cache.Cache into its own unit, cache_sqlite This prepares us for an upcoming alternative cache implementation. --- core/app.py | 2 +- core/pe/__init__.py | 2 +- core/pe/cache.py | 142 +-------------------------------------- core/pe/cache_sqlite.py | 142 +++++++++++++++++++++++++++++++++++++++ core/pe/matchblock.py | 8 +-- core/tests/cache_test.py | 63 ++++++++++------- 6 files changed, 190 insertions(+), 169 deletions(-) create mode 100644 core/pe/cache_sqlite.py diff --git a/core/app.py b/core/app.py index e20d6c30..c5728923 100644 --- a/core/app.py +++ b/core/app.py @@ -405,7 +405,7 @@ class DupeGuru(Broadcaster): path = path.parent() def clear_picture_cache(self): - cache = pe.cache.Cache(self.options['cache_path']) + cache = pe.cache_sqlite.SqliteCache(self.options['cache_path']) cache.clear() cache.close() diff --git a/core/pe/__init__.py b/core/pe/__init__.py index 9cac7a5f..be09829f 100644 --- a/core/pe/__init__.py +++ b/core/pe/__init__.py @@ -1 +1 @@ -from . import block, cache, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa +from . import block, cache, cache_sqlite, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa diff --git a/core/pe/cache.py b/core/pe/cache.py index 3e18583f..81a80abc 100644 --- a/core/pe/cache.py +++ b/core/pe/cache.py @@ -1,17 +1,10 @@ -# Created By: Virgil Dupras -# Created On: 2006/09/14 -# Copyright 2015 Hardcoded Software (http://www.hardcoded.net) +# Copyright 2016 Virgil Dupras # # This software is licensed under the "GPLv3" License as described in the "LICENSE" file, # which should be included with this package. The terms are also available at # http://www.gnu.org/licenses/gpl-3.0.html -import os -import os.path as op -import logging -import sqlite3 as sqlite - -from ._cache import string_to_colors +from ._cache import string_to_colors # noqa def colors_to_string(colors): """Transform the 3 sized tuples 'colors' into a hex string. @@ -19,7 +12,7 @@ def colors_to_string(colors): [(0,100,255)] --> 0064ff [(1,2,3),(4,5,6)] --> 010203040506 """ - return ''.join(['%02x%02x%02x' % (r, g, b) for r, g, b in colors]) + return ''.join('%02x%02x%02x' % (r, g, b) for r, g, b in colors) # This function is an important bottleneck of dupeGuru PE. It has been converted to C. # def string_to_colors(s): @@ -31,132 +24,3 @@ def colors_to_string(colors): # result.append((number >> 16, (number >> 8) & 0xff, number & 0xff)) # return result -class Cache: - """A class to cache picture blocks. - """ - def __init__(self, db=':memory:'): - self.dbname = db - self.con = None - self._create_con() - - def __contains__(self, key): - sql = "select count(*) from pictures where path = ?" - result = self.con.execute(sql, [key]).fetchall() - return result[0][0] > 0 - - def __delitem__(self, key): - if key not in self: - raise KeyError(key) - sql = "delete from pictures where path = ?" - self.con.execute(sql, [key]) - - # Optimized - def __getitem__(self, key): - if isinstance(key, int): - sql = "select blocks from pictures where rowid = ?" - else: - sql = "select blocks from pictures where path = ?" - result = self.con.execute(sql, [key]).fetchone() - if result: - result = string_to_colors(result[0]) - return result - else: - raise KeyError(key) - - def __iter__(self): - sql = "select path from pictures" - result = self.con.execute(sql) - return (row[0] for row in result) - - def __len__(self): - sql = "select count(*) from pictures" - result = self.con.execute(sql).fetchall() - return result[0][0] - - def __setitem__(self, path_str, blocks): - blocks = colors_to_string(blocks) - if op.exists(path_str): - mtime = int(os.stat(path_str).st_mtime) - else: - mtime = 0 - if path_str in self: - sql = "update pictures set blocks = ?, mtime = ? where path = ?" - else: - sql = "insert into pictures(blocks,mtime,path) values(?,?,?)" - try: - self.con.execute(sql, [blocks, mtime, path_str]) - except sqlite.OperationalError: - logging.warning('Picture cache could not set value for key %r', path_str) - except sqlite.DatabaseError as e: - logging.warning('DatabaseError while setting value for key %r: %s', path_str, str(e)) - - def _create_con(self, second_try=False): - def create_tables(): - logging.debug("Creating picture cache tables.") - self.con.execute("drop table if exists pictures") - self.con.execute("drop index if exists idx_path") - self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)") - self.con.execute("create index idx_path on pictures (path)") - - self.con = sqlite.connect(self.dbname, isolation_level=None) - try: - self.con.execute("select path, mtime, blocks from pictures where 1=2") - except sqlite.OperationalError: # new db - create_tables() - except sqlite.DatabaseError as e: # corrupted db - if second_try: - raise # Something really strange is happening - logging.warning('Could not create picture cache because of an error: %s', str(e)) - self.con.close() - os.remove(self.dbname) - self._create_con(second_try=True) - - def clear(self): - self.close() - if self.dbname != ':memory:': - os.remove(self.dbname) - self._create_con() - - def close(self): - if self.con is not None: - self.con.close() - self.con = None - - def filter(self, func): - to_delete = [key for key in self if not func(key)] - for key in to_delete: - del self[key] - - def get_id(self, path): - sql = "select rowid from pictures where path = ?" - result = self.con.execute(sql, [path]).fetchone() - if result: - return result[0] - else: - raise ValueError(path) - - def get_multiple(self, rowids): - sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids)) - cur = self.con.execute(sql) - return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur) - - def purge_outdated(self): - """Go through the cache and purge outdated records. - - A record is outdated if the picture doesn't exist or if its mtime is greater than the one in - the db. - """ - todelete = [] - sql = "select rowid, path, mtime from pictures" - cur = self.con.execute(sql) - for rowid, path_str, mtime in cur: - if mtime and op.exists(path_str): - picture_mtime = os.stat(path_str).st_mtime - if int(picture_mtime) <= mtime: - # not outdated - continue - todelete.append(rowid) - if todelete: - sql = "delete from pictures where rowid in (%s)" % ','.join(map(str, todelete)) - self.con.execute(sql) - diff --git a/core/pe/cache_sqlite.py b/core/pe/cache_sqlite.py new file mode 100644 index 00000000..07c2a033 --- /dev/null +++ b/core/pe/cache_sqlite.py @@ -0,0 +1,142 @@ +# Copyright 2016 Virgil Dupras +# +# This software is licensed under the "GPLv3" License as described in the "LICENSE" file, +# which should be included with this package. The terms are also available at +# http://www.gnu.org/licenses/gpl-3.0.html + +import os +import os.path as op +import logging +import sqlite3 as sqlite + +from .cache import string_to_colors, colors_to_string + +class SqliteCache: + """A class to cache picture blocks. + """ + def __init__(self, db=':memory:'): + self.dbname = db + self.con = None + self._create_con() + + def __contains__(self, key): + sql = "select count(*) from pictures where path = ?" + result = self.con.execute(sql, [key]).fetchall() + return result[0][0] > 0 + + def __delitem__(self, key): + if key not in self: + raise KeyError(key) + sql = "delete from pictures where path = ?" + self.con.execute(sql, [key]) + + # Optimized + def __getitem__(self, key): + if isinstance(key, int): + sql = "select blocks from pictures where rowid = ?" + else: + sql = "select blocks from pictures where path = ?" + result = self.con.execute(sql, [key]).fetchone() + if result: + result = string_to_colors(result[0]) + return result + else: + raise KeyError(key) + + def __iter__(self): + sql = "select path from pictures" + result = self.con.execute(sql) + return (row[0] for row in result) + + def __len__(self): + sql = "select count(*) from pictures" + result = self.con.execute(sql).fetchall() + return result[0][0] + + def __setitem__(self, path_str, blocks): + blocks = colors_to_string(blocks) + if op.exists(path_str): + mtime = int(os.stat(path_str).st_mtime) + else: + mtime = 0 + if path_str in self: + sql = "update pictures set blocks = ?, mtime = ? where path = ?" + else: + sql = "insert into pictures(blocks,mtime,path) values(?,?,?)" + try: + self.con.execute(sql, [blocks, mtime, path_str]) + except sqlite.OperationalError: + logging.warning('Picture cache could not set value for key %r', path_str) + except sqlite.DatabaseError as e: + logging.warning('DatabaseError while setting value for key %r: %s', path_str, str(e)) + + def _create_con(self, second_try=False): + def create_tables(): + logging.debug("Creating picture cache tables.") + self.con.execute("drop table if exists pictures") + self.con.execute("drop index if exists idx_path") + self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)") + self.con.execute("create index idx_path on pictures (path)") + + self.con = sqlite.connect(self.dbname, isolation_level=None) + try: + self.con.execute("select path, mtime, blocks from pictures where 1=2") + except sqlite.OperationalError: # new db + create_tables() + except sqlite.DatabaseError as e: # corrupted db + if second_try: + raise # Something really strange is happening + logging.warning('Could not create picture cache because of an error: %s', str(e)) + self.con.close() + os.remove(self.dbname) + self._create_con(second_try=True) + + def clear(self): + self.close() + if self.dbname != ':memory:': + os.remove(self.dbname) + self._create_con() + + def close(self): + if self.con is not None: + self.con.close() + self.con = None + + def filter(self, func): + to_delete = [key for key in self if not func(key)] + for key in to_delete: + del self[key] + + def get_id(self, path): + sql = "select rowid from pictures where path = ?" + result = self.con.execute(sql, [path]).fetchone() + if result: + return result[0] + else: + raise ValueError(path) + + def get_multiple(self, rowids): + sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids)) + cur = self.con.execute(sql) + return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur) + + def purge_outdated(self): + """Go through the cache and purge outdated records. + + A record is outdated if the picture doesn't exist or if its mtime is greater than the one in + the db. + """ + todelete = [] + sql = "select rowid, path, mtime from pictures" + cur = self.con.execute(sql) + for rowid, path_str, mtime in cur: + if mtime and op.exists(path_str): + picture_mtime = os.stat(path_str).st_mtime + if int(picture_mtime) <= mtime: + # not outdated + continue + todelete.append(rowid) + if todelete: + sql = "delete from pictures where rowid in (%s)" % ','.join(map(str, todelete)) + self.con.execute(sql) + diff --git a/core/pe/matchblock.py b/core/pe/matchblock.py index ce96fd9f..825a6242 100644 --- a/core/pe/matchblock.py +++ b/core/pe/matchblock.py @@ -16,7 +16,7 @@ from hscommon.jobprogress import job from core.engine import Match from .block import avgdiff, DifferentBlockCountError, NoBlocksError -from .cache import Cache +from .cache_sqlite import SqliteCache # OPTIMIZATION NOTES: # The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another @@ -54,7 +54,7 @@ def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob): # there is enough memory left to carry on the operation because it is assumed that the # MemoryError happens when trying to read an image file, which is freed from memory by the # time that MemoryError is raised. - cache = Cache(cache_path) + cache = SqliteCache(cache_path) cache.purge_outdated() prepared = [] # only pictures for which there was no error getting blocks try: @@ -109,7 +109,7 @@ def async_compare(ref_ids, other_ids, dbname, threshold, picinfo): # The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids # can be None. In this case, ref_ids has to be compared with itself # picinfo is a dictionary {pic_id: (dimensions, is_ref)} - cache = Cache(dbname) + cache = SqliteCache(dbname) limit = 100 - threshold ref_pairs = list(cache.get_multiple(ref_ids)) if other_ids is not None: @@ -159,7 +159,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo j = j.start_subjob([3, 7]) pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j) j = j.start_subjob([9, 1], tr("Preparing for matching")) - cache = Cache(cache_path) + cache = SqliteCache(cache_path) id2picture = {} for picture in pictures: try: diff --git a/core/tests/cache_test.py b/core/tests/cache_test.py index 3dbcd16d..515cb14c 100644 --- a/core/tests/cache_test.py +++ b/core/tests/cache_test.py @@ -1,4 +1,4 @@ -# Copyright 2016 Hardcoded Software (http://www.hardcoded.net) +# Copyright 2016 Virgil Dupras # # This software is licensed under the "GPLv3" License as described in the "LICENSE" file, # which should be included with this package. The terms are also available at @@ -10,7 +10,8 @@ from pytest import raises, skip from hscommon.testutil import eq_ try: - from ..pe.cache import Cache, colors_to_string, string_to_colors + from ..pe.cache import colors_to_string, string_to_colors + from ..pe.cache_sqlite import SqliteCache except ImportError: skip("Can't import the cache module, probably hasn't been compiled.") @@ -44,21 +45,24 @@ class TestCasestring_to_colors: eq_([], string_to_colors('102')) -class TestCaseCache: +class BaseTestCaseCache: + def get_cache(self, dbname=None): + raise NotImplementedError() + def test_empty(self): - c = Cache() + c = self.get_cache() eq_(0, len(c)) with raises(KeyError): c['foo'] def test_set_then_retrieve_blocks(self): - c = Cache() + c = self.get_cache() b = [(0, 0, 0), (1, 2, 3)] c['foo'] = b eq_(b, c['foo']) def test_delitem(self): - c = Cache() + c = self.get_cache() c['foo'] = '' del c['foo'] assert 'foo' not in c @@ -67,14 +71,14 @@ class TestCaseCache: def test_persistance(self, tmpdir): DBNAME = tmpdir.join('hstest.db') - c = Cache(str(DBNAME)) + c = self.get_cache(str(DBNAME)) c['foo'] = [(1, 2, 3)] del c - c = Cache(str(DBNAME)) + c = self.get_cache(str(DBNAME)) eq_([(1, 2, 3)], c['foo']) def test_filter(self): - c = Cache() + c = self.get_cache() c['foo'] = '' c['bar'] = '' c['baz'] = '' @@ -85,7 +89,7 @@ class TestCaseCache: assert 'bar' not in c def test_clear(self): - c = Cache() + c = self.get_cache() c['foo'] = '' c['bar'] = '' c['baz'] = '' @@ -95,6 +99,22 @@ class TestCaseCache: assert 'baz' not in c assert 'bar' not in c + def test_by_id(self): + # it's possible to use the cache by referring to the files by their row_id + c = self.get_cache() + b = [(0, 0, 0), (1, 2, 3)] + c['foo'] = b + foo_id = c.get_id('foo') + eq_(c[foo_id], b) + + +class TestCaseSqliteCache(BaseTestCaseCache): + def get_cache(self, dbname=None): + if dbname: + return SqliteCache(dbname) + else: + return SqliteCache() + def test_corrupted_db(self, tmpdir, monkeypatch): # If we don't do this monkeypatching, we get a weird exception about trying to flush a # closed file. I've tried setting logging level and stuff, but nothing worked. So, there we @@ -104,37 +124,32 @@ class TestCaseCache: fp = open(dbname, 'w') fp.write('invalid sqlite content') fp.close() - c = Cache(dbname) # should not raise a DatabaseError + c = self.get_cache(dbname) # should not raise a DatabaseError c['foo'] = [(1, 2, 3)] del c - c = Cache(dbname) + c = self.get_cache(dbname) eq_(c['foo'], [(1, 2, 3)]) - def test_by_id(self): - # it's possible to use the cache by referring to the files by their row_id - c = Cache() - b = [(0, 0, 0), (1, 2, 3)] - c['foo'] = b - foo_id = c.get_id('foo') - eq_(c[foo_id], b) - class TestCaseCacheSQLEscape: + def get_cache(self): + return SqliteCache() + def test_contains(self): - c = Cache() + c = self.get_cache() assert "foo'bar" not in c def test_getitem(self): - c = Cache() + c = self.get_cache() with raises(KeyError): c["foo'bar"] def test_setitem(self): - c = Cache() + c = self.get_cache() c["foo'bar"] = [] def test_delitem(self): - c = Cache() + c = self.get_cache() c["foo'bar"] = [] try: del c["foo'bar"]