From f1153c85c0684497101002aa8adbe8ee6a52d100 Mon Sep 17 00:00:00 2001
From: Dobatymo
Date: Tue, 27 Sep 2022 17:34:57 +0800
Subject: [PATCH 1/2] serialize/deserialize colors to/from bytes instead of strings

it's a tiny bit faster and saves a bit of memory
---
 core/pe/cache.py         | 23 +++++--------------
 core/pe/cache.pyi        |  4 ++--
 core/pe/cache_shelve.py  |  8 +++----
 core/pe/cache_sqlite.py  | 10 ++++----
 core/pe/matchblock.py    |  2 +-
 core/pe/modules/cache.c  | 49 ++++++++++++----------------------------
 core/pe/modules/common.c |  2 +-
 core/pe/modules/common.h |  2 +-
 core/tests/cache_test.py | 25 ++++++++++----------
 9 files changed, 47 insertions(+), 78 deletions(-)

diff --git a/core/pe/cache.py b/core/pe/cache.py
index 31fcb0bb..738037f7 100644
--- a/core/pe/cache.py
+++ b/core/pe/cache.py
@@ -4,24 +4,13 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html
 
-from core.pe._cache import string_to_colors  # noqa
+from core.pe._cache import bytes_to_colors  # noqa
 
 
-def colors_to_string(colors):
-    """Transform the 3 sized tuples 'colors' into a hex string.
+def colors_to_bytes(colors):
+    """Transform the 3 sized tuples 'colors' into a bytes string.
 
-    [(0,100,255)] --> 0064ff
-    [(1,2,3),(4,5,6)] --> 010203040506
+    [(0,100,255)] --> b'\x00d\xff'
+    [(1,2,3),(4,5,6)] --> b'\x01\x02\x03\x04\x05\x06'
     """
-    return "".join("{:02x}{:02x}{:02x}".format(r, g, b) for r, g, b in colors)
-
-
-# This function is an important bottleneck of dupeGuru PE. It has been converted to C.
-# def string_to_colors(s):
-#     """Transform the string 's' in a list of 3 sized tuples.
-#     """
-#     result = []
-#     for i in xrange(0, len(s), 6):
-#         number = int(s[i:i+6], 16)
-#         result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
-#     return result
+    return b"".join(map(bytes, colors))
diff --git a/core/pe/cache.pyi b/core/pe/cache.pyi
index fbf1e8c8..dd59b510 100644
--- a/core/pe/cache.pyi
+++ b/core/pe/cache.pyi
@@ -2,5 +2,5 @@ from typing import Union, Tuple, List
 
 _block = Tuple[int, int, int]
 
-def colors_to_string(colors: List[_block]) -> str: ...  # noqa: E302
-def string_to_colors(s: str) -> Union[List[_block], None]: ...
+def colors_to_bytes(colors: List[_block]) -> bytes: ...  # noqa: E302
+def bytes_to_colors(s: bytes) -> Union[List[_block], None]: ...
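The serializer is now a one-liner because bytes() over a tuple of integers yields one raw byte per component, so each color costs 3 bytes instead of the 6 hex characters used before; a component outside 0-255 would raise ValueError. A quick sketch of the documented behaviour:

    colors = [(0, 100, 255), (1, 2, 3)]
    data = b"".join(map(bytes, colors))  # what colors_to_bytes(colors) returns
    assert data == b"\x00d\xff\x01\x02\x03"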
diff --git a/core/pe/cache_shelve.py b/core/pe/cache_shelve.py
index 57f42775..9092ab9f 100644
--- a/core/pe/cache_shelve.py
+++ b/core/pe/cache_shelve.py
@@ -10,7 +10,7 @@ import shelve
 import tempfile
 from collections import namedtuple
 
-from core.pe.cache import string_to_colors, colors_to_string
+from core.pe.cache import bytes_to_colors, colors_to_bytes
 
 
 def wrap_path(path):
@@ -57,7 +57,7 @@ class ShelveCache:
             skey = self.shelve[wrap_id(key)]
         else:
             skey = wrap_path(key)
-        return string_to_colors(self.shelve[skey].blocks)
+        return bytes_to_colors(self.shelve[skey].blocks)
 
     def __iter__(self):
         return (unwrap_path(k) for k in self.shelve if k.startswith("path:"))
@@ -66,7 +66,7 @@
         return sum(1 for k in self.shelve if k.startswith("path:"))
 
     def __setitem__(self, path_str, blocks):
-        blocks = colors_to_string(blocks)
+        blocks = colors_to_bytes(blocks)
         if op.exists(path_str):
             mtime = int(os.stat(path_str).st_mtime)
         else:
@@ -114,7 +114,7 @@
                 skey = self.shelve[wrap_id(rowid)]
             except KeyError:
                 continue
-            yield (rowid, string_to_colors(self.shelve[skey].blocks))
+            yield (rowid, bytes_to_colors(self.shelve[skey].blocks))
 
     def purge_outdated(self):
         """Go through the cache and purge outdated records.
diff --git a/core/pe/cache_sqlite.py b/core/pe/cache_sqlite.py
index ebaa8e66..bf9f7c5b 100644
--- a/core/pe/cache_sqlite.py
+++ b/core/pe/cache_sqlite.py
@@ -9,7 +9,7 @@ import os.path as op
 import logging
 import sqlite3 as sqlite
 
-from core.pe.cache import string_to_colors, colors_to_string
+from core.pe.cache import bytes_to_colors, colors_to_bytes
 
 
 class SqliteCache:
@@ -40,7 +40,7 @@ class SqliteCache:
         sql = "select blocks from pictures where path = ?"
         result = self.con.execute(sql, [key]).fetchone()
         if result:
-            result = string_to_colors(result[0])
+            result = bytes_to_colors(result[0])
             return result
         else:
             raise KeyError(key)
@@ -56,7 +56,7 @@
         return result[0][0]
 
     def __setitem__(self, path_str, blocks):
-        blocks = colors_to_string(blocks)
+        blocks = colors_to_bytes(blocks)
         if op.exists(path_str):
             mtime = int(os.stat(path_str).st_mtime)
         else:
@@ -77,7 +77,7 @@
             logging.debug("Creating picture cache tables.")
             self.con.execute("drop table if exists pictures")
             self.con.execute("drop index if exists idx_path")
-            self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)")
+            self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks BLOB)")
             self.con.execute("create index idx_path on pictures (path)")
 
         self.con = sqlite.connect(self.dbname, isolation_level=None)
@@ -120,7 +120,7 @@
     def get_multiple(self, rowids):
         sql = "select rowid, blocks from pictures where rowid in (%s)" % ",".join(map(str, rowids))
         cur = self.con.execute(sql)
-        return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
+        return ((rowid, bytes_to_colors(blocks)) for rowid, blocks in cur)
 
     def purge_outdated(self):
         """Go through the cache and purge outdated records.
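On the size claim in the commit message and the TEXT -> BLOB column change above: Python's sqlite3 binds bytes parameters as BLOBs and hands BLOB columns back as bytes, and the raw encoding is half the length of the old hex text (3 bytes per color instead of 6 hex characters). A small sketch against a throwaway in-memory table (the table here is illustrative, not the real cache schema):

    import sqlite3

    hex_blocks = "0064ff010203"            # old encoding: 6 hex chars per color
    raw_blocks = b"\x00d\xff\x01\x02\x03"  # new encoding: 3 bytes per color
    assert len(raw_blocks) * 2 == len(hex_blocks)

    con = sqlite3.connect(":memory:")
    con.execute("create table t(blocks BLOB)")  # illustrative table
    con.execute("insert into t values (?)", [raw_blocks])
    (stored,) = con.execute("select blocks from t").fetchone()
    assert isinstance(stored, bytes) and stored == raw_blocks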
diff --git a/core/pe/matchblock.py b/core/pe/matchblock.py
index 447d8ae7..a98a48cd 100644
--- a/core/pe/matchblock.py
+++ b/core/pe/matchblock.py
@@ -27,7 +27,7 @@ from core.pe.block import avgdiff, DifferentBlockCountError, NoBlocksError
 # to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage
 # is that instead of reading blocks from disk number_of_files**2 times, we read it
 # number_of_files*number_of_chunks times.
-# Determining the right chunk size is tricky, bceause if it's too big, too many blocks will be in
+# Determining the right chunk size is tricky, because if it's too big, too many blocks will be in
 # memory at the same time and we might end up with memory trashing, which is awfully slow. So,
 # because our *real* bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't
 # starved by Disk IOs.
diff --git a/core/pe/modules/cache.c b/core/pe/modules/cache.c
index 1ebb611f..b975a2e8 100644
--- a/core/pe/modules/cache.c
+++ b/core/pe/modules/cache.c
@@ -9,51 +9,31 @@
 
 #include "common.h"
 
-/* I know that there strtol out there, but it requires a pointer to
- * a char, which would in turn require me to buffer my chars around,
- * making the whole process slower.
- */
-static long
-xchar_to_long(char c)
-{
-    if ((c >= 48) && (c <= 57)) { /* 0-9 */
-        return c - 48;
-    }
-    else if ((c >= 65) && (c <= 70)) { /* A-F */
-        return c - 55;
-    }
-    else if ((c >= 97) && (c <= 102)) { /* a-f */
-        return c - 87;
-    }
-    return 0;
-}
-
 static PyObject*
-cache_string_to_colors(PyObject *self, PyObject *args)
+cache_bytes_to_colors(PyObject *self, PyObject *args)
 {
-    char *s;
-    Py_ssize_t char_count, color_count, i;
+    char *y;
+    Py_ssize_t char_count, i, color_count;
     PyObject *result;
-
-    if (!PyArg_ParseTuple(args, "s#", &s, &char_count)) {
+    unsigned long r, g, b;
+    Py_ssize_t ci;
+    PyObject *color_tuple;
+
+    if (!PyArg_ParseTuple(args, "y#", &y, &char_count)) {
         return NULL;
     }
 
-    color_count = (char_count / 6);
+    color_count = char_count / 3;
     result = PyList_New(color_count);
     if (result == NULL) {
         return NULL;
     }
 
     for (i=0; i
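As a rough pure-Python reference for what the new bytes_to_colors computes (the shipped implementation is the C function above): every three consecutive bytes become one (r, g, b) tuple, matching color_count = char_count / 3 and the docstring examples in cache.py. The helper name below is illustrative, not part of the patch:

    def bytes_to_colors_reference(data: bytes):
        # pure-Python stand-in for the C extension's bytes_to_colors
        return [(data[i], data[i + 1], data[i + 2]) for i in range(0, len(data), 3)]

    assert bytes_to_colors_reference(b"\x00d\xff") == [(0, 100, 255)]
    assert bytes_to_colors_reference(b"\x01\x02\x03\x04\x05\x06") == [(1, 2, 3), (4, 5, 6)]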
Date: Fri, 13 Jan 2023 00:05:47 -0600
Subject: [PATCH 2/2] feat: Add migration for picture cache db

- Add migration (just delete db and change to new schema) for picture cache
  following the same sort of strategy as the file digest cache
- Rename mtime column to mtime_ns to match file cache for consistency
---
 core/pe/cache_sqlite.py | 52 +++++++++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/core/pe/cache_sqlite.py b/core/pe/cache_sqlite.py
index bf9f7c5b..4cb3c588 100644
--- a/core/pe/cache_sqlite.py
+++ b/core/pe/cache_sqlite.py
@@ -15,6 +15,14 @@ from core.pe.cache import bytes_to_colors, colors_to_bytes
 class SqliteCache:
     """A class to cache picture blocks in a sqlite backend."""
 
+    schema_version = 1
+    schema_version_description = "Changed from string to bytes for blocks."
+
+    create_table_query = "CREATE TABLE IF NOT EXISTS pictures(path TEXT, mtime_ns INTEGER, blocks BLOB)"
+    create_index_query = "CREATE INDEX IF NOT EXISTS idx_path on pictures (path)"
+    drop_table_query = "DROP TABLE IF EXISTS pictures"
+    drop_index_query = "DROP INDEX IF EXISTS idx_path"
+
     def __init__(self, db=":memory:", readonly=False):
         # readonly is not used in the sqlite version of the cache
         self.dbname = db
@@ -62,9 +70,9 @@ class SqliteCache:
         else:
             mtime = 0
         if path_str in self:
-            sql = "update pictures set blocks = ?, mtime = ? where path = ?"
+            sql = "update pictures set blocks = ?, mtime_ns = ? where path = ?"
         else:
-            sql = "insert into pictures(blocks,mtime,path) values(?,?,?)"
+            sql = "insert into pictures(blocks,mtime_ns,path) values(?,?,?)"
         try:
             self.con.execute(sql, [blocks, mtime, path_str])
         except sqlite.OperationalError:
@@ -73,18 +81,9 @@ class SqliteCache:
             logging.warning("DatabaseError while setting value for key %r: %s", path_str, str(e))
 
     def _create_con(self, second_try=False):
-        def create_tables():
-            logging.debug("Creating picture cache tables.")
-            self.con.execute("drop table if exists pictures")
-            self.con.execute("drop index if exists idx_path")
-            self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks BLOB)")
-            self.con.execute("create index idx_path on pictures (path)")
-
-        self.con = sqlite.connect(self.dbname, isolation_level=None)
         try:
-            self.con.execute("select path, mtime, blocks from pictures where 1=2")
-        except sqlite.OperationalError:  # new db
-            create_tables()
+            self.con = sqlite.connect(self.dbname, isolation_level=None)
+            self._check_upgrade()
         except sqlite.DatabaseError as e:  # corrupted db
             if second_try:
                 raise  # Something really strange is happening
@@ -93,6 +92,25 @@ class SqliteCache:
             os.remove(self.dbname)
             self._create_con(second_try=True)
 
+    def _check_upgrade(self) -> None:
+        with self.con as conn:
+            has_schema = conn.execute(
+                "SELECT NAME FROM sqlite_master WHERE type='table' AND name='schema_version'"
+            ).fetchall()
+            version = None
+            if has_schema:
+                version = conn.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()[0]
+            else:
+                conn.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)")
+            if version != self.schema_version:
+                conn.execute(self.drop_table_query)
+                conn.execute(
+                    "INSERT OR REPLACE INTO schema_version VALUES (:version, :description)",
+                    {"version": self.schema_version, "description": self.schema_version_description},
+                )
+                conn.execute(self.create_table_query)
+                conn.execute(self.create_index_query)
+
     def clear(self):
         self.close()
         if self.dbname != ":memory:":
@@ -129,12 +147,12 @@ class SqliteCache:
         the db.
         """
         todelete = []
-        sql = "select rowid, path, mtime from pictures"
+        sql = "select rowid, path, mtime_ns from pictures"
        cur = self.con.execute(sql)
-        for rowid, path_str, mtime in cur:
-            if mtime and op.exists(path_str):
+        for rowid, path_str, mtime_ns in cur:
+            if mtime_ns and op.exists(path_str):
                 picture_mtime = os.stat(path_str).st_mtime
-                if int(picture_mtime) <= mtime:
+                if int(picture_mtime) <= mtime_ns:
                     # not outdated
                     continue
                 todelete.append(rowid)
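The migration above is deliberately destructive: when the stored version differs from SqliteCache.schema_version, the pictures table is dropped and recreated rather than converted, and the version row is upserted into a small schema_version bookkeeping table. A minimal standalone sketch of the same check-and-rebuild pattern (connection handling simplified, constants inlined for illustration):

    import sqlite3

    SCHEMA_VERSION = 1  # mirrors SqliteCache.schema_version

    def check_upgrade(con: sqlite3.Connection) -> None:
        # look for the bookkeeping table; create it on first run
        has_schema = con.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='schema_version'"
        ).fetchall()
        version = None
        if has_schema:
            row = con.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()
            version = row[0] if row else None
        else:
            con.execute("CREATE TABLE schema_version (version INT PRIMARY KEY, description TEXT)")
        if version != SCHEMA_VERSION:
            # destructive migration: throw the cached rows away and start over
            con.execute("DROP TABLE IF EXISTS pictures")
            con.execute(
                "INSERT OR REPLACE INTO schema_version VALUES (?, ?)",
                (SCHEMA_VERSION, "Changed from string to bytes for blocks."),
            )
            con.execute("CREATE TABLE IF NOT EXISTS pictures(path TEXT, mtime_ns INTEGER, blocks BLOB)")
            con.execute("CREATE INDEX IF NOT EXISTS idx_path on pictures (path)")

    con = sqlite3.connect(":memory:")
    check_upgrade(con)  # first run: creates schema_version and the pictures table
    check_upgrade(con)  # later runs: version matches, nothing is dropped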