Serialize/deserialize colors to/from bytes instead of strings

It's a tiny bit faster and saves a bit of memory compared to the previous hex-string encoding.
This commit is contained in:
Dobatymo 2022-09-27 17:34:57 +08:00
parent 1f1dfa88dc
commit f1153c85c0
9 changed files with 47 additions and 78 deletions

View File

@ -4,24 +4,13 @@
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from core.pe._cache import string_to_colors # noqa
from core.pe._cache import bytes_to_colors # noqa
def colors_to_string(colors):
"""Transform the 3 sized tuples 'colors' into a hex string.
def colors_to_bytes(colors):
"""Transform the 3 sized tuples 'colors' into a bytes string.
[(0,100,255)] --> 0064ff
[(1,2,3),(4,5,6)] --> 010203040506
[(0,100,255)] --> b'\x00d\xff'
[(1,2,3),(4,5,6)] --> b'\x01\x02\x03\x04\x05\x06'
"""
return "".join("{:02x}{:02x}{:02x}".format(r, g, b) for r, g, b in colors)
# This function is an important bottleneck of dupeGuru PE. It has been converted to C.
# def string_to_colors(s):
# """Transform the string 's' in a list of 3 sized tuples.
# """
# result = []
# for i in xrange(0, len(s), 6):
# number = int(s[i:i+6], 16)
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
# return result
return b"".join(map(bytes, colors))

View File

@ -2,5 +2,5 @@ from typing import Union, Tuple, List
_block = Tuple[int, int, int]
def colors_to_string(colors: List[_block]) -> str: ... # noqa: E302
def string_to_colors(s: str) -> Union[List[_block], None]: ...
def colors_to_bytes(colors: List[_block]) -> bytes: ... # noqa: E302
def bytes_to_colors(s: bytes) -> Union[List[_block], None]: ...

View File

@ -10,7 +10,7 @@ import shelve
import tempfile
from collections import namedtuple
from core.pe.cache import string_to_colors, colors_to_string
from core.pe.cache import bytes_to_colors, colors_to_bytes
def wrap_path(path):
@ -57,7 +57,7 @@ class ShelveCache:
skey = self.shelve[wrap_id(key)]
else:
skey = wrap_path(key)
return string_to_colors(self.shelve[skey].blocks)
return bytes_to_colors(self.shelve[skey].blocks)
def __iter__(self):
return (unwrap_path(k) for k in self.shelve if k.startswith("path:"))
@ -66,7 +66,7 @@ class ShelveCache:
return sum(1 for k in self.shelve if k.startswith("path:"))
def __setitem__(self, path_str, blocks):
blocks = colors_to_string(blocks)
blocks = colors_to_bytes(blocks)
if op.exists(path_str):
mtime = int(os.stat(path_str).st_mtime)
else:
@ -114,7 +114,7 @@ class ShelveCache:
skey = self.shelve[wrap_id(rowid)]
except KeyError:
continue
yield (rowid, string_to_colors(self.shelve[skey].blocks))
yield (rowid, bytes_to_colors(self.shelve[skey].blocks))
def purge_outdated(self):
"""Go through the cache and purge outdated records.

View File

@ -9,7 +9,7 @@ import os.path as op
import logging
import sqlite3 as sqlite
from core.pe.cache import string_to_colors, colors_to_string
from core.pe.cache import bytes_to_colors, colors_to_bytes
class SqliteCache:
@ -40,7 +40,7 @@ class SqliteCache:
sql = "select blocks from pictures where path = ?"
result = self.con.execute(sql, [key]).fetchone()
if result:
result = string_to_colors(result[0])
result = bytes_to_colors(result[0])
return result
else:
raise KeyError(key)
@ -56,7 +56,7 @@ class SqliteCache:
return result[0][0]
def __setitem__(self, path_str, blocks):
blocks = colors_to_string(blocks)
blocks = colors_to_bytes(blocks)
if op.exists(path_str):
mtime = int(os.stat(path_str).st_mtime)
else:
@ -77,7 +77,7 @@ class SqliteCache:
logging.debug("Creating picture cache tables.")
self.con.execute("drop table if exists pictures")
self.con.execute("drop index if exists idx_path")
self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)")
self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks BLOB)")
self.con.execute("create index idx_path on pictures (path)")
self.con = sqlite.connect(self.dbname, isolation_level=None)
@ -120,7 +120,7 @@ class SqliteCache:
def get_multiple(self, rowids):
sql = "select rowid, blocks from pictures where rowid in (%s)" % ",".join(map(str, rowids))
cur = self.con.execute(sql)
return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
return ((rowid, bytes_to_colors(blocks)) for rowid, blocks in cur)
def purge_outdated(self):
"""Go through the cache and purge outdated records.

View File

@ -27,7 +27,7 @@ from core.pe.block import avgdiff, DifferentBlockCountError, NoBlocksError
# to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage
# is that instead of reading blocks from disk number_of_files**2 times, we read it
# number_of_files*number_of_chunks times.
# Determining the right chunk size is tricky, bceause if it's too big, too many blocks will be in
# Determining the right chunk size is tricky, because if it's too big, too many blocks will be in
# memory at the same time and we might end up with memory trashing, which is awfully slow. So,
# because our *real* bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't
# starved by Disk IOs.

View File

@ -9,51 +9,31 @@
#include "common.h"
/* I know that there strtol out there, but it requires a pointer to
* a char, which would in turn require me to buffer my chars around,
* making the whole process slower.
*/
static long
xchar_to_long(char c)
{
if ((c >= 48) && (c <= 57)) { /* 0-9 */
return c - 48;
}
else if ((c >= 65) && (c <= 70)) { /* A-F */
return c - 55;
}
else if ((c >= 97) && (c <= 102)) { /* a-f */
return c - 87;
}
return 0;
}
static PyObject*
cache_string_to_colors(PyObject *self, PyObject *args)
cache_bytes_to_colors(PyObject *self, PyObject *args)
{
char *s;
Py_ssize_t char_count, color_count, i;
char *y;
Py_ssize_t char_count, i, color_count;
PyObject *result;
if (!PyArg_ParseTuple(args, "s#", &s, &char_count)) {
unsigned long r, g, b;
Py_ssize_t ci;
PyObject *color_tuple;
if (!PyArg_ParseTuple(args, "y#", &y, &char_count)) {
return NULL;
}
color_count = (char_count / 6);
color_count = char_count / 3;
result = PyList_New(color_count);
if (result == NULL) {
return NULL;
}
for (i=0; i<color_count; i++) {
long r, g, b;
Py_ssize_t ci;
PyObject *color_tuple;
ci = i * 6;
r = (xchar_to_long(s[ci]) << 4) + xchar_to_long(s[ci+1]);
g = (xchar_to_long(s[ci+2]) << 4) + xchar_to_long(s[ci+3]);
b = (xchar_to_long(s[ci+4]) << 4) + xchar_to_long(s[ci+5]);
ci = i * 3;
r = (unsigned char) y[ci];
g = (unsigned char) y[ci+1];
b = (unsigned char) y[ci+2];
color_tuple = inttuple(3, r, g, b);
if (color_tuple == NULL) {
@ -67,8 +47,7 @@ cache_string_to_colors(PyObject *self, PyObject *args)
}
static PyMethodDef CacheMethods[] = {
{"string_to_colors", cache_string_to_colors, METH_VARARGS,
"Transform the string 's' in a list of 3 sized tuples."},
{"bytes_to_colors", cache_bytes_to_colors, METH_VARARGS, "Transform the bytes 's' into a list of 3 sized tuples."},
{NULL, NULL, 0, NULL} /* Sentinel */
};

View File

@ -32,7 +32,7 @@ PyObject* inttuple(int n, ...)
result = PyTuple_New(n);
for (i=0; i<n; i++) {
pnumber = PyLong_FromLong(va_arg(numbers, long));
pnumber = PyLong_FromUnsignedLong(va_arg(numbers, long));
if (pnumber == NULL) {
Py_DECREF(result);
return NULL;

View File

@ -17,4 +17,4 @@ int min(int a, int b);
#endif
/* Create a tuple out of an array of integers. */
PyObject* inttuple(int n, ...);
PyObject* inttuple(int n, ...);

View File

@ -10,7 +10,7 @@ from pytest import raises, skip
from hscommon.testutil import eq_
try:
from core.pe.cache import colors_to_string, string_to_colors
from core.pe.cache import colors_to_bytes, bytes_to_colors
from core.pe.cache_sqlite import SqliteCache
from core.pe.cache_shelve import ShelveCache
except ImportError:
@ -19,32 +19,33 @@ except ImportError:
class TestCaseColorsToString:
def test_no_color(self):
eq_("", colors_to_string([]))
eq_(b"", colors_to_bytes([]))
def test_single_color(self):
eq_("000000", colors_to_string([(0, 0, 0)]))
eq_("010101", colors_to_string([(1, 1, 1)]))
eq_("0a141e", colors_to_string([(10, 20, 30)]))
eq_(b"\x00\x00\x00", colors_to_bytes([(0, 0, 0)]))
eq_(b"\x01\x01\x01", colors_to_bytes([(1, 1, 1)]))
eq_(b"\x0a\x14\x1e", colors_to_bytes([(10, 20, 30)]))
def test_two_colors(self):
eq_("000102030405", colors_to_string([(0, 1, 2), (3, 4, 5)]))
eq_(b"\x00\x01\x02\x03\x04\x05", colors_to_bytes([(0, 1, 2), (3, 4, 5)]))
class TestCaseStringToColors:
def test_empty(self):
eq_([], string_to_colors(""))
eq_([], bytes_to_colors(b""))
def test_single_color(self):
eq_([(0, 0, 0)], string_to_colors("000000"))
eq_([(2, 3, 4)], string_to_colors("020304"))
eq_([(10, 20, 30)], string_to_colors("0a141e"))
eq_([(0, 0, 0)], bytes_to_colors(b"\x00\x00\x00"))
eq_([(2, 3, 4)], bytes_to_colors(b"\x02\x03\x04"))
eq_([(10, 20, 30)], bytes_to_colors(b"\x0a\x14\x1e"))
def test_two_colors(self):
eq_([(10, 20, 30), (40, 50, 60)], string_to_colors("0a141e28323c"))
eq_([(10, 20, 30), (40, 50, 60)], bytes_to_colors(b"\x0a\x14\x1e\x28\x32\x3c"))
def test_incomplete_color(self):
# don't return anything if it's not a complete color
eq_([], string_to_colors("102"))
eq_([], bytes_to_colors(b"\x01"))
eq_([(1, 2, 3)], bytes_to_colors(b"\x01\x02\x03\x04"))
class BaseTestCaseCache: