
Update hash db code to upgrade schema

Author: Andrew Senetar
Date: 2022-03-23 23:44:24 -05:00
Parent: e16df489bd
Commit: b6fe312693
Signed by: arsenetar (GPG Key ID: C63300DCE48AB2F1)
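
Before the diff, a minimal usage sketch of the upgrade path this commit adds. It assumes the module-level filesdb singleton that core/fs.py already exposes and an arbitrary throwaway cache path; it is illustrative only and not part of the commit.

from core.fs import filesdb

# Illustrative only (assumed throwaway path). connect() now runs _check_upgrade():
# if the cache file has no schema_version table (e.g. it was written by an older
# md5-based build) or stores a version different from FilesDB.schema_version,
# the files table is dropped and recreated with the new digest, digest_partial
# and digest_samples columns, and the current version is recorded.
filesdb.connect("/tmp/dupeguru_hash_cache.db")
filesdb.commit()
filesdb.close()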


@@ -11,12 +11,13 @@
 # resulting needless complexity and memory usage. It's been a while since I wanted to do that fork,
 # and I'm doing it now.

+import os
 import xxhash
 from math import floor
 import logging
 import sqlite3
 from threading import Lock
-from typing import Any
+from typing import Any, AnyStr, Union

 from hscommon.path import Path
 from hscommon.util import nonone, get_file_ext
@@ -83,9 +84,11 @@ class OperationError(FSError):


 class FilesDB:
+    schema_version = 1
+    schema_version_description = "Changed from md5 to xxhash"

-    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
-    drop_table_query = "DROP TABLE files;"
+    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, digest BLOB, digest_partial BLOB, digest_samples BLOB)"
+    drop_table_query = "DROP TABLE IF EXISTS files;"
     select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
     insert_query = """
         INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
@@ -97,24 +100,37 @@ class FilesDB:
         self.cur = None
         self.lock = None

-    def connect(self, path):
-        # type: (str, ) -> None
-
+    def connect(self, path: Union[AnyStr, os.PathLike]) -> None:
         self.conn = sqlite3.connect(path, check_same_thread=False)
         self.cur = self.conn.cursor()
-        self.cur.execute(self.create_table_query)
         self.lock = Lock()
+        self._check_upgrade()

-    def clear(self):
-        # type: () -> None
-
+    def _check_upgrade(self) -> None:
+        with self.lock:
+            has_schema = self.cur.execute(
+                "SELECT NAME FROM sqlite_master WHERE type='table' AND name='schema_version'"
+            ).fetchall()
+            version = None
+            if has_schema:
+                version = self.cur.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()[0]
+            else:
+                self.cur.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)")
+            if version != self.schema_version:
+                self.cur.execute(self.drop_table_query)
+                self.cur.execute(
+                    "INSERT OR REPLACE INTO schema_version VALUES (:version, :description)",
+                    {"version": self.schema_version, "description": self.schema_version_description},
+                )
+                self.cur.execute(self.create_table_query)
+            self.conn.commit()
+
+    def clear(self) -> None:
         with self.lock:
             self.cur.execute(self.drop_table_query)
             self.cur.execute(self.create_table_query)

-    def get(self, path, key):
-        # type: (Path, str) -> bytes
-
+    def get(self, path: Path, key: str) -> Union[bytes, None]:
         stat = path.stat()
         size = stat.st_size
         mtime_ns = stat.st_mtime_ns
@@ -128,9 +144,7 @@ class FilesDB:

         return None

-    def put(self, path, key, value):
-        # type: (Path, str, Any) -> None
-
+    def put(self, path: Path, key: str, value: Any) -> None:
         stat = path.stat()
         size = stat.st_size
         mtime_ns = stat.st_mtime_ns
@@ -141,15 +155,11 @@ class FilesDB:
                 {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value},
             )

-    def commit(self):
-        # type: () -> None
-
+    def commit(self) -> None:
         with self.lock:
             self.conn.commit()

-    def close(self):
-        # type: () -> None
-
+    def close(self) -> None:
         with self.lock:
             self.cur.close()
             self.conn.close()
@@ -214,35 +224,8 @@ class File:
             partial_data = fp.read(size)
             return xxhash.xxh128_digest(partial_data)

-    def _read_info(self, field):
-        # print(f"_read_info({field}) for {self}")
-        if field in ("size", "mtime"):
-            stats = self.path.stat()
-            self.size = nonone(stats.st_size, 0)
-            self.mtime = nonone(stats.st_mtime, 0)
-        elif field == "digest_partial":
-            try:
-                self.digest_partial = filesdb.get(self.path, "md5partial")
-                if self.digest_partial is None:
-                    self.digest_partial = self._calc_digest_partial()
-                    filesdb.put(self.path, "md5partial", self.digest_partial)
-            except Exception as e:
-                logging.warning("Couldn't get digest_partial for %s: %s", self.path, e)
-        elif field == "digest":
-            try:
-                self.digest = filesdb.get(self.path, "md5")
-                if self.digest is None:
-                    self.digest = self._calc_digest()
-                    filesdb.put(self.path, "md5", self.digest)
-            except Exception as e:
-                logging.warning("Couldn't get digest for %s: %s", self.path, e)
-        elif field == "digest_samples":
+    def _calc_digest_samples(self) -> bytes:
         size = self.size
-            # Might as well hash such small files entirely.
-            if size <= MIN_FILE_SIZE:
-                setattr(self, field, self.digest)
-                return
-            try:
         with self.path.open("rb") as fp:
             # Chunk at 25% of the file
             fp.seek(floor(size * 25 / 100), 0)
@@ -258,9 +241,43 @@ class File:
             fp.seek(-CHUNK_SIZE, 2)
             file_data = fp.read(CHUNK_SIZE)
             file_hash.update(file_data)
-                    setattr(self, field, file_hash.digest())
-            except Exception as e:
-                logging.error(f"Error computing digest_samples: {e}")
+            return file_hash.digest()
+
+    def _read_info(self, field):
+        # print(f"_read_info({field}) for {self}")
+        if field in ("size", "mtime"):
+            stats = self.path.stat()
+            self.size = nonone(stats.st_size, 0)
+            self.mtime = nonone(stats.st_mtime, 0)
+        elif field == "digest_partial":
+            try:
+                self.digest_partial = filesdb.get(self.path, "digest_partial")
+                if self.digest_partial is None:
+                    self.digest_partial = self._calc_digest_partial()
+                    filesdb.put(self.path, "digest_partial", self.digest_partial)
+            except Exception as e:
+                logging.warning("Couldn't get digest_partial for %s: %s", self.path, e)
+        elif field == "digest":
+            try:
+                self.digest = filesdb.get(self.path, "digest")
+                if self.digest is None:
+                    self.digest = self._calc_digest()
+                    filesdb.put(self.path, "digest", self.digest)
+            except Exception as e:
+                logging.warning("Couldn't get digest for %s: %s", self.path, e)
+        elif field == "digest_samples":
+            size = self.size
+            # Might as well hash such small files entirely.
+            if size <= MIN_FILE_SIZE:
+                setattr(self, field, self.digest)
+                return
+            try:
+                self.digest_samples = filesdb.get(self.path, "digest_samples")
+                if self.digest_samples is None:
+                    self.digest_samples = self._calc_digest_samples()
+                    filesdb.put(self.path, "digest_samples", self.digest_samples)
+            except Exception as e:
+                logging.warning(f"Couldn't get digest_samples for {self.path}: {e}")

     def _read_all_info(self, attrnames=None):
         """Cache all possible info.