1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2026-01-22 14:41:39 +00:00

Squashed commit of the following:

commit 5eb515f666bfa1ff06c2e96bdc351a4b7456580e
Author: Andrew Senetar <arsenetar@gmail.com>
Date:   Sun Mar 27 22:19:39 2022 -0500

    Add fallback to md5 if xxhash not available

    Mainly here for the case when distributions have not packaged python3-xxhash.

commit 51b18d4c84
Author: Andrew Senetar <arsenetar@gmail.com>
Date:   Sat Mar 19 15:25:46 2022 -0500

    Switch file hashing to xxhash instead of md5

    - Improves performance significantly in some cases
    - Add xxhash to requirements.txt and sort requirements
    - Rename md5 based members to digest
    - Update all tests to use new member names and hashing methods
    - Update hash db code to upgrade schema

    NOTE: May consider supporting multiple hashing algorithms in the future.
This commit is contained in:
2022-03-27 22:27:13 -05:00
parent 86bf9b39d0
commit 9f40e4e786
10 changed files with 212 additions and 180 deletions

View File

@@ -11,12 +11,22 @@
# resulting needless complexity and memory usage. It's been a while since I wanted to do that fork,
# and I'm doing it now.
import hashlib
import os
try:
import xxhash
hasher = xxhash.xxh128
except ImportError:
import hashlib
hasher = hashlib.md5
from math import floor
import logging
import sqlite3
from threading import Lock
from typing import Any
from typing import Any, AnyStr, Union
from hscommon.path import Path
from hscommon.util import nonone, get_file_ext
@@ -40,7 +50,7 @@ NOT_SET = object()
# CPU.
CHUNK_SIZE = 1024 * 1024 # 1 MiB
# Minimum size below which partial hashes don't need to be computed
# Minimum size below which partial hashing is not used
MIN_FILE_SIZE = 3 * CHUNK_SIZE # 3MiB, because we take 3 samples
@@ -83,9 +93,11 @@ class OperationError(FSError):
class FilesDB:
schema_version = 1
schema_version_description = "Changed from md5 to xxhash if available."
create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
drop_table_query = "DROP TABLE files;"
create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, digest BLOB, digest_partial BLOB, digest_samples BLOB)"
drop_table_query = "DROP TABLE IF EXISTS files;"
select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
insert_query = """
INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
@@ -97,24 +109,37 @@ class FilesDB:
self.cur = None
self.lock = None
def connect(self, path):
# type: (str, ) -> None
def connect(self, path: Union[AnyStr, os.PathLike]) -> None:
self.conn = sqlite3.connect(path, check_same_thread=False)
self.cur = self.conn.cursor()
self.cur.execute(self.create_table_query)
self.lock = Lock()
self._check_upgrade()
def clear(self):
# type: () -> None
def _check_upgrade(self) -> None:
with self.lock:
has_schema = self.cur.execute(
"SELECT NAME FROM sqlite_master WHERE type='table' AND name='schema_version'"
).fetchall()
version = None
if has_schema:
version = self.cur.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()[0]
else:
self.cur.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)")
if version != self.schema_version:
self.cur.execute(self.drop_table_query)
self.cur.execute(
"INSERT OR REPLACE INTO schema_version VALUES (:version, :description)",
{"version": self.schema_version, "description": self.schema_version_description},
)
self.cur.execute(self.create_table_query)
self.conn.commit()
def clear(self) -> None:
with self.lock:
self.cur.execute(self.drop_table_query)
self.cur.execute(self.create_table_query)
def get(self, path, key):
# type: (Path, str) -> bytes
def get(self, path: Path, key: str) -> Union[bytes, None]:
stat = path.stat()
size = stat.st_size
mtime_ns = stat.st_mtime_ns
@@ -128,9 +153,7 @@ class FilesDB:
return None
def put(self, path, key, value):
# type: (Path, str, Any) -> None
def put(self, path: Path, key: str, value: Any) -> None:
stat = path.stat()
size = stat.st_size
mtime_ns = stat.st_mtime_ns
@@ -141,15 +164,11 @@ class FilesDB:
{"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value},
)
def commit(self):
# type: () -> None
def commit(self) -> None:
with self.lock:
self.conn.commit()
def close(self):
# type: () -> None
def close(self) -> None:
with self.lock:
self.cur.close()
self.conn.close()
@@ -161,7 +180,7 @@ filesdb = FilesDB() # Singleton
class File:
"""Represents a file and holds metadata to be used for scanning."""
INITIAL_INFO = {"size": 0, "mtime": 0, "md5": b"", "md5partial": b"", "md5samples": b""}
INITIAL_INFO = {"size": 0, "mtime": 0, "digest": b"", "digest_partial": b"", "digest_samples": b""}
# Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
# files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
# even greater when we take into account read attributes (70%!). Yeah, it's worth it.
@@ -187,32 +206,51 @@ class File:
result = self.INITIAL_INFO[attrname]
return result
def _calc_md5(self):
def _calc_digest(self):
# type: () -> bytes
with self.path.open("rb") as fp:
md5 = hashlib.md5()
file_hash = hasher()
# The goal here is to not run out of memory on really big files. However, the chunk
# size has to be large enough so that the python loop isn't too costly in terms of
# CPU.
CHUNK_SIZE = 1024 * 1024 # 1 mb
filedata = fp.read(CHUNK_SIZE)
while filedata:
md5.update(filedata)
file_hash.update(filedata)
filedata = fp.read(CHUNK_SIZE)
return md5.digest()
return file_hash.digest()
def _calc_md5partial(self):
def _calc_digest_partial(self):
# type: () -> bytes
# This offset is where we should start reading the file to get a partial md5
# This offset is where we should start reading the file to get a partial hash
# For audio file, it should be where audio data starts
offset, size = (0x4000, 0x4000)
with self.path.open("rb") as fp:
fp.seek(offset)
partialdata = fp.read(size)
return hashlib.md5(partialdata).digest()
partial_data = fp.read(size)
return hasher(partial_data).digest()
def _calc_digest_samples(self) -> bytes:
size = self.size
with self.path.open("rb") as fp:
# Chunk at 25% of the file
fp.seek(floor(size * 25 / 100), 0)
file_data = fp.read(CHUNK_SIZE)
file_hash = hasher(file_data)
# Chunk at 60% of the file
fp.seek(floor(size * 60 / 100), 0)
file_data = fp.read(CHUNK_SIZE)
file_hash.update(file_data)
# Last chunk of the file
fp.seek(-CHUNK_SIZE, 2)
file_data = fp.read(CHUNK_SIZE)
file_hash.update(file_data)
return file_hash.digest()
def _read_info(self, field):
# print(f"_read_info({field}) for {self}")
@@ -220,48 +258,35 @@ class File:
stats = self.path.stat()
self.size = nonone(stats.st_size, 0)
self.mtime = nonone(stats.st_mtime, 0)
elif field == "md5partial":
elif field == "digest_partial":
try:
self.md5partial = filesdb.get(self.path, "md5partial")
if self.md5partial is None:
self.md5partial = self._calc_md5partial()
filesdb.put(self.path, "md5partial", self.md5partial)
self.digest_partial = filesdb.get(self.path, "digest_partial")
if self.digest_partial is None:
self.digest_partial = self._calc_digest_partial()
filesdb.put(self.path, "digest_partial", self.digest_partial)
except Exception as e:
logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
elif field == "md5":
logging.warning("Couldn't get digest_partial for %s: %s", self.path, e)
elif field == "digest":
try:
self.md5 = filesdb.get(self.path, "md5")
if self.md5 is None:
self.md5 = self._calc_md5()
filesdb.put(self.path, "md5", self.md5)
self.digest = filesdb.get(self.path, "digest")
if self.digest is None:
self.digest = self._calc_digest()
filesdb.put(self.path, "digest", self.digest)
except Exception as e:
logging.warning("Couldn't get md5 for %s: %s", self.path, e)
elif field == "md5samples":
logging.warning("Couldn't get digest for %s: %s", self.path, e)
elif field == "digest_samples":
size = self.size
# Might as well hash such small files entirely.
if size <= MIN_FILE_SIZE:
setattr(self, field, self.digest)
return
try:
with self.path.open("rb") as fp:
size = self.size
# Might as well hash such small files entirely.
if size <= MIN_FILE_SIZE:
setattr(self, field, self.md5)
return
# Chunk at 25% of the file
fp.seek(floor(size * 25 / 100), 0)
filedata = fp.read(CHUNK_SIZE)
md5 = hashlib.md5(filedata)
# Chunk at 60% of the file
fp.seek(floor(size * 60 / 100), 0)
filedata = fp.read(CHUNK_SIZE)
md5.update(filedata)
# Last chunk of the file
fp.seek(-CHUNK_SIZE, 2)
filedata = fp.read(CHUNK_SIZE)
md5.update(filedata)
setattr(self, field, md5.digest())
self.digest_samples = filesdb.get(self.path, "digest_samples")
if self.digest_samples is None:
self.digest_samples = self._calc_digest_samples()
filesdb.put(self.path, "digest_samples", self.digest_samples)
except Exception as e:
logging.error(f"Error computing md5samples: {e}")
logging.warning(f"Couldn't get digest_samples for {self.path}: {e}")
def _read_all_info(self, attrnames=None):
"""Cache all possible info.
@@ -314,7 +339,7 @@ class File:
class Folder(File):
"""A wrapper around a folder path.
It has the size/md5 info of a File, but its value is the sum of its subitems.
It has the size/digest info of a File, but its value is the sum of its subitems.
"""
__slots__ = File.__slots__ + ("_subfolders",)
@@ -335,19 +360,18 @@ class Folder(File):
self.size = size
stats = self.path.stat()
self.mtime = nonone(stats.st_mtime, 0)
elif field in {"md5", "md5partial", "md5samples"}:
elif field in {"digest", "digest_partial", "digest_samples"}:
# What's sensitive here is that we must make sure that subfiles'
# md5 are always added up in the same order, but we also want a
# different md5 if a file gets moved in a different subdirectory.
# digest are always added up in the same order, but we also want a
# different digest if a file gets moved in a different subdirectory.
def get_dir_md5_concat():
def get_dir_digest_concat():
items = self._all_items()
items.sort(key=lambda f: f.path)
md5s = [getattr(f, field) for f in items]
return b"".join(md5s)
digests = [getattr(f, field) for f in items]
return b"".join(digests)
md5 = hashlib.md5(get_dir_md5_concat())
digest = md5.digest()
digest = hasher(get_dir_digest_concat()).digest()
setattr(self, field, digest)
@property