Mirror of https://github.com/arsenetar/dupeguru.git

Add unit tests for hash sample optimization

* Instead of keeping the md5 samples separate, merge them into a single hash computed from the selected chunks (sketched just below).
* We don't need to keep a boolean to know whether the user chose to optimize; we can simply compare the threshold value, since 0 means the optimization is currently disabled (see the second sketch below).
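
A minimal sketch of the first point, with the chunk positions taken from the diff below; the CHUNK_SIZE value and the standalone helper name are only illustrative, the real code lives in File._read_info:

import hashlib
from math import floor

CHUNK_SIZE = 1024 * 1024  # assumed sample size per chunk

def sampled_md5(path, size):
    # Merge three CHUNK_SIZE samples (at 25%, at 60%, and at the end of the file)
    # into a single digest instead of keeping a tuple of separate hex strings.
    # Assumes the file is at least CHUNK_SIZE bytes, as the original code does.
    md5 = hashlib.md5()
    with open(path, "rb") as fp:
        fp.seek(floor(size * 25 / 100), 0)  # chunk at 25% of the file
        md5.update(fp.read(CHUNK_SIZE))
        fp.seek(floor(size * 60 / 100), 0)  # chunk at 60% of the file
        md5.update(fp.read(CHUNK_SIZE))
        fp.seek(-CHUNK_SIZE, 2)  # last chunk of the file
        md5.update(fp.read(CHUNK_SIZE))
    return md5.digest()  # bytes, matching the new b"" default for md5samples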
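For the second point, the threshold can be checked directly; big_file_size_threshold is an assumed option name used only for illustration here:

def pick_hash(file, big_file_size_threshold):
    # A threshold of 0 disables the optimization, so a plain comparison is
    # enough and no extra boolean flag is needed.
    if big_file_size_threshold and file.size > big_file_size_threshold:
        return file.md5samples  # sampled-chunk digest for big files
    return file.md5  # full-content digest otherwise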
glubsy
2021-06-21 22:44:05 +02:00
parent e07dfd5955
commit 277bc3fbb8
7 changed files with 152 additions and 43 deletions

core/fs.py

@@ -83,9 +83,9 @@ class File:
     INITIAL_INFO = {
         "size": 0,
         "mtime": 0,
-        "md5": "",
-        "md5partial": "",
-        "md5samples": []
+        "md5": b"",
+        "md5partial": b"",
+        "md5samples": b""
     }
     # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
     # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
@@ -104,7 +104,6 @@ class File:
         result = object.__getattribute__(self, attrname)
         if result is NOT_SET:
             try:
-                print(f"Try get attr for {self} {attrname}")
                 self._read_info(attrname)
             except Exception as e:
                 logging.warning(
@@ -121,12 +120,12 @@ class File:
         return (0x4000, 0x4000)  # 16Kb

     def _read_info(self, field):
+        # print(f"_read_info({field}) for {self}")
         if field in ("size", "mtime"):
             stats = self.path.stat()
             self.size = nonone(stats.st_size, 0)
             self.mtime = nonone(stats.st_mtime, 0)
         elif field == "md5partial":
-            print(f"_read_info md5partial {self}")
             try:
                 with self.path.open("rb") as fp:
                     offset, size = self._get_md5partial_offset_and_size()
@@ -146,27 +145,24 @@ class File:
             except Exception:
                 pass
         elif field == "md5samples":
-            print(f"computing md5chunks for {self}, caller: {inspect.stack()[1][3]}")
             try:
                 with self.path.open("rb") as fp:
-                    md5chunks = []
+                    size = self.size
                     # Chunk at 25% of the file
-                    fp.seek(floor(self.size * 25 / 100), 0)
+                    fp.seek(floor(size * 25 / 100), 0)
                     filedata = fp.read(CHUNK_SIZE)
-                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+                    md5 = hashlib.md5(filedata)

                     # Chunk at 60% of the file
-                    fp.seek(floor(self.size * 60 / 100), 0)
+                    fp.seek(floor(size * 60 / 100), 0)
                     filedata = fp.read(CHUNK_SIZE)
-                    md5chunks.append(hashlib.md5(filedata).hexdigest())
+                    md5.update(filedata)

                     # Last chunk of the file
                     fp.seek(-CHUNK_SIZE, 2)
                     filedata = fp.read(CHUNK_SIZE)
-                    md5chunks.append(hashlib.md5(filedata).hexdigest())
-                    # Use setattr to avoid circular (de)reference
-                    setattr(self, field, tuple(md5chunks))
+                    md5.update(filedata)
+                    setattr(self, field, md5.digest())
             except Exception as e:
                 logging.error(f"Error computing md5samples: {e}")
                 pass
@@ -239,16 +235,16 @@ class Folder(File):
         return folders + files

     def _read_info(self, field):
+        # print(f"_read_info({field}) for Folder {self}")
         if field in {"size", "mtime"}:
             size = sum((f.size for f in self._all_items()), 0)
             self.size = size
             stats = self.path.stat()
             self.mtime = nonone(stats.st_mtime, 0)
-        elif field in {"md5", "md5partial"}:
+        elif field in {"md5", "md5partial", "md5samples"}:
             # What's sensitive here is that we must make sure that subfiles'
             # md5 are always added up in the same order, but we also want a
             # different md5 if a file gets moved in a different subdirectory.
-            print(f"Getting {field} of folder {self}...")

             def get_dir_md5_concat():
                 items = self._all_items()