mirror of
https://github.com/arsenetar/dupeguru.git
synced 2025-03-10 05:34:36 +00:00
Avoid partially hashing small files
Computing 3 hash samples for files no larger than 3 MiB (3 * CHUNK_SIZE) is inefficient, because the spans of later samples would overlap earlier ones. For such small files we can simply return the hash of the entire file instead.
This commit is contained in:
parent
718ca5b313
commit
7b764f183e
@ -147,6 +147,11 @@ class File:
|
|||||||
try:
|
try:
|
||||||
with self.path.open("rb") as fp:
|
with self.path.open("rb") as fp:
|
||||||
size = self.size
|
size = self.size
|
||||||
|
# Might as well hash such small files entirely.
|
||||||
|
if size <= CHUNK_SIZE * 3: # 3MiB, because 3 samples
|
||||||
|
setattr(self, field, self.md5)
|
||||||
|
return
|
||||||
|
|
||||||
# Chunk at 25% of the file
|
# Chunk at 25% of the file
|
||||||
fp.seek(floor(size * 25 / 100), 0)
|
fp.seek(floor(size * 25 / 100), 0)
|
||||||
filedata = fp.read(CHUNK_SIZE)
|
filedata = fp.read(CHUNK_SIZE)
|
||||||
@ -219,7 +224,7 @@ class File:
|
|||||||
class Folder(File):
|
class Folder(File):
|
||||||
"""A wrapper around a folder path.
|
"""A wrapper around a folder path.
|
||||||
|
|
||||||
It has the size/md5 info of a File, but it's value are the sum of its subitems.
|
It has the size/md5 info of a File, but its value is the sum of its subitems.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__slots__ = File.__slots__ + ("_subfolders",)
|
__slots__ = File.__slots__ + ("_subfolders",)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user