1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2026-01-22 14:41:39 +00:00

Squashed commit of the following:

commit 5eb515f666bfa1ff06c2e96bdc351a4b7456580e
Author: Andrew Senetar <arsenetar@gmail.com>
Date:   Sun Mar 27 22:19:39 2022 -0500

    Add fallback to md5 if xxhash not available

    Mainly here for the case when distributions have not packaged python3-xxhash.

commit 51b18d4c84
Author: Andrew Senetar <arsenetar@gmail.com>
Date:   Sat Mar 19 15:25:46 2022 -0500

    Switch file hashing to xxhash instead of md5

    - Improves performance significantly in some cases
    - Add xxhash to requirements.txt and sort requirements
    - Rename md5 based members to digest
    - Update all tests to use new member names and hashing methods
    - Update hash db code to upgrade schema

    NOTE: May consider supporting multiple hashing algorithms in the future.
This commit is contained in:
2022-03-27 22:27:13 -05:00
parent 86bf9b39d0
commit 9f40e4e786
10 changed files with 212 additions and 180 deletions

View File

@@ -283,7 +283,7 @@ def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
"""Returns a list of :class:`Match` within ``files`` if their contents is the same.
:param bigsize: The size in bytes over which we consider files big enough to
justify taking samples of md5. If 0, compute md5 as usual.
justify taking samples of the file for hashing. If 0, compute digest as usual.
:param j: A :ref:`job progress instance <jobs>`.
"""
size2files = defaultdict(set)
@@ -300,15 +300,15 @@ def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
if first.is_ref and second.is_ref:
continue # Don't spend time comparing two ref pics together.
if first.size == 0 and second.size == 0:
# skip md5 for zero length files
# skip hashing for zero length files
result.append(Match(first, second, 100))
continue
if first.md5partial == second.md5partial:
if first.digest_partial == second.digest_partial:
if bigsize > 0 and first.size > bigsize:
if first.md5samples == second.md5samples:
if first.digest_samples == second.digest_samples:
result.append(Match(first, second, 100))
else:
if first.md5 == second.md5:
if first.digest == second.digest:
result.append(Match(first, second, 100))
group_count += 1
j.add_progress(desc=PROGRESS_MESSAGE % (len(result), group_count))