match all orientations (#1127)

* match all orientations

* use rotation as option

---------

Co-authored-by: Andrew Senetar <arsenetar@gmail.com>
Co-authored-by: Luke <byunghun.hyun26@gmail.com>
This commit is contained in:
Bruno Cabral 2024-02-19 07:19:33 -08:00 committed by GitHub
parent 70d956b4f8
commit 85a4557525
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 78 additions and 35 deletions

View File

@ -304,12 +304,12 @@ def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
result.append(Match(first, second, 100)) result.append(Match(first, second, 100))
continue continue
# if digests are the same (and not None) then files match # if digests are the same (and not None) then files match
if first.digest_partial == second.digest_partial and first.digest_partial is not None: if first.digest_partial is not None and first.digest_partial == second.digest_partial:
if bigsize > 0 and first.size > bigsize: if bigsize > 0 and first.size > bigsize:
if first.digest_samples == second.digest_samples and first.digest_samples is not None: if first.digest_samples is not None and first.digest_samples == second.digest_samples:
result.append(Match(first, second, 100)) result.append(Match(first, second, 100))
else: else:
if first.digest == second.digest and first.digest is not None: if first.digest is not None and first.digest == second.digest:
result.append(Match(first, second, 100)) result.append(Match(first, second, 100))
group_count += 1 group_count += 1
j.add_progress(desc=PROGRESS_MESSAGE % (len(result), group_count)) j.add_progress(desc=PROGRESS_MESSAGE % (len(result), group_count))

View File

@ -206,7 +206,7 @@ class File:
# Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
# files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
# even greater when we take into account read attributes (70%!). Yeah, it's worth it. # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
__slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys()) __slots__ = ("path", "unicode_path", "is_ref", "words") + tuple(INITIAL_INFO.keys())
def __init__(self, path): def __init__(self, path):
for attrname in self.INITIAL_INFO: for attrname in self.INITIAL_INFO:
@ -217,6 +217,8 @@ class File:
self.mtime = nonone(path.stat().st_mtime, 0) self.mtime = nonone(path.stat().st_mtime, 0)
else: else:
self.path = path self.path = path
if self.path:
self.unicode_path = str(self.path)
def __repr__(self): def __repr__(self):
return f"<{self.__class__.__name__} {str(self.path)}>" return f"<{self.__class__.__name__} {str(self.path)}>"

View File

@ -15,10 +15,10 @@ from core.pe.cache import bytes_to_colors, colors_to_bytes
class SqliteCache: class SqliteCache:
"""A class to cache picture blocks in a sqlite backend.""" """A class to cache picture blocks in a sqlite backend."""
schema_version = 1 schema_version = 2
schema_version_description = "Changed from string to bytes for blocks." schema_version_description = "Added blocks for all 8 orientations."
create_table_query = "CREATE TABLE IF NOT EXISTS pictures(path TEXT, mtime_ns INTEGER, blocks BLOB)" create_table_query = "CREATE TABLE IF NOT EXISTS pictures(path TEXT, mtime_ns INTEGER, blocks BLOB, blocks2 BLOB, blocks3 BLOB, blocks4 BLOB, blocks5 BLOB, blocks6 BLOB, blocks7 BLOB, blocks8 BLOB)"
create_index_query = "CREATE INDEX IF NOT EXISTS idx_path on pictures (path)" create_index_query = "CREATE INDEX IF NOT EXISTS idx_path on pictures (path)"
drop_table_query = "DROP TABLE IF EXISTS pictures" drop_table_query = "DROP TABLE IF EXISTS pictures"
drop_index_query = "DROP INDEX IF EXISTS idx_path" drop_index_query = "DROP INDEX IF EXISTS idx_path"
@ -43,12 +43,12 @@ class SqliteCache:
# Optimized # Optimized
def __getitem__(self, key): def __getitem__(self, key):
if isinstance(key, int): if isinstance(key, int):
sql = "select blocks from pictures where rowid = ?" sql = "select blocks, blocks2, blocks3, blocks4, blocks5, blocks6, blocks7, blocks8 from pictures where rowid = ?"
else: else:
sql = "select blocks from pictures where path = ?" sql = "select blocks, blocks2, blocks3, blocks4, blocks5, blocks6, blocks7, blocks8 from pictures where path = ?"
result = self.con.execute(sql, [key]).fetchone() blocks = self.con.execute(sql, [key]).fetchone()
if result: if blocks:
result = bytes_to_colors(result[0]) result = [bytes_to_colors(block) for block in blocks]
return result return result
else: else:
raise KeyError(key) raise KeyError(key)
@ -64,17 +64,17 @@ class SqliteCache:
return result[0][0] return result[0][0]
def __setitem__(self, path_str, blocks): def __setitem__(self, path_str, blocks):
blocks = colors_to_bytes(blocks) blocks = [colors_to_bytes(block) for block in blocks]
if op.exists(path_str): if op.exists(path_str):
mtime = int(os.stat(path_str).st_mtime) mtime = int(os.stat(path_str).st_mtime)
else: else:
mtime = 0 mtime = 0
if path_str in self: if path_str in self:
sql = "update pictures set blocks = ?, mtime_ns = ? where path = ?" sql = "update pictures set blocks = ?, blocks2 = ?, blocks3 = ?, blocks4 = ?, blocks5 = ?, blocks6 = ?, blocks7 = ?, blocks8 = ?, mtime_ns = ? where path = ?"
else: else:
sql = "insert into pictures(blocks,mtime_ns,path) values(?,?,?)" sql = "insert into pictures(blocks,blocks2,blocks3,blocks4,blocks5,blocks6,blocks7,blocks8,mtime_ns,path) values(?,?,?,?,?,?,?,?,?,?)"
try: try:
self.con.execute(sql, [blocks, mtime, path_str]) self.con.execute(sql, blocks + [mtime, path_str])
except sqlite.OperationalError: except sqlite.OperationalError:
logging.warning("Picture cache could not set value for key %r", path_str) logging.warning("Picture cache could not set value for key %r", path_str)
except sqlite.DatabaseError as e: except sqlite.DatabaseError as e:
@ -136,9 +136,9 @@ class SqliteCache:
raise ValueError(path) raise ValueError(path)
def get_multiple(self, rowids): def get_multiple(self, rowids):
sql = "select rowid, blocks from pictures where rowid in (%s)" % ",".join(map(str, rowids)) sql = "select rowid, blocks, blocks2, blocks3, blocks4, blocks5, blocks6, blocks7, blocks8 from pictures where rowid in (%s)" % ",".join(map(str, rowids))
cur = self.con.execute(sql) cur = self.con.execute(sql)
return ((rowid, bytes_to_colors(blocks)) for rowid, blocks in cur) return ((rowid, [bytes_to_colors(blocks), bytes_to_colors(blocks2), bytes_to_colors(blocks3), bytes_to_colors(blocks4), bytes_to_colors(blocks5), bytes_to_colors(blocks6), bytes_to_colors(blocks7), bytes_to_colors(blocks8)]) for rowid, blocks, blocks2, blocks3, blocks4, blocks5, blocks6, blocks7, blocks8 in cur)
def purge_outdated(self): def purge_outdated(self):
"""Go through the cache and purge outdated records. """Go through the cache and purge outdated records.

View File

@ -72,13 +72,12 @@ def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
# entry in iPhoto library. # entry in iPhoto library.
logging.warning("We have a picture with a null path here") logging.warning("We have a picture with a null path here")
continue continue
picture.unicode_path = str(picture.path)
logging.debug("Analyzing picture at %s", picture.unicode_path) logging.debug("Analyzing picture at %s", picture.unicode_path)
if with_dimensions: if with_dimensions:
picture.dimensions # pre-read dimensions picture.dimensions # pre-read dimensions
try: try:
if picture.unicode_path not in cache: if picture.unicode_path not in cache:
blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE) blocks = [picture.get_blocks(BLOCK_COUNT_PER_SIDE, orientation) for orientation in range(1, 9)]
cache[picture.unicode_path] = blocks cache[picture.unicode_path] = blocks
prepared.append(picture) prepared.append(picture)
except (OSError, ValueError) as e: except (OSError, ValueError) as e:
@ -119,13 +118,13 @@ def get_match(first, second, percentage):
return Match(first, second, percentage) return Match(first, second, percentage)
def async_compare(ref_ids, other_ids, dbname, threshold, picinfo): def async_compare(ref_ids, other_ids, dbname, threshold, picinfo, match_rotated=False):
# The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids # The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids
# can be None. In this case, ref_ids has to be compared with itself # can be None. In this case, ref_ids has to be compared with itself
# picinfo is a dictionary {pic_id: (dimensions, is_ref)} # picinfo is a dictionary {pic_id: (dimensions, is_ref)}
cache = get_cache(dbname, readonly=True) cache = get_cache(dbname, readonly=True)
limit = 100 - threshold limit = 100 - threshold
ref_pairs = list(cache.get_multiple(ref_ids)) ref_pairs = list(cache.get_multiple(ref_ids)) # (rowid, [b, b2, ..., b8])
if other_ids is not None: if other_ids is not None:
other_pairs = list(cache.get_multiple(other_ids)) other_pairs = list(cache.get_multiple(other_ids))
comparisons_to_do = [(r, o) for r in ref_pairs for o in other_pairs] comparisons_to_do = [(r, o) for r in ref_pairs for o in other_pairs]
@ -138,22 +137,35 @@ def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
if ref_is_ref and other_is_ref: if ref_is_ref and other_is_ref:
continue continue
if ref_dimensions != other_dimensions: if ref_dimensions != other_dimensions:
continue if match_rotated:
try: rotated_ref_dimensions = (ref_dimensions[1], ref_dimensions[0])
diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS) if rotated_ref_dimensions != other_dimensions:
percentage = 100 - diff continue
except (DifferentBlockCountError, NoBlocksError): else:
percentage = 0 continue
if percentage >= threshold:
results.append((ref_id, other_id, percentage)) orientation_range = 1
if match_rotated:
orientation_range = 8
for orientation_ref in range(orientation_range):
try:
diff = avgdiff(ref_blocks[orientation_ref], other_blocks[0], limit, MIN_ITERATIONS)
percentage = 100 - diff
except (DifferentBlockCountError, NoBlocksError):
percentage = 0
if percentage >= threshold:
results.append((ref_id, other_id, percentage))
break
cache.close() cache.close()
return results return results
def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljob): def getmatches(pictures, cache_path, threshold, match_scaled=False, match_rotated=False, j=job.nulljob):
def get_picinfo(p): def get_picinfo(p):
if match_scaled: if match_scaled:
return (None, p.is_ref) return ((None, None), p.is_ref)
else: else:
return (p.dimensions, p.is_ref) return (p.dimensions, p.is_ref)
@ -205,7 +217,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
picinfo.update({p.cache_id: get_picinfo(p) for p in other_chunk}) picinfo.update({p.cache_id: get_picinfo(p) for p in other_chunk})
else: else:
other_ids = None other_ids = None
args = (ref_ids, other_ids, cache_path, threshold, picinfo) args = (ref_ids, other_ids, cache_path, threshold, picinfo, match_rotated)
async_results.append(pool.apply_async(async_compare, args)) async_results.append(pool.apply_async(async_compare, args))
collect_results() collect_results()
collect_results(collect_all=True) collect_results(collect_all=True)

View File

@ -100,5 +100,8 @@ class Photo(fs.File):
elif field == "exif_timestamp": elif field == "exif_timestamp":
self.exif_timestamp = self._get_exif_timestamp() self.exif_timestamp = self._get_exif_timestamp()
def get_blocks(self, block_count_per_side): def get_blocks(self, block_count_per_side, orientation: int = None):
return self._plat_get_blocks(block_count_per_side, self._get_orientation()) if orientation is None:
return self._plat_get_blocks(block_count_per_side, self._get_orientation())
else:
return self._plat_get_blocks(block_count_per_side, orientation)

View File

@ -14,6 +14,7 @@ from core.pe import matchblock, matchexif
class ScannerPE(Scanner): class ScannerPE(Scanner):
cache_path = None cache_path = None
match_scaled = False match_scaled = False
match_rotated = False
@staticmethod @staticmethod
def get_scan_options(): def get_scan_options():
@ -29,6 +30,7 @@ class ScannerPE(Scanner):
cache_path=self.cache_path, cache_path=self.cache_path,
threshold=self.min_match_percentage, threshold=self.min_match_percentage,
match_scaled=self.match_scaled, match_scaled=self.match_scaled,
match_rotated=self.match_rotated,
j=j, j=j,
) )
elif self.scan_type == ScanType.EXIFTIMESTAMP: elif self.scan_type == ScanType.EXIFTIMESTAMP:

View File

@ -14,6 +14,10 @@ Preferences
If you check this box, pictures of different dimensions will be allowed in the same If you check this box, pictures of different dimensions will be allowed in the same
duplicate group. duplicate group.
**Match pictures of different rotations:**
If you check this box, pictures that are rotated versions of each other (all 8 orientations are compared) will be allowed in the same
duplicate group.
.. _filter-hardness: .. _filter-hardness:
**Filter Hardness:** **Filter Hardness:**

View File

@ -307,6 +307,10 @@ msgstr "Debug mode (restart required)"
msgid "Match pictures of different dimensions" msgid "Match pictures of different dimensions"
msgstr "Match pictures of different dimensions" msgstr "Match pictures of different dimensions"
#: qt/pe/preferences_dialog.py:19 cocoa/en.lproj/Localizable.strings:0
msgid "Match pictures of different rotations"
msgstr "Match pictures of different rotations"
#: qt/preferences_dialog.py:43 #: qt/preferences_dialog.py:43
msgid "Filter Hardness:" msgid "Filter Hardness:"
msgstr "Filter Hardness:" msgstr "Filter Hardness:"

View File

@ -316,6 +316,10 @@ msgstr "Mode de depuración (se requiere reinicio)"
msgid "Match pictures of different dimensions" msgid "Match pictures of different dimensions"
msgstr "Coincidencia de imágenes de distintas dimensiones" msgstr "Coincidencia de imágenes de distintas dimensiones"
#: qt/pe/preferences_dialog.py:19 cocoa/en.lproj/Localizable.strings:0
msgid "Match pictures of different rotations"
msgstr "Coincidencia de imágenes de distintas rotaciones"
#: qt/preferences_dialog.py:43 #: qt/preferences_dialog.py:43
msgid "Filter Hardness:" msgid "Filter Hardness:"
msgstr "Dureza del Filtro:" msgstr "Dureza del Filtro:"

View File

@ -314,6 +314,10 @@ msgstr "Modo de Depuração (requer reinício)"
msgid "Match pictures of different dimensions" msgid "Match pictures of different dimensions"
msgstr "Coincidir fotos de dimensões diferentes" msgstr "Coincidir fotos de dimensões diferentes"
#: qt/pe/preferences_dialog.py:19 cocoa/en.lproj/Localizable.strings:0
msgid "Match pictures of different rotations"
msgstr "Coincidir fotos de rotações diferentes"
#: qt/preferences_dialog.py:43 #: qt/preferences_dialog.py:43
msgid "Filter Hardness:" msgid "Filter Hardness:"
msgstr "Pressão do Filtro:" msgstr "Pressão do Filtro:"

View File

@ -192,6 +192,7 @@ class DupeGuru(QObject):
scanned_tags.add("year") scanned_tags.add("year")
self.model.options["scanned_tags"] = scanned_tags self.model.options["scanned_tags"] = scanned_tags
self.model.options["match_scaled"] = self.prefs.match_scaled self.model.options["match_scaled"] = self.prefs.match_scaled
self.model.options["match_rotated"] = self.prefs.match_rotated
self.model.options["include_exists_check"] = self.prefs.include_exists_check self.model.options["include_exists_check"] = self.prefs.include_exists_check
self.model.options["rehash_ignore_mtime"] = self.prefs.rehash_ignore_mtime self.model.options["rehash_ignore_mtime"] = self.prefs.rehash_ignore_mtime

View File

@ -21,6 +21,8 @@ class PreferencesDialog(PreferencesDialogBase):
self.widgetsVLayout.addLayout(self.filterHardnessHLayout) self.widgetsVLayout.addLayout(self.filterHardnessHLayout)
self._setupAddCheckbox("matchScaledBox", tr("Match pictures of different dimensions")) self._setupAddCheckbox("matchScaledBox", tr("Match pictures of different dimensions"))
self.widgetsVLayout.addWidget(self.matchScaledBox) self.widgetsVLayout.addWidget(self.matchScaledBox)
self._setupAddCheckbox("matchRotatedBox", tr("Match pictures of different rotations"))
self.widgetsVLayout.addWidget(self.matchRotatedBox)
self._setupAddCheckbox("mixFileKindBox", tr("Can mix file kind")) self._setupAddCheckbox("mixFileKindBox", tr("Can mix file kind"))
self.widgetsVLayout.addWidget(self.mixFileKindBox) self.widgetsVLayout.addWidget(self.mixFileKindBox)
self._setupAddCheckbox("useRegexpBox", tr("Use regular expressions when filtering")) self._setupAddCheckbox("useRegexpBox", tr("Use regular expressions when filtering"))
@ -57,6 +59,7 @@ show scrollbars to span the view around"
def _load(self, prefs, setchecked, section): def _load(self, prefs, setchecked, section):
setchecked(self.matchScaledBox, prefs.match_scaled) setchecked(self.matchScaledBox, prefs.match_scaled)
setchecked(self.matchRotatedBox, prefs.match_rotated)
# Update UI state based on selected scan type # Update UI state based on selected scan type
scan_type = prefs.get_scan_type(AppMode.PICTURE) scan_type = prefs.get_scan_type(AppMode.PICTURE)
@ -67,5 +70,6 @@ show scrollbars to span the view around"
def _save(self, prefs, ischecked): def _save(self, prefs, ischecked):
prefs.match_scaled = ischecked(self.matchScaledBox) prefs.match_scaled = ischecked(self.matchScaledBox)
prefs.match_rotated = ischecked(self.matchRotatedBox)
prefs.details_dialog_override_theme_icons = ischecked(self.details_dialog_override_theme_icons) prefs.details_dialog_override_theme_icons = ischecked(self.details_dialog_override_theme_icons)
prefs.details_dialog_viewers_show_scrollbars = ischecked(self.details_dialog_viewers_show_scrollbars) prefs.details_dialog_viewers_show_scrollbars = ischecked(self.details_dialog_viewers_show_scrollbars)

View File

@ -225,6 +225,7 @@ class Preferences(PreferencesBase):
self.scan_tag_genre = get("ScanTagGenre", self.scan_tag_genre) self.scan_tag_genre = get("ScanTagGenre", self.scan_tag_genre)
self.scan_tag_year = get("ScanTagYear", self.scan_tag_year) self.scan_tag_year = get("ScanTagYear", self.scan_tag_year)
self.match_scaled = get("MatchScaled", self.match_scaled) self.match_scaled = get("MatchScaled", self.match_scaled)
self.match_rotated = get("MatchRotated", self.match_rotated)
def reset(self): def reset(self):
self.filter_hardness = 95 self.filter_hardness = 95
@ -277,6 +278,7 @@ class Preferences(PreferencesBase):
self.scan_tag_genre = False self.scan_tag_genre = False
self.scan_tag_year = False self.scan_tag_year = False
self.match_scaled = False self.match_scaled = False
self.match_rotated = False
def _save_values(self, settings): def _save_values(self, settings):
set_ = self.set_value set_ = self.set_value
@ -330,6 +332,7 @@ class Preferences(PreferencesBase):
set_("ScanTagGenre", self.scan_tag_genre) set_("ScanTagGenre", self.scan_tag_genre)
set_("ScanTagYear", self.scan_tag_year) set_("ScanTagYear", self.scan_tag_year)
set_("MatchScaled", self.match_scaled) set_("MatchScaled", self.match_scaled)
set_("MatchRotated", self.match_rotated)
# scan_type is special because we save it immediately when we set it. # scan_type is special because we save it immediately when we set it.
def get_scan_type(self, app_mode): def get_scan_type(self, app_mode):