Added the EXIF Timestamp scan type in dgpe.

--HG-- rename : core_pe/matchbase.py => core_pe/matchblock.py
2026-03-09 18:41:38 +00:00 · 2011-04-21 17:17:19 +02:00
parent a0e2b11663
commit 275c6be108
18 changed files with 690 additions and 121 deletions
--- a/core_pe/exif.py
+++ b/core_pe/exif.py
@@ -8,8 +8,6 @@

 # Heavily based on http://topo.math.u-psud.fr/~bousch/exifdump.py by Thierry Bousch (Public Domain)

-import os
-import sys
 import logging

 EXIF_TAGS = {
@@ -260,7 +258,6 @@ def read_exif_header(fp):
    try:
        index = large_data.index(b'Exif')
        data = large_data[index-6:index+6]
-        print('hello!', data)
        # large_data omits the first 12 bytes, and the index is at the middle of the header, so we
        # must seek index + 18
        fp.seek(index+18)
@@ -324,25 +321,3 @@ def get_fields(fp):
        for tag, type, values in IFD:
            add_tag_to_result(tag, values)
    return result
-
-def main():
-    # logging.getLogger().setLevel(logging.DEBUG)
-    if len(sys.argv) < 2:
-        filenames = os.listdir('.')
-    else:
-        filenames = sys.argv[1:]
-    for filename in filenames:
-        print(filename+':')
-        try:
-            file = open(filename, 'rb')
-            fields = get_fields(file)
-            if 'DateTime' in fields:
-                print(fields['DateTime'])
-            else:
-                print(repr(fields))
-        except (IOError, ValueError):
-            print(' Cannot open file')
-    sys.exit(0)
-
-if __name__ == '__main__':
-    main()
--- a/core_pe/matchblock.py
+++ b/core_pe/matchblock.py
--- a/core_pe/matchexif.py
+++ b/core_pe/matchexif.py
@@ -0,0 +1,34 @@
+# Created By: Virgil Dupras
+# Created On: 2011-04-20
+# Copyright 2011 Hardcoded Software (http://www.hardcoded.net)
+# 
+# This software is licensed under the "BSD" License as described in the "LICENSE" file, 
+# which should be included with this package. The terms are also available at 
+# http://www.hardcoded.net/licenses/bsd_license
+
+import logging
+from collections import defaultdict
+from itertools import combinations
+
+from hscommon import io
+from hscommon.trans import tr
+
+from core.engine import Match
+from . import exif
+
+def getmatches(files, j):
+    """Return a ``Match`` for every pair of pictures that share an EXIF timestamp.
+
+    ``files`` is walked through ``j.iter_with_progress``. Each item must expose a ``path``
+    attribute and be hashable, since items are collected in sets. Pictures are grouped on the
+    ``DateTimeOriginal`` field returned by ``exif.get_fields``. When anything fails for a
+    picture (the file can't be opened, its EXIF can't be parsed, the field is missing), a
+    warning is logged and that picture is left out of the results.
+
+    Every 2-combination inside a group becomes ``Match(p1, p2, 100)``; the resulting list is
+    returned.
+    """
+    null_timestamp = '0000:00:00 00:00:00'
+    groups = defaultdict(set)
+    for pic in j.iter_with_progress(files, tr("Read EXIF of %d/%d pictures")):
+        try:
+            with io.open(pic.path, 'rb') as infile:
+                fields = exif.get_fields(infile)
+                groups[fields['DateTimeOriginal']].add(pic)
+        except Exception:
+            logging.warning("Couldn't read EXIF of picture: %s", pic.path)
+    # Pictures carrying the all-zero timestamp are very likely false matches: drop that group.
+    groups.pop(null_timestamp, None)
+    return [
+        Match(first, second, 100)
+        for group in groups.values()
+        for first, second in combinations(group, 2)
+    ]
--- a/core_pe/scanner.py
+++ b/core_pe/scanner.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 # Created By: Virgil Dupras
 # Created On: 2009-10-18
 # Copyright 2011 Hardcoded Software (http://www.hardcoded.net)
@@ -7,9 +6,9 @@
 # which should be included with this package. The terms are also available at 
 # http://www.hardcoded.net/licenses/bsd_license

-from core.scanner import Scanner
+from core.scanner import Scanner, ScanType

-from . import matchbase
+from . import matchblock, matchexif
 from .cache import Cache

 class ScannerPE(Scanner):
@@ -18,7 +17,12 @@ class ScannerPE(Scanner):
    threshold = 75
    
    def _getmatches(self, files, j):
-        return matchbase.getmatches(files, self.cache_path, self.threshold, self.match_scaled, j)
+        if self.scan_type == ScanType.FuzzyBlock:
+            return matchblock.getmatches(files, self.cache_path, self.threshold, self.match_scaled, j)
+        elif self.scan_type == ScanType.ExifTimestamp:
+            return matchexif.getmatches(files, j)
+        else:
+            raise Exception("Invalid scan type")
    
    def clear_picture_cache(self):
        cache = Cache(self.cache_path)