Merge core_{se,me,pe} into core.{se,me,pe}

1
core/pe/__init__.py
Normal file
@@ -0,0 +1 @@
from . import block, cache, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa

122
core/pe/block.py
Normal file
@@ -0,0 +1,122 @@
# Created By: Virgil Dupras
# Created On: 2006/09/01
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from ._block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2 # NOQA
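
# Illustrative sketch of the C functions re-exported above (PIL and the file
# names are assumptions of this example, not part of the module):
#
#     from PIL import Image
#     blocks_a = getblocks2(Image.open('a.jpg'), 15)  # 15x15 = 225 mean-color blocks
#     blocks_b = getblocks2(Image.open('b.jpg'), 15)
#     d = avgdiff(blocks_a, blocks_b, 100, 3)         # limit=100, min_iterations=3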

# Converted to C
# def getblock(image):
#     """Returns a 3-sized tuple containing the mean color of 'image'.
#
#     image: a PIL image or crop.
#     """
#     if image.size[0]:
#         pixel_count = image.size[0] * image.size[1]
#         red = green = blue = 0
#         for r, g, b in image.getdata():
#             red += r
#             green += g
#             blue += b
#         return (red // pixel_count, green // pixel_count, blue // pixel_count)
#     else:
#         return (0, 0, 0)

# This is not used anymore
# def getblocks(image, blocksize):
#     """Returns a list of blocks (3-sized tuples).
#
#     image: A PIL image to base the blocks on.
#     blocksize: The size of the blocks to be created. This is a single integer, defining
#     both width and height (blocks are square).
#     """
#     if min(image.size) < blocksize:
#         return ()
#     result = []
#     for i in xrange(image.size[1] // blocksize):
#         for j in xrange(image.size[0] // blocksize):
#             box = (blocksize * j, blocksize * i, blocksize * (j + 1), blocksize * (i + 1))
#             crop = image.crop(box)
#             result.append(getblock(crop))
#     return result

# Converted to C
# def getblocks2(image, block_count_per_side):
#     """Returns a list of blocks (3-sized tuples).
#
#     image: A PIL image to base the blocks on.
#     block_count_per_side: This integer determines the number of blocks the function will return.
#     If it is 10, for example, 100 blocks will be returned (10 width, 10 height). The blocks will
#     not necessarily cover square areas. The area covered by each block will be proportional to
#     the image itself.
#     """
#     if not image.size[0]:
#         return []
#     width, height = image.size
#     block_width = max(width // block_count_per_side, 1)
#     block_height = max(height // block_count_per_side, 1)
#     result = []
#     for ih in range(block_count_per_side):
#         top = min(ih * block_height, height - block_height)
#         bottom = top + block_height
#         for iw in range(block_count_per_side):
#             left = min(iw * block_width, width - block_width)
#             right = left + block_width
#             box = (left, top, right, bottom)
#             crop = image.crop(box)
#             result.append(getblock(crop))
#     return result

# Converted to C
# def diff(first, second):
#     """Returns the difference between the first block and the second.
#
#     It returns an absolute sum of the 3 differences (RGB).
#     """
#     r1, g1, b1 = first
#     r2, g2, b2 = second
#     return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)

# Converted to C
# def avgdiff(first, second, limit=768, min_iterations=1):
#     """Returns the average diff between first blocks and seconds.
#
#     If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
#     iterations have been made in the blocks.
#     """
#     if len(first) != len(second):
#         raise DifferentBlockCountError
#     if not first:
#         raise NoBlocksError
#     count = len(first)
#     sum = 0
#     zipped = izip(xrange(1, count + 1), first, second)
#     for i, first, second in zipped:
#         sum += diff(first, second)
#         if sum > limit * i and i >= min_iterations:
#             return limit + 1
#     result = sum // count
#     if (not result) and sum:
#         result = 1
#     return result

# This is not used anymore
# def maxdiff(first, second, limit=768):
#     """Returns the max diff between first blocks and seconds.
#
#     If the result surpasses limit, the first max being over limit is returned.
#     """
#     if len(first) != len(second):
#         raise DifferentBlockCountError
#     if not first:
#         raise NoBlocksError
#     result = 0
#     zipped = zip(first, second)
#     for first, second in zipped:
#         result = max(result, diff(first, second))
#         if result > limit:
#             return result
#     return result

162
core/pe/cache.py
Normal file
@@ -0,0 +1,162 @@
# Created By: Virgil Dupras
# Created On: 2006/09/14
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

import os
import os.path as op
import logging
import sqlite3 as sqlite

from ._cache import string_to_colors

def colors_to_string(colors):
    """Transform the 3-sized tuples 'colors' into a hex string.

    [(0,100,255)] --> 0064ff
    [(1,2,3),(4,5,6)] --> 010203040506
    """
    return ''.join(['%02x%02x%02x' % (r, g, b) for r, g, b in colors])

# This function is an important bottleneck of dupeGuru PE. It has been converted to C.
# def string_to_colors(s):
#     """Transform the string 's' into a list of 3-sized tuples.
#     """
#     result = []
#     for i in xrange(0, len(s), 6):
#         number = int(s[i:i+6], 16)
#         result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
#     return result
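
# Quick round-trip sketch (illustrative): colors_to_string and the C
# string_to_colors are inverses of each other.
#
#     >>> colors_to_string([(0, 100, 255)])
#     '0064ff'
#     >>> string_to_colors('0064ff')
#     [(0, 100, 255)]
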
class Cache:
    """A class to cache picture blocks.
    """
    def __init__(self, db=':memory:'):
        self.dbname = db
        self.con = None
        self._create_con()

    def __contains__(self, key):
        sql = "select count(*) from pictures where path = ?"
        result = self.con.execute(sql, [key]).fetchall()
        return result[0][0] > 0

    def __delitem__(self, key):
        if key not in self:
            raise KeyError(key)
        sql = "delete from pictures where path = ?"
        self.con.execute(sql, [key])

    # Optimized
    def __getitem__(self, key):
        if isinstance(key, int):
            sql = "select blocks from pictures where rowid = ?"
        else:
            sql = "select blocks from pictures where path = ?"
        result = self.con.execute(sql, [key]).fetchone()
        if result:
            result = string_to_colors(result[0])
            return result
        else:
            raise KeyError(key)

    def __iter__(self):
        sql = "select path from pictures"
        result = self.con.execute(sql)
        return (row[0] for row in result)

    def __len__(self):
        sql = "select count(*) from pictures"
        result = self.con.execute(sql).fetchall()
        return result[0][0]

    def __setitem__(self, path_str, blocks):
        blocks = colors_to_string(blocks)
        if op.exists(path_str):
            mtime = int(os.stat(path_str).st_mtime)
        else:
            mtime = 0
        if path_str in self:
            sql = "update pictures set blocks = ?, mtime = ? where path = ?"
        else:
            sql = "insert into pictures(blocks,mtime,path) values(?,?,?)"
        try:
            self.con.execute(sql, [blocks, mtime, path_str])
        except sqlite.OperationalError:
            logging.warning('Picture cache could not set value for key %r', path_str)
        except sqlite.DatabaseError as e:
            logging.warning('DatabaseError while setting value for key %r: %s', path_str, str(e))

    def _create_con(self, second_try=False):
        def create_tables():
            logging.debug("Creating picture cache tables.")
            self.con.execute("drop table if exists pictures")
            self.con.execute("drop index if exists idx_path")
            self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)")
            self.con.execute("create index idx_path on pictures (path)")

        self.con = sqlite.connect(self.dbname, isolation_level=None)
        try:
            self.con.execute("select path, mtime, blocks from pictures where 1=2")
        except sqlite.OperationalError: # new db
            create_tables()
        except sqlite.DatabaseError as e: # corrupted db
            if second_try:
                raise # Something really strange is happening
            logging.warning('Could not create picture cache because of an error: %s', str(e))
            self.con.close()
            os.remove(self.dbname)
            self._create_con(second_try=True)

    def clear(self):
        self.close()
        if self.dbname != ':memory:':
            os.remove(self.dbname)
        self._create_con()

    def close(self):
        if self.con is not None:
            self.con.close()
        self.con = None

    def filter(self, func):
        to_delete = [key for key in self if not func(key)]
        for key in to_delete:
            del self[key]

    def get_id(self, path):
        sql = "select rowid from pictures where path = ?"
        result = self.con.execute(sql, [path]).fetchone()
        if result:
            return result[0]
        else:
            raise ValueError(path)

    def get_multiple(self, rowids):
        sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids))
        cur = self.con.execute(sql)
        return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)

    def purge_outdated(self):
        """Go through the cache and purge outdated records.

        A record is outdated if the picture doesn't exist or if its mtime is greater than the one in
        the db.
        """
        todelete = []
        sql = "select rowid, path, mtime from pictures"
        cur = self.con.execute(sql)
        for rowid, path_str, mtime in cur:
            if mtime and op.exists(path_str):
                picture_mtime = os.stat(path_str).st_mtime
                if int(picture_mtime) <= mtime:
                    # not outdated
                    continue
            todelete.append(rowid)
        if todelete:
            sql = "delete from pictures where rowid in (%s)" % ','.join(map(str, todelete))
            self.con.execute(sql)

335
core/pe/exif.py
Normal file
@@ -0,0 +1,335 @@
# Created By: Virgil Dupras
# Created On: 2011-04-20
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

# Heavily based on http://topo.math.u-psud.fr/~bousch/exifdump.py by Thierry Bousch (Public Domain)

import logging

EXIF_TAGS = {
    0x0100: "ImageWidth",
    0x0101: "ImageLength",
    0x0102: "BitsPerSample",
    0x0103: "Compression",
    0x0106: "PhotometricInterpretation",
    0x010A: "FillOrder",
    0x010D: "DocumentName",
    0x010E: "ImageDescription",
    0x010F: "Make",
    0x0110: "Model",
    0x0111: "StripOffsets",
    0x0112: "Orientation",
    0x0115: "SamplesPerPixel",
    0x0116: "RowsPerStrip",
    0x0117: "StripByteCounts",
    0x011A: "XResolution",
    0x011B: "YResolution",
    0x011C: "PlanarConfiguration",
    0x0128: "ResolutionUnit",
    0x012D: "TransferFunction",
    0x0131: "Software",
    0x0132: "DateTime",
    0x013B: "Artist",
    0x013E: "WhitePoint",
    0x013F: "PrimaryChromaticities",
    0x0156: "TransferRange",
    0x0200: "JPEGProc",
    0x0201: "JPEGInterchangeFormat",
    0x0202: "JPEGInterchangeFormatLength",
    0x0211: "YCbCrCoefficients",
    0x0212: "YCbCrSubSampling",
    0x0213: "YCbCrPositioning",
    0x0214: "ReferenceBlackWhite",
    0x828F: "BatteryLevel",
    0x8298: "Copyright",
    0x829A: "ExposureTime",
    0x829D: "FNumber",
    0x83BB: "IPTC/NAA",
    0x8769: "ExifIFDPointer",
    0x8773: "InterColorProfile",
    0x8822: "ExposureProgram",
    0x8824: "SpectralSensitivity",
    0x8825: "GPSInfoIFDPointer",
    0x8827: "ISOSpeedRatings",
    0x8828: "OECF",
    0x9000: "ExifVersion",
    0x9003: "DateTimeOriginal",
    0x9004: "DateTimeDigitized",
    0x9101: "ComponentsConfiguration",
    0x9102: "CompressedBitsPerPixel",
    0x9201: "ShutterSpeedValue",
    0x9202: "ApertureValue",
    0x9203: "BrightnessValue",
    0x9204: "ExposureBiasValue",
    0x9205: "MaxApertureValue",
    0x9206: "SubjectDistance",
    0x9207: "MeteringMode",
    0x9208: "LightSource",
    0x9209: "Flash",
    0x920A: "FocalLength",
    0x9214: "SubjectArea",
    0x927C: "MakerNote",
    0x9286: "UserComment",
    0x9290: "SubSecTime",
    0x9291: "SubSecTimeOriginal",
    0x9292: "SubSecTimeDigitized",
    0xA000: "FlashPixVersion",
    0xA001: "ColorSpace",
    0xA002: "PixelXDimension",
    0xA003: "PixelYDimension",
    0xA004: "RelatedSoundFile",
    0xA005: "InteroperabilityIFDPointer",
    0xA20B: "FlashEnergy", # 0x920B in TIFF/EP
    0xA20C: "SpatialFrequencyResponse", # 0x920C - -
    0xA20E: "FocalPlaneXResolution", # 0x920E - -
    0xA20F: "FocalPlaneYResolution", # 0x920F - -
    0xA210: "FocalPlaneResolutionUnit", # 0x9210 - -
    0xA214: "SubjectLocation", # 0x9214 - -
    0xA215: "ExposureIndex", # 0x9215 - -
    0xA217: "SensingMethod", # 0x9217 - -
    0xA300: "FileSource",
    0xA301: "SceneType",
    0xA302: "CFAPattern", # 0x828E in TIFF/EP
    0xA401: "CustomRendered",
    0xA402: "ExposureMode",
    0xA403: "WhiteBalance",
    0xA404: "DigitalZoomRatio",
    0xA405: "FocalLengthIn35mmFilm",
    0xA406: "SceneCaptureType",
    0xA407: "GainControl",
    0xA408: "Contrast",
    0xA409: "Saturation",
    0xA40A: "Sharpness",
    0xA40B: "DeviceSettingDescription",
    0xA40C: "SubjectDistanceRange",
    0xA420: "ImageUniqueID",
}

INTR_TAGS = {
    0x0001: "InteroperabilityIndex",
    0x0002: "InteroperabilityVersion",
    0x1000: "RelatedImageFileFormat",
    0x1001: "RelatedImageWidth",
    0x1002: "RelatedImageLength",
}

GPS_TAGS = {
    0x00: "GPSVersionID",
    0x01: "GPSLatitudeRef",
    0x02: "GPSLatitude",
    0x03: "GPSLongitudeRef",
    0x04: "GPSLongitude",
    0x05: "GPSAltitudeRef",
    0x06: "GPSAltitude",
    0x07: "GPSTimeStamp",
    0x08: "GPSSatellites",
    0x09: "GPSStatus",
    0x0A: "GPSMeasureMode",
    0x0B: "GPSDOP",
    0x0C: "GPSSpeedRef",
    0x0D: "GPSSpeed",
    0x0E: "GPSTrackRef",
    0x0F: "GPSTrack",
    0x10: "GPSImgDirectionRef",
    0x11: "GPSImgDirection",
    0x12: "GPSMapDatum",
    0x13: "GPSDestLatitudeRef",
    0x14: "GPSDestLatitude",
    0x15: "GPSDestLongitudeRef",
    0x16: "GPSDestLongitude",
    0x17: "GPSDestBearingRef",
    0x18: "GPSDestBearing",
    0x19: "GPSDestDistanceRef",
    0x1A: "GPSDestDistance",
    0x1B: "GPSProcessingMethod",
    0x1C: "GPSAreaInformation",
    0x1D: "GPSDateStamp",
    0x1E: "GPSDifferential",
}

INTEL_ENDIAN = ord('I')
MOTOROLA_ENDIAN = ord('M')

# About MAX_COUNT: It's possible to have corrupted exif tags where the entry count is way too high
# and thus makes us loop, not endlessly, but for a heck of a long time for nothing. Therefore, we
# put an arbitrary limit on the entry count we'll allow ourselves to read and any IFD reporting
# more entries than that will be considered corrupt.
MAX_COUNT = 0xffff

def s2n_motorola(bytes):
    x = 0
    for c in bytes:
        x = (x << 8) | c
    return x

def s2n_intel(bytes):
    x = 0
    y = 0
    for c in bytes:
        x = x | (c << y)
        y = y + 8
    return x
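
# For example (illustrative): both functions read b'\x01\x02' as a 2-byte
# integer, but s2n_intel(b'\x01\x02') == 0x0201 (little-endian) while
# s2n_motorola(b'\x01\x02') == 0x0102 (big-endian).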

class Fraction:
    def __init__(self, num, den):
        self.num = num
        self.den = den

    def __repr__(self):
        return '%d/%d' % (self.num, self.den)


class TIFF_file:
    def __init__(self, data):
        self.data = data
        self.endian = data[0]
        self.s2nfunc = s2n_intel if self.endian == INTEL_ENDIAN else s2n_motorola

    def s2n(self, offset, length, signed=0, debug=False):
        slice = self.data[offset:offset+length]
        val = self.s2nfunc(slice)
        # Sign extension?
        if signed:
            msb = 1 << (8*length - 1)
            if val & msb:
                val = val - (msb << 1)
        if debug:
            logging.debug(self.endian)
            logging.debug("Slice for offset %d length %d: %r and value: %d", offset, length, slice, val)
        return val

    def first_IFD(self):
        return self.s2n(4, 4)

    def next_IFD(self, ifd):
        entries = self.s2n(ifd, 2)
        return self.s2n(ifd + 2 + 12 * entries, 4)

    def list_IFDs(self):
        i = self.first_IFD()
        a = []
        while i:
            a.append(i)
            i = self.next_IFD(i)
        return a

    def dump_IFD(self, ifd):
        entries = self.s2n(ifd, 2)
        logging.debug("Entries for IFD %d: %d", ifd, entries)
        if entries > MAX_COUNT:
            logging.debug("Probably corrupt. Aborting.")
            return []
        a = []
        for i in range(entries):
            entry = ifd + 2 + 12*i
            tag = self.s2n(entry, 2)
            type = self.s2n(entry+2, 2)
            if not 1 <= type <= 10:
                continue # not handled
            typelen = [1, 1, 2, 4, 8, 1, 1, 2, 4, 8][type-1]
            count = self.s2n(entry+4, 4)
            if count > MAX_COUNT:
                logging.debug("Probably corrupt. Aborting.")
                return []
            offset = entry+8
            if count*typelen > 4:
                offset = self.s2n(offset, 4)
            if type == 2:
                # Special case: nul-terminated ASCII string
                values = str(self.data[offset:offset+count-1], encoding='latin-1')
            else:
                values = []
                signed = (type == 6 or type >= 8)
                for j in range(count):
                    if type in {5, 10}:
                        # The type is either 5 or 10
                        value_j = Fraction(self.s2n(offset, 4, signed),
                                           self.s2n(offset+4, 4, signed))
                    else:
                        # Not a fraction
                        value_j = self.s2n(offset, typelen, signed)
                    values.append(value_j)
                    offset = offset + typelen
            # Now "values" is either a string or an array
            a.append((tag, type, values))
        return a

def read_exif_header(fp):
    # If `fp`'s first bytes are not exif, it tries to find it in the next 4kb
    def isexif(data):
        return data[0:4] == b'\377\330\377\341' and data[6:10] == b'Exif'
    data = fp.read(12)
    if isexif(data):
        return data
    # ok, not exif, try to find it
    large_data = fp.read(4096)
    try:
        index = large_data.index(b'Exif')
        data = large_data[index-6:index+6]
        # large_data omits the first 12 bytes, and the index is at the middle of the header, so we
        # must seek index + 18
        fp.seek(index+18)
        return data
    except ValueError:
        raise ValueError("Not an Exif file")

def get_fields(fp):
    data = read_exif_header(fp)
    length = data[4] * 256 + data[5]
    logging.debug("Exif header length: %d bytes", length)
    data = fp.read(length-8)
    data_format = data[0]
    logging.debug("%s format", {INTEL_ENDIAN: 'Intel', MOTOROLA_ENDIAN: 'Motorola'}[data_format])
    T = TIFF_file(data)
    # There may be more than one IFD per file, but we only read the first one because others are
    # most likely thumbnails.
    main_IFD_offset = T.first_IFD()
    result = {}

    def add_tag_to_result(tag, values):
        try:
            stag = EXIF_TAGS[tag]
        except KeyError:
            stag = '0x%04X' % tag
        if stag in result:
            return # don't overwrite data
        result[stag] = values

    logging.debug("IFD at offset %d", main_IFD_offset)
    IFD = T.dump_IFD(main_IFD_offset)
    exif_off = gps_off = 0
    for tag, type, values in IFD:
        if tag == 0x8769:
            exif_off = values[0]
            continue
        if tag == 0x8825:
            gps_off = values[0]
            continue
        add_tag_to_result(tag, values)
    if exif_off:
        logging.debug("Exif SubIFD at offset %d:", exif_off)
        IFD = T.dump_IFD(exif_off)
        # Recent digital cameras have a little subdirectory
        # here, pointed to by tag 0xA005. Apparently, it's the
        # "Interoperability IFD", defined in Exif 2.1 and DCF.
        intr_off = 0
        for tag, type, values in IFD:
            if tag == 0xA005:
                intr_off = values[0]
                continue
            add_tag_to_result(tag, values)
        if intr_off:
            logging.debug("Exif Interoperability SubSubIFD at offset %d:", intr_off)
            IFD = T.dump_IFD(intr_off)
            for tag, type, values in IFD:
                add_tag_to_result(tag, values)
    if gps_off:
        logging.debug("GPS SubIFD at offset %d:", gps_off)
        IFD = T.dump_IFD(gps_off)
        for tag, type, values in IFD:
            add_tag_to_result(tag, values)
    return result
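
# Minimal usage sketch (the file name is hypothetical):
#
#     with open('photo.jpg', 'rb') as fp:
#         fields = get_fields(fp)
#     fields.get('DateTimeOriginal')  # e.g. '2011:04:20 10:00:00'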

31
core/pe/iphoto_plist.py
Normal file
@@ -0,0 +1,31 @@
# Created By: Virgil Dupras
# Created On: 2014-03-15
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

import plistlib

class IPhotoPlistParser(plistlib._PlistParser):
    """A parser for iPhoto plists.

    iPhoto plists tend to be malformed, so we have to subclass the built-in parser to be a bit more
    lenient.
    """
    def __init__(self):
        plistlib._PlistParser.__init__(self, use_builtin_types=True, dict_type=dict)
        # For debugging purposes, we remember the last bit of data to be analyzed so that we can
        # log it in case of an exception
        self.lastdata = ''

    def get_data(self):
        self.lastdata = plistlib._PlistParser.get_data(self)
        return self.lastdata

    def end_integer(self):
        try:
            self.add_object(int(self.get_data()))
        except ValueError:
            self.add_object(0)
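
# Illustrative effect of the override (hypothetical plist data): a malformed
# <integer>12abc34</integer> makes int() raise ValueError, so end_integer()
# stores 0 instead of aborting the whole parse.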

222
core/pe/matchblock.py
Normal file
@@ -0,0 +1,222 @@
# Created By: Virgil Dupras
# Created On: 2007/02/25
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

import logging
import multiprocessing
from itertools import combinations

from hscommon.util import extract, iterconsume
from hscommon.trans import tr
from hscommon.jobprogress import job

from core.engine import Match
from .block import avgdiff, DifferentBlockCountError, NoBlocksError
from .cache import Cache

# OPTIMIZATION NOTES:
# The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another
# bottleneck that shows up when a lot of pictures are involved is Disk IOs, because blocks
# constantly have to be read from disk by subprocesses. This problem is especially big on CPUs
# with a lot of cores. Therefore, we must minimize Disk IOs. The best way to achieve that is to
# separate the files to scan in "chunks", and it's by chunk that blocks are read in memory and
# compared to each other. Each file in a chunk has to be compared to each other, of course, but also
# to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage
# is that instead of reading blocks from disk number_of_files**2 times, we read them
# number_of_files*number_of_chunks times (see the worked example below).
# Determining the right chunk size is tricky, because if it's too big, too many blocks will be in
# memory at the same time and we might end up with memory thrashing, which is awfully slow. So,
# because our *real* bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't
# starved by Disk IOs.
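# For example (illustrative numbers): with 10,000 pictures split into 10 chunks
# of 1,000, blocks get read from disk on the order of 10,000 * 10 = 100,000
# times instead of 10,000**2 = 100,000,000 times, while the number of actual
# block comparisons stays the same.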

MIN_ITERATIONS = 3
BLOCK_COUNT_PER_SIDE = 15
DEFAULT_CHUNK_SIZE = 1000
MIN_CHUNK_SIZE = 100

# Enough so that we're sure that the main thread will not wait after a result.get() call
# cpucount+1 should be enough to be sure that the spawned process will not wait after the results
# collection made by the main process.
try:
    RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() + 1
except Exception:
    # I had an IOError on app launch once. It seems to be a freak occurrence. In any case, we want
    # the app to launch, so let's just put an arbitrary value.
    logging.warning("Had problems determining cpu count on launch.")
    RESULTS_QUEUE_LIMIT = 8

def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
    # The MemoryError handlers in there use logging without first caring about whether or not
    # there is enough memory left to carry on the operation because it is assumed that the
    # MemoryError happens when trying to read an image file, which is freed from memory by the
    # time that MemoryError is raised.
    cache = Cache(cache_path)
    cache.purge_outdated()
    prepared = [] # only pictures for which there was no error getting blocks
    try:
        for picture in j.iter_with_progress(pictures, tr("Analyzed %d/%d pictures")):
            if not picture.path:
                # XXX Find the root cause of this. I've received reports of crashes where we had
                # "Analyzing picture at " (without a path) in the debug log. It was an iPhoto scan.
                # For now, I'm simply working around the crash by ignoring those, but it would be
                # interesting to know exactly why this happens. I'm suspecting a malformed
                # entry in iPhoto library.
                logging.warning("We have a picture with a null path here")
                continue
            picture.unicode_path = str(picture.path)
            logging.debug("Analyzing picture at %s", picture.unicode_path)
            if with_dimensions:
                picture.dimensions # pre-read dimensions
            try:
                if picture.unicode_path not in cache:
                    blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
                    cache[picture.unicode_path] = blocks
                prepared.append(picture)
            except (IOError, ValueError) as e:
                logging.warning(str(e))
            except MemoryError:
                logging.warning("Ran out of memory while reading %s of size %d", picture.unicode_path, picture.size)
                if picture.size < 10 * 1024 * 1024: # We're really running out of memory
                    raise
    except MemoryError:
        logging.warning('Ran out of memory while preparing pictures')
    cache.close()
    return prepared

def get_chunks(pictures):
    min_chunk_count = multiprocessing.cpu_count() * 2 # have enough chunks to feed all subprocesses
    chunk_count = len(pictures) // DEFAULT_CHUNK_SIZE
    chunk_count = max(min_chunk_count, chunk_count)
    chunk_size = (len(pictures) // chunk_count) + 1
    chunk_size = max(MIN_CHUNK_SIZE, chunk_size)
    logging.info(
        "Creating %d chunks with a chunk size of %d for %d pictures", chunk_count,
        chunk_size, len(pictures)
    )
    chunks = [pictures[i:i+chunk_size] for i in range(0, len(pictures), chunk_size)]
    return chunks

def get_match(first, second, percentage):
    if percentage < 0:
        percentage = 0
    return Match(first, second, percentage)

def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
    # The list of ids in ref_ids has to be compared to the list of ids in other_ids. other_ids
    # can be None. In this case, ref_ids has to be compared with itself.
    # picinfo is a dictionary {pic_id: (dimensions, is_ref)}
    cache = Cache(dbname)
    limit = 100 - threshold
    ref_pairs = list(cache.get_multiple(ref_ids))
    if other_ids is not None:
        other_pairs = list(cache.get_multiple(other_ids))
        comparisons_to_do = [(r, o) for r in ref_pairs for o in other_pairs]
    else:
        comparisons_to_do = list(combinations(ref_pairs, 2))
    results = []
    for (ref_id, ref_blocks), (other_id, other_blocks) in comparisons_to_do:
        ref_dimensions, ref_is_ref = picinfo[ref_id]
        other_dimensions, other_is_ref = picinfo[other_id]
        if ref_is_ref and other_is_ref:
            continue
        if ref_dimensions != other_dimensions:
            continue
        try:
            diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
            percentage = 100 - diff
        except (DifferentBlockCountError, NoBlocksError):
            percentage = 0
        if percentage >= threshold:
            results.append((ref_id, other_id, percentage))
    cache.close()
    return results

def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nulljob):
    def get_picinfo(p):
        if match_scaled:
            return (None, p.is_ref)
        else:
            return (p.dimensions, p.is_ref)

    def collect_results(collect_all=False):
        # Collect results and wait until the queue is small enough to accommodate new results.
        nonlocal async_results, matches, comparison_count, comparisons_to_do
        limit = 0 if collect_all else RESULTS_QUEUE_LIMIT
        while len(async_results) > limit:
            ready, working = extract(lambda r: r.ready(), async_results)
            for result in ready:
                matches += result.get()
                async_results.remove(result)
                comparison_count += 1
        # About the NOQA below: I think there's a bug in pyflakes. To investigate...
        progress_msg = tr("Performed %d/%d chunk matches") % (comparison_count, len(comparisons_to_do)) # NOQA
        j.set_progress(comparison_count, progress_msg)

    j = j.start_subjob([3, 7])
    pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)
    j = j.start_subjob([9, 1], tr("Preparing for matching"))
    cache = Cache(cache_path)
    id2picture = {}
    for picture in pictures:
        try:
            picture.cache_id = cache.get_id(picture.unicode_path)
            id2picture[picture.cache_id] = picture
        except ValueError:
            pass
    cache.close()
    pictures = [p for p in pictures if hasattr(p, 'cache_id')]
    pool = multiprocessing.Pool()
    async_results = []
    matches = []
    chunks = get_chunks(pictures)
    # We add a None element at the end of the chunk list because each chunk has to be compared
    # with itself. Thus, each chunk will show up as a ref_chunk having other_chunk set to None once.
    comparisons_to_do = list(combinations(chunks + [None], 2))
    comparison_count = 0
    j.start_job(len(comparisons_to_do))
    try:
        for ref_chunk, other_chunk in comparisons_to_do:
            picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}
            ref_ids = [p.cache_id for p in ref_chunk]
            if other_chunk is not None:
                other_ids = [p.cache_id for p in other_chunk]
                picinfo.update({p.cache_id: get_picinfo(p) for p in other_chunk})
            else:
                other_ids = None
            args = (ref_ids, other_ids, cache_path, threshold, picinfo)
            async_results.append(pool.apply_async(async_compare, args))
            collect_results()
        collect_results(collect_all=True)
    except MemoryError:
        # Rare, but possible, even in 64bit situations (ref #264). What do we do now? We free up
        # some wiggle room, log about the incident, and stop matching right here. We then process
        # the matches we have. The rest of the process doesn't allocate much and we should be
        # alright.
        del comparisons_to_do, chunks, pictures # some wiggle room for the next statements
        logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches))
        del matches[-len(matches)//3:] # some wiggle room to ensure we don't run out of memory again.
    pool.close()
    result = []
    myiter = j.iter_with_progress(
        iterconsume(matches, reverse=False),
        tr("Verified %d/%d matches"),
        every=10,
        count=len(matches),
    )
    for ref_id, other_id, percentage in myiter:
        ref = id2picture[ref_id]
        other = id2picture[other_id]
        if percentage == 100 and ref.md5 != other.md5:
            percentage = 99
        if percentage >= threshold:
            ref.dimensions # pre-read dimensions for display in results
            other.dimensions
            result.append(get_match(ref, other, percentage))
    return result

multiprocessing.freeze_support()

31
core/pe/matchexif.py
Normal file
@@ -0,0 +1,31 @@
# Created By: Virgil Dupras
# Created On: 2011-04-20
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from collections import defaultdict
from itertools import combinations

from hscommon.trans import tr

from core.engine import Match

def getmatches(files, match_scaled, j):
    timestamp2pic = defaultdict(set)
    for picture in j.iter_with_progress(files, tr("Read EXIF of %d/%d pictures")):
        timestamp = picture.exif_timestamp
        if timestamp:
            timestamp2pic[timestamp].add(picture)
    if '0000:00:00 00:00:00' in timestamp2pic: # very likely false matches
        del timestamp2pic['0000:00:00 00:00:00']
    matches = []
    for pictures in timestamp2pic.values():
        for p1, p2 in combinations(pictures, 2):
            if (not match_scaled) and (p1.dimensions != p2.dimensions):
                continue
            matches.append(Match(p1, p2, 100))
    return matches
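
# In short (illustrative): pictures whose EXIF timestamp is, say,
# '2011:04:20 10:00:00' all land in the same timestamp2pic bucket and are
# paired as 100% matches, unless match_scaled is off and their dimensions differ.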

253
core/pe/modules/block.c
Normal file
@@ -0,0 +1,253 @@
/* Created By: Virgil Dupras
 * Created On: 2010-01-30
 * Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
 *
 * This software is licensed under the "BSD" License as described in the "LICENSE" file,
 * which should be included with this package. The terms are also available at
 * http://www.hardcoded.net/licenses/bsd_license
 */

#include "common.h"

/* avgdiff/maxdiff has been called with empty lists */
static PyObject *NoBlocksError;
/* avgdiff/maxdiff has been called with 2 block lists of different size. */
static PyObject *DifferentBlockCountError;

/* Returns a 3-sized tuple containing the mean color of 'image'.
 * image: a PIL image or crop.
 */
static PyObject* getblock(PyObject *image)
{
    int i, totr, totg, totb;
    Py_ssize_t pixel_count;
    PyObject *ppixels;

    totr = totg = totb = 0;
    ppixels = PyObject_CallMethod(image, "getdata", NULL);
    if (ppixels == NULL) {
        return NULL;
    }

    pixel_count = PySequence_Length(ppixels);
    for (i=0; i<pixel_count; i++) {
        PyObject *ppixel, *pr, *pg, *pb;
        int r, g, b;

        ppixel = PySequence_ITEM(ppixels, i);
        pr = PySequence_ITEM(ppixel, 0);
        pg = PySequence_ITEM(ppixel, 1);
        pb = PySequence_ITEM(ppixel, 2);
        Py_DECREF(ppixel);
        r = PyLong_AsLong(pr);
        g = PyLong_AsLong(pg);
        b = PyLong_AsLong(pb);
        Py_DECREF(pr);
        Py_DECREF(pg);
        Py_DECREF(pb);

        totr += r;
        totg += g;
        totb += b;
    }

    Py_DECREF(ppixels);

    if (pixel_count) {
        totr /= pixel_count;
        totg /= pixel_count;
        totb /= pixel_count;
    }

    return inttuple(3, totr, totg, totb);
}

/* Returns the difference between the first block and the second.
 * It returns an absolute sum of the 3 differences (RGB).
 */
static int diff(PyObject *first, PyObject *second)
{
    int r1, g1, b1, r2, b2, g2;
    PyObject *pr, *pg, *pb;
    pr = PySequence_ITEM(first, 0);
    pg = PySequence_ITEM(first, 1);
    pb = PySequence_ITEM(first, 2);
    r1 = PyLong_AsLong(pr);
    g1 = PyLong_AsLong(pg);
    b1 = PyLong_AsLong(pb);
    Py_DECREF(pr);
    Py_DECREF(pg);
    Py_DECREF(pb);

    pr = PySequence_ITEM(second, 0);
    pg = PySequence_ITEM(second, 1);
    pb = PySequence_ITEM(second, 2);
    r2 = PyLong_AsLong(pr);
    g2 = PyLong_AsLong(pg);
    b2 = PyLong_AsLong(pb);
    Py_DECREF(pr);
    Py_DECREF(pg);
    Py_DECREF(pb);

    return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2);
}

PyDoc_STRVAR(block_getblocks2_doc,
"Returns a list of blocks (3-sized tuples).\n\
\n\
image: A PIL image to base the blocks on.\n\
block_count_per_side: This integer determines the number of blocks the function will return.\n\
If it is 10, for example, 100 blocks will be returned (10 width, 10 height). The blocks will not\n\
necessarily cover square areas. The area covered by each block will be proportional to the image\n\
itself.\n");

static PyObject* block_getblocks2(PyObject *self, PyObject *args)
{
    int block_count_per_side, width, height, block_width, block_height, ih;
    PyObject *image;
    PyObject *pimage_size, *pwidth, *pheight;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "Oi", &image, &block_count_per_side)) {
        return NULL;
    }

    pimage_size = PyObject_GetAttrString(image, "size");
    pwidth = PySequence_ITEM(pimage_size, 0);
    pheight = PySequence_ITEM(pimage_size, 1);
    width = PyLong_AsLong(pwidth);
    height = PyLong_AsLong(pheight);
    Py_DECREF(pimage_size);
    Py_DECREF(pwidth);
    Py_DECREF(pheight);

    if (!(width && height)) {
        return PyList_New(0);
    }

    block_width = max(width / block_count_per_side, 1);
    block_height = max(height / block_count_per_side, 1);

    result = PyList_New(block_count_per_side * block_count_per_side);
    if (result == NULL) {
        return NULL;
    }

    for (ih=0; ih<block_count_per_side; ih++) {
        int top, bottom, iw;
        top = min(ih*block_height, height-block_height);
        bottom = top + block_height;
        for (iw=0; iw<block_count_per_side; iw++) {
            int left, right;
            PyObject *pbox;
            PyObject *pmethodname;
            PyObject *pcrop;
            PyObject *pblock;

            left = min(iw*block_width, width-block_width);
            right = left + block_width;
            pbox = inttuple(4, left, top, right, bottom);
            pmethodname = PyUnicode_FromString("crop");
            pcrop = PyObject_CallMethodObjArgs(image, pmethodname, pbox, NULL);
            Py_DECREF(pmethodname);
            Py_DECREF(pbox);
            if (pcrop == NULL) {
                Py_DECREF(result);
                return NULL;
            }
            pblock = getblock(pcrop);
            Py_DECREF(pcrop);
            if (pblock == NULL) {
                Py_DECREF(result);
                return NULL;
            }
            PyList_SET_ITEM(result, ih*block_count_per_side+iw, pblock);
        }
    }

    return result;
}

PyDoc_STRVAR(block_avgdiff_doc,
"Returns the average diff between first blocks and seconds.\n\
\n\
If the result surpasses limit, limit + 1 is returned, except if less than min_iterations\n\
iterations have been made in the blocks.\n");

static PyObject* block_avgdiff(PyObject *self, PyObject *args)
{
    PyObject *first, *second;
    int limit, min_iterations;
    Py_ssize_t count;
    int sum, i, result;

    if (!PyArg_ParseTuple(args, "OOii", &first, &second, &limit, &min_iterations)) {
        return NULL;
    }

    count = PySequence_Length(first);
    if (count != PySequence_Length(second)) {
        PyErr_SetString(DifferentBlockCountError, "");
        return NULL;
    }
    if (!count) {
        PyErr_SetString(NoBlocksError, "");
        return NULL;
    }

    sum = 0;
    for (i=0; i<count; i++) {
        int iteration_count;
        PyObject *item1, *item2;

        iteration_count = i + 1;
        item1 = PySequence_ITEM(first, i);
        item2 = PySequence_ITEM(second, i);
        sum += diff(item1, item2);
        Py_DECREF(item1);
        Py_DECREF(item2);
        if ((sum > limit*iteration_count) && (iteration_count >= min_iterations)) {
            return PyLong_FromLong(limit + 1);
        }
    }

    result = sum / count;
    if (!result && sum) {
        result = 1;
    }
    return PyLong_FromLong(result);
}

static PyMethodDef BlockMethods[] = {
    {"getblocks2", block_getblocks2, METH_VARARGS, block_getblocks2_doc},
    {"avgdiff", block_avgdiff, METH_VARARGS, block_avgdiff_doc},
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static struct PyModuleDef BlockDef = {
    PyModuleDef_HEAD_INIT,
    "_block",
    NULL,
    -1,
    BlockMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyObject *
PyInit__block(void)
{
    PyObject *m = PyModule_Create(&BlockDef);
    if (m == NULL) {
        return NULL;
    }

    NoBlocksError = PyErr_NewException("_block.NoBlocksError", NULL, NULL);
    PyModule_AddObject(m, "NoBlocksError", NoBlocksError);
    DifferentBlockCountError = PyErr_NewException("_block.DifferentBlockCountError", NULL, NULL);
    PyModule_AddObject(m, "DifferentBlockCountError", DifferentBlockCountError);

    return m;
}

303
core/pe/modules/block_osx.m
Normal file
@@ -0,0 +1,303 @@
/* Created By: Virgil Dupras
 * Created On: 2010-02-04
 * Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
 *
 * This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
 * which should be included with this package. The terms are also available at
 * http://www.gnu.org/licenses/gpl-3.0.html
 */

#include "common.h"

#import <Foundation/Foundation.h>

#define RADIANS( degrees ) ( degrees * M_PI / 180 )

static CFStringRef
pystring2cfstring(PyObject *pystring)
{
    PyObject *encoded;
    UInt8 *s;
    CFIndex size;
    CFStringRef result;

    if (PyUnicode_Check(pystring)) {
        encoded = PyUnicode_AsUTF8String(pystring);
        if (encoded == NULL) {
            return NULL;
        }
    } else {
        encoded = pystring;
        Py_INCREF(encoded);
    }

    s = (UInt8*)PyBytes_AS_STRING(encoded);
    size = PyBytes_GET_SIZE(encoded);
    result = CFStringCreateWithBytes(NULL, s, size, kCFStringEncodingUTF8, FALSE);
    Py_DECREF(encoded);
    return result;
}

static PyObject* block_osx_get_image_size(PyObject *self, PyObject *args)
{
    PyObject *path;
    CFStringRef image_path;
    CFURLRef image_url;
    CGImageSourceRef source;
    CGImageRef image;
    long width, height;
    PyObject *pwidth, *pheight;
    PyObject *result;

    width = 0;
    height = 0;
    if (!PyArg_ParseTuple(args, "O", &path)) {
        return NULL;
    }

    image_path = pystring2cfstring(path);
    if (image_path == NULL) {
        return PyErr_NoMemory();
    }
    image_url = CFURLCreateWithFileSystemPath(NULL, image_path, kCFURLPOSIXPathStyle, FALSE);
    CFRelease(image_path);

    source = CGImageSourceCreateWithURL(image_url, NULL);
    CFRelease(image_url);
    if (source != NULL) {
        image = CGImageSourceCreateImageAtIndex(source, 0, NULL);
        if (image != NULL) {
            width = CGImageGetWidth(image);
            height = CGImageGetHeight(image);
            CGImageRelease(image);
        }
        CFRelease(source);
    }

    pwidth = PyLong_FromLong(width);
    if (pwidth == NULL) {
        return NULL;
    }
    pheight = PyLong_FromLong(height);
    if (pheight == NULL) {
        return NULL;
    }
    result = PyTuple_Pack(2, pwidth, pheight);
    Py_DECREF(pwidth);
    Py_DECREF(pheight);
    return result;
}

static CGContextRef
MyCreateBitmapContext(int width, int height)
{
    CGContextRef context = NULL;
    CGColorSpaceRef colorSpace;
    void *bitmapData;
    int bitmapByteCount;
    int bitmapBytesPerRow;

    bitmapBytesPerRow = (width * 4);
    bitmapByteCount = (bitmapBytesPerRow * height);

    colorSpace = CGColorSpaceCreateWithName(kCGColorSpaceGenericRGB);

    // calloc() must be used to allocate bitmapData here because the buffer has to be zeroed.
    // If it's not zeroed, when images with transparency are drawn in the context, this buffer
    // will stay with undefined pixels, which means that two pictures with the same pixels will
    // most likely have different blocks (which is not supposed to happen).
    bitmapData = calloc(bitmapByteCount, 1);
    if (bitmapData == NULL) {
        fprintf(stderr, "Memory not allocated!");
        return NULL;
    }

    context = CGBitmapContextCreate(bitmapData, width, height, 8, bitmapBytesPerRow, colorSpace,
        (CGBitmapInfo)kCGImageAlphaNoneSkipLast);
    if (context == NULL) {
        free(bitmapData);
        fprintf(stderr, "Context not created!");
        return NULL;
    }
    CGColorSpaceRelease(colorSpace);
    return context;
}

static PyObject* getblock(unsigned char *imageData, int imageWidth, int imageHeight, int boxX, int boxY, int boxW, int boxH)
{
    int i, j, totalR, totalG, totalB;

    totalR = totalG = totalB = 0;
    for (i=boxY; i<boxY+boxH; i++) {
        for (j=boxX; j<boxX+boxW; j++) {
            int offset = (i * imageWidth * 4) + (j * 4);
            totalR += *(imageData + offset);
            totalG += *(imageData + offset + 1);
            totalB += *(imageData + offset + 2);
        }
    }
    int pixelCount = boxH * boxW;
    totalR /= pixelCount;
    totalG /= pixelCount;
    totalB /= pixelCount;

    return inttuple(3, totalR, totalG, totalB);
}

static PyObject* block_osx_getblocks(PyObject *self, PyObject *args)
{
    PyObject *path, *result;
    CFStringRef image_path;
    CFURLRef image_url;
    CGImageSourceRef source;
    CGImageRef image;
    size_t width, height, image_width, image_height;
    int block_count, block_width, block_height, orientation, i;

    if (!PyArg_ParseTuple(args, "Oii", &path, &block_count, &orientation)) {
        return NULL;
    }

    if (PySequence_Length(path) == 0) {
        PyErr_SetString(PyExc_ValueError, "empty path");
        return NULL;
    }

    if ((orientation > 8) || (orientation < 0)) {
        orientation = 0; // simplifies checks later since we can only have values in 0-8
    }

    image_path = pystring2cfstring(path);
    if (image_path == NULL) {
        return PyErr_NoMemory();
    }
    image_url = CFURLCreateWithFileSystemPath(NULL, image_path, kCFURLPOSIXPathStyle, FALSE);
    CFRelease(image_path);

    source = CGImageSourceCreateWithURL(image_url, NULL);
    CFRelease(image_url);
    if (source == NULL) {
        return PyErr_NoMemory();
    }

    image = CGImageSourceCreateImageAtIndex(source, 0, NULL);
    if (image == NULL) {
        CFRelease(source);
        return PyErr_NoMemory();
    }

    width = image_width = CGImageGetWidth(image);
    height = image_height = CGImageGetHeight(image);
    if (orientation >= 5) {
        // orientations 5-8 rotate the photo sideways, so we have to swap width and height
        width = image_height;
        height = image_width;
    }

    CGContextRef context = MyCreateBitmapContext(width, height);

    if (orientation == 2) {
        // Flip X
        CGContextTranslateCTM(context, width, 0);
        CGContextScaleCTM(context, -1, 1);
    }
    else if (orientation == 3) {
        // Rot 180
        CGContextTranslateCTM(context, width, height);
        CGContextRotateCTM(context, RADIANS(180));
    }
    else if (orientation == 4) {
        // Flip Y
        CGContextTranslateCTM(context, 0, height);
        CGContextScaleCTM(context, 1, -1);
    }
    else if (orientation == 5) {
        // Flip X + Rot CW 90
        CGContextTranslateCTM(context, width, 0);
        CGContextScaleCTM(context, -1, 1);
        CGContextTranslateCTM(context, 0, height);
        CGContextRotateCTM(context, RADIANS(-90));
    }
    else if (orientation == 6) {
        // Rot CW 90
        CGContextTranslateCTM(context, 0, height);
        CGContextRotateCTM(context, RADIANS(-90));
    }
    else if (orientation == 7) {
        // Rot CCW 90 + Flip X
        CGContextTranslateCTM(context, width, 0);
        CGContextScaleCTM(context, -1, 1);
        CGContextTranslateCTM(context, width, 0);
        CGContextRotateCTM(context, RADIANS(90));
    }
    else if (orientation == 8) {
        // Rot CCW 90
        CGContextTranslateCTM(context, width, 0);
        CGContextRotateCTM(context, RADIANS(90));
    }
    CGRect myBoundingBox = CGRectMake(0, 0, image_width, image_height);
    CGContextDrawImage(context, myBoundingBox, image);
    unsigned char *bitmapData = CGBitmapContextGetData(context);
    CGContextRelease(context);

    CGImageRelease(image);
    CFRelease(source);
    if (bitmapData == NULL) {
        return PyErr_NoMemory();
    }

    block_width = max(width/block_count, 1);
    block_height = max(height/block_count, 1);

    result = PyList_New(block_count * block_count);
    if (result == NULL) {
        return NULL;
    }

    for (i=0; i<block_count; i++) {
        int j, top;
        top = min(i*block_height, height-block_height);
        for (j=0; j<block_count; j++) {
            int left;
            left = min(j*block_width, width-block_width);
            PyObject *block = getblock(bitmapData, width, height, left, top, block_width, block_height);
            if (block == NULL) {
                Py_DECREF(result);
                return NULL;
            }
            PyList_SET_ITEM(result, i*block_count+j, block);
        }
    }

    free(bitmapData);
    return result;
}

static PyMethodDef BlockOsxMethods[] = {
    {"get_image_size", block_osx_get_image_size, METH_VARARGS, ""},
    {"getblocks", block_osx_getblocks, METH_VARARGS, ""},
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static struct PyModuleDef BlockOsxDef = {
    PyModuleDef_HEAD_INIT,
    "_block_osx",
    NULL,
    -1,
    BlockOsxMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyObject *
PyInit__block_osx(void)
{
    PyObject *m = PyModule_Create(&BlockOsxDef);
    if (m == NULL) {
        return NULL;
    }
    return m;
}

95
core/pe/modules/cache.c
Normal file
@@ -0,0 +1,95 @@
/* Created By: Virgil Dupras
 * Created On: 2010-01-30
 * Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
 *
 * This software is licensed under the "BSD" License as described in the "LICENSE" file,
 * which should be included with this package. The terms are also available at
 * http://www.hardcoded.net/licenses/bsd_license
 */

#include "common.h"

/* I know that there's strtol out there, but it requires a pointer to
 * a char, which would in turn require me to buffer my chars around,
 * making the whole process slower.
 */
static long
xchar_to_long(char c)
{
    if ((c >= 48) && (c <= 57)) { /* 0-9 */
        return c - 48;
    }
    else if ((c >= 65) && (c <= 70)) { /* A-F */
        return c - 55;
    }
    else if ((c >= 97) && (c <= 102)) { /* a-f */
        return c - 87;
    }
    return 0;
}
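
/* For example (illustrative): xchar_to_long('0') == 0, xchar_to_long('A') == 10
 * and xchar_to_long('f') == 15; any character outside [0-9A-Fa-f] maps to 0. */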

static PyObject*
cache_string_to_colors(PyObject *self, PyObject *args)
{
    char *s;
    Py_ssize_t char_count, color_count, i;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "s#", &s, &char_count)) {
        return NULL;
    }

    color_count = (char_count / 6);
    result = PyList_New(color_count);
    if (result == NULL) {
        return NULL;
    }

    for (i=0; i<color_count; i++) {
        long r, g, b;
        Py_ssize_t ci;
        PyObject *color_tuple;

        ci = i * 6;
        r = (xchar_to_long(s[ci]) << 4) + xchar_to_long(s[ci+1]);
        g = (xchar_to_long(s[ci+2]) << 4) + xchar_to_long(s[ci+3]);
        b = (xchar_to_long(s[ci+4]) << 4) + xchar_to_long(s[ci+5]);

        color_tuple = inttuple(3, r, g, b);
        if (color_tuple == NULL) {
            Py_DECREF(result);
            return NULL;
        }
        PyList_SET_ITEM(result, i, color_tuple);
    }

    return result;
}

static PyMethodDef CacheMethods[] = {
    {"string_to_colors", cache_string_to_colors, METH_VARARGS,
     "Transform the string 's' into a list of 3-sized tuples."},
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static struct PyModuleDef CacheDef = {
    PyModuleDef_HEAD_INIT,
    "_cache",
    NULL,
    -1,
    CacheMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyObject *
PyInit__cache(void)
{
    PyObject *m = PyModule_Create(&CacheDef);
    if (m == NULL) {
        return NULL;
    }
    return m;
}

45
core/pe/modules/common.c
Normal file
@@ -0,0 +1,45 @@
|
||||
/* Created By: Virgil Dupras
 * Created On: 2010-02-04
 * Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
 *
 * This software is licensed under the "BSD" License as described in the "LICENSE" file,
 * which should be included with this package. The terms are also available at
 * http://www.hardcoded.net/licenses/bsd_license
 */

#include "common.h"

#ifndef _MSC_VER
int max(int a, int b)
{
    return b > a ? b : a;
}

int min(int a, int b)
{
    return b < a ? b : a;
}
#endif

/* Build an n-tuple of Python ints from long varargs. */
PyObject* inttuple(int n, ...)
{
    int i;
    PyObject *pnumber;
    PyObject *result;
    va_list numbers;

    va_start(numbers, n);
    result = PyTuple_New(n);
    if (result == NULL) {
        va_end(numbers);
        return NULL;
    }

    for (i=0; i<n; i++) {
        pnumber = PyLong_FromLong(va_arg(numbers, long));
        if (pnumber == NULL) {
            Py_DECREF(result);
            va_end(numbers);
            return NULL;
        }
        /* PyTuple_SET_ITEM steals the reference to pnumber. */
        PyTuple_SET_ITEM(result, i, pnumber);
    }

    va_end(numbers);
    return result;
}
20
core/pe/modules/common.h
Normal file
@@ -0,0 +1,20 @@
/* Created By: Virgil Dupras
 * Created On: 2010-02-04
 * Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
 *
 * This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
 * which should be included with this package. The terms are also available at
 * http://www.gnu.org/licenses/gpl-3.0.html
 */

#define PY_SSIZE_T_CLEAN
#include "Python.h"

/* It seems like MS VC defines min/max already */
#ifndef _MSC_VER
int max(int a, int b);
int min(int a, int b);
#endif

/* Create a tuple from a variable number of integer arguments. */
PyObject* inttuple(int n, ...);
106
core/pe/photo.py
Normal file
@@ -0,0 +1,106 @@
# Copyright 2016 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

import logging
from hscommon.util import get_file_ext, format_size

from core.util import format_timestamp, format_perc, format_dupe_count
from core import fs
from . import exif

# This global value is set by the platform-specific subclasser of the Photo base class
PLAT_SPECIFIC_PHOTO_CLASS = None

def format_dimensions(dimensions):
    return '%d x %d' % (dimensions[0], dimensions[1])

def get_delta_dimensions(value, ref_value):
    return (value[0]-ref_value[0], value[1]-ref_value[1])


class Photo(fs.File):
    INITIAL_INFO = fs.File.INITIAL_INFO.copy()
    INITIAL_INFO.update({
        'dimensions': (0, 0),
        'exif_timestamp': '',
    })
    __slots__ = fs.File.__slots__ + tuple(INITIAL_INFO.keys())

    # These extensions are supported on all platforms
    HANDLED_EXTS = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'tif'}

    def _plat_get_dimensions(self):
        raise NotImplementedError()

    def _plat_get_blocks(self, block_count_per_side, orientation):
        raise NotImplementedError()

    def _get_orientation(self):
        if not hasattr(self, '_cached_orientation'):
            try:
                with self.path.open('rb') as fp:
                    exifdata = exif.get_fields(fp)
                    # the value is a list (probably one-sized) of ints
                    orientations = exifdata['Orientation']
                    self._cached_orientation = orientations[0]
            except Exception: # Couldn't read EXIF data, no transforms
                self._cached_orientation = 0
        return self._cached_orientation

    def _get_exif_timestamp(self):
        try:
            with self.path.open('rb') as fp:
                exifdata = exif.get_fields(fp)
                return exifdata['DateTimeOriginal']
        except Exception:
            logging.info("Couldn't read EXIF of picture: %s", self.path)
            return ''

    @classmethod
    def can_handle(cls, path):
        return fs.File.can_handle(path) and get_file_ext(path.name) in cls.HANDLED_EXTS

    def get_display_info(self, group, delta):
        size = self.size
        mtime = self.mtime
        dimensions = self.dimensions
        m = group.get_match_of(self)
        if m:
            percentage = m.percentage
            dupe_count = 0
            if delta:
                r = group.ref
                size -= r.size
                mtime -= r.mtime
                dimensions = get_delta_dimensions(dimensions, r.dimensions)
        else:
            percentage = group.percentage
            dupe_count = len(group.dupes)
        dupe_folder_path = getattr(self, 'display_folder_path', self.folder_path)
        return {
            'name': self.name,
            'folder_path': str(dupe_folder_path),
            'size': format_size(size, 0, 1, False),
            'extension': self.extension,
            'dimensions': format_dimensions(dimensions),
            'exif_timestamp': self.exif_timestamp,
            'mtime': format_timestamp(mtime, delta and m),
            'percentage': format_perc(percentage),
            'dupe_count': format_dupe_count(dupe_count),
        }

    def _read_info(self, field):
        fs.File._read_info(self, field)
        if field == 'dimensions':
            self.dimensions = self._plat_get_dimensions()
            if self._get_orientation() in {5, 6, 7, 8}:
                self.dimensions = (self.dimensions[1], self.dimensions[0])
        elif field == 'exif_timestamp':
            self.exif_timestamp = self._get_exif_timestamp()

    def get_blocks(self, block_count_per_side):
        return self._plat_get_blocks(block_count_per_side, self._get_orientation())
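A note on the orientation handling in _read_info above (not part of the commit): EXIF orientation values 5 through 8 are the transposed variants, i.e. the image is rotated 90 or 270 degrees (5 and 7 additionally mirrored), so the pixel dimensions reported by the decoder are swapped relative to how the photo is meant to be displayed. A minimal standalone sketch of the same rule, with a function name of our choosing:

def display_dimensions(raw_dimensions, orientation):
    # Orientations 5-8 involve a 90/270 degree rotation, so the displayed
    # width and height are the raw values swapped; 0 means "unknown, no transform".
    if orientation in {5, 6, 7, 8}:
        return (raw_dimensions[1], raw_dimensions[0])
    return raw_dimensions

assert display_dimensions((4000, 3000), 6) == (3000, 4000)  # rotated 90 degrees
assert display_dimensions((4000, 3000), 1) == (4000, 3000)  # normal orientation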
31
core/pe/prioritize.py
Normal file
@@ -0,0 +1,31 @@
# Created On: 2011/09/16
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from hscommon.trans import trget

from core.prioritize import (
    KindCategory, FolderCategory, FilenameCategory, NumericalCategory,
    SizeCategory, MtimeCategory
)

coltr = trget('columns')

class DimensionsCategory(NumericalCategory):
    NAME = coltr("Dimensions")

    def extract_value(self, dupe):
        return dupe.dimensions

    def invert_numerical_value(self, value):
        width, height = value
        return (-width, -height)

def all_categories():
    return [
        KindCategory, FolderCategory, FilenameCategory, SizeCategory, DimensionsCategory,
        MtimeCategory
    ]
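A note on invert_numerical_value above (our reading, not part of the commit): numerical prioritization categories offer both "lowest" and "highest" orderings, and a "highest first" ordering can reuse a plain ascending sort by sorting on the inverted value; for a tuple like dimensions, negating each component reverses the lexicographic order. A standalone illustration:

dimensions = [(1024, 768), (4000, 3000), (1920, 1080)]

# Ascending sort on negated components puts the largest dimensions first.
highest_first = sorted(dimensions, key=lambda wh: (-wh[0], -wh[1]))
assert highest_first == [(4000, 3000), (1920, 1080), (1024, 768)]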
28
core/pe/result_table.py
Normal file
@@ -0,0 +1,28 @@
# Created On: 2011-11-27
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from hscommon.gui.column import Column
from hscommon.trans import trget

from core.gui.result_table import ResultTable as ResultTableBase

coltr = trget('columns')

class ResultTable(ResultTableBase):
    COLUMNS = [
        Column('marked', ''),
        Column('name', coltr("Filename")),
        Column('folder_path', coltr("Folder"), optional=True),
        Column('size', coltr("Size (KB)"), optional=True),
        Column('extension', coltr("Kind"), visible=False, optional=True),
        Column('dimensions', coltr("Dimensions"), optional=True),
        Column('exif_timestamp', coltr("EXIF Timestamp"), visible=False, optional=True),
        Column('mtime', coltr("Modification"), visible=False, optional=True),
        Column('percentage', coltr("Match %"), optional=True),
        Column('dupe_count', coltr("Dupe Count"), visible=False, optional=True),
    ]
    DELTA_COLUMNS = {'size', 'dimensions', 'mtime'}
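One reading note on DELTA_COLUMNS above (ours, not part of the commit): these are the columns whose displayed value becomes a difference against the group's reference file when delta view is enabled, which is what the delta branch of Photo.get_display_info computes. A tiny standalone illustration of the dimensions case:

# With delta view on, a 4000 x 3000 dupe of a 1920 x 1080 reference
# shows its dimensions as the difference against the reference:
ref, dupe = (1920, 1080), (4000, 3000)
delta = (dupe[0] - ref[0], dupe[1] - ref[1])
assert delta == (2080, 1920)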
32
core/pe/scanner.py
Normal file
@@ -0,0 +1,32 @@
# Copyright 2016 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from hscommon.trans import tr

from core.scanner import Scanner, ScanType, ScanOption

from . import matchblock, matchexif

class ScannerPE(Scanner):
    cache_path = None
    match_scaled = False
    threshold = 75

    @staticmethod
    def get_scan_options():
        return [
            ScanOption(ScanType.FuzzyBlock, tr("Contents")),
            ScanOption(ScanType.ExifTimestamp, tr("EXIF Timestamp")),
        ]

    def _getmatches(self, files, j):
        if self.scan_type == ScanType.FuzzyBlock:
            return matchblock.getmatches(files, self.cache_path, self.threshold, self.match_scaled, j)
        elif self.scan_type == ScanType.ExifTimestamp:
            return matchexif.getmatches(files, self.match_scaled, j)
        else:
            raise Exception("Invalid scan type")
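Not part of the commit, but as a usage sketch: a rough illustration of how a caller might drive ScannerPE. The function and variable names here are ours; in the real app the platform layer builds the Photo list and a job object for progress reporting, and goes through the Scanner base class's public API rather than calling _getmatches directly.

def scan_photos_sketch(photos, job):
    # Illustrative wiring only: configure a fuzzy-block scan and collect matches.
    scanner = ScannerPE()
    scanner.cache_path = 'blocks.db'        # illustrative cache location
    scanner.scan_type = ScanType.FuzzyBlock
    scanner.threshold = 80                  # keep matches scoring at least 80%
    return scanner._getmatches(photos, job)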