
Merge core_{se,me,pe} into core.{se,me,pe}

Author: Virgil Dupras
Date:   2016-05-31 22:32:37 -04:00
Commit: a65077f871 (parent d4919054f9)

39 changed files with 55 additions and 75 deletions


@@ -21,18 +21,8 @@ from hscommon.util import delete_if_empty, first, escape, nonone, allsame
 from hscommon.trans import tr
 from hscommon import desktop

-import core_se.fs
-import core_se.result_table
-import core_se.scanner
-import core_me.fs
-import core_me.prioritize
-import core_me.result_table
-import core_me.scanner
-import core_pe.photo
-import core_pe.prioritize
-import core_pe.result_table
-import core_pe.scanner
-from core_pe.photo import get_delta_dimensions
+from . import se, me, pe
+from .pe.photo import get_delta_dimensions
 from .util import cmp_value, fix_surrogate_encoding
 from . import directories, results, export, fs, prioritize
 from .ignore import IgnoreList
@@ -166,11 +156,11 @@ class DupeGuru(Broadcaster):
     #--- Private
     def _create_result_table(self):
         if self.app_mode == AppMode.Picture:
-            return core_pe.result_table.ResultTable(self)
+            return pe.result_table.ResultTable(self)
         elif self.app_mode == AppMode.Music:
-            return core_me.result_table.ResultTable(self)
+            return me.result_table.ResultTable(self)
         else:
-            return core_se.result_table.ResultTable(self)
+            return se.result_table.ResultTable(self)

     def _get_dupe_sort_key(self, dupe, get_group, key, delta):
         if self.app_mode in (AppMode.Music, AppMode.Picture):
@@ -335,9 +325,9 @@ class DupeGuru(Broadcaster):
     #--- Protected
     def _prioritization_categories(self):
         if self.app_mode == AppMode.Picture:
-            return core_pe.prioritize.all_categories()
+            return pe.prioritize.all_categories()
         elif self.app_mode == AppMode.Music:
-            return core_me.prioritize.all_categories()
+            return me.prioritize.all_categories()
         else:
             return prioritize.all_categories()
@@ -393,8 +383,7 @@ class DupeGuru(Broadcaster):
                 path = path.parent()

     def clear_picture_cache(self):
-        from core_pe.cache import Cache
-        cache = Cache(self.options['cache_path'])
+        cache = pe.cache.Cache(self.options['cache_path'])
         cache.clear()
         cache.close()
@@ -754,7 +743,7 @@ class DupeGuru(Broadcaster):
         def do(j):
             j.set_progress(0, tr("Collecting files to scan"))
             if scanner.scan_type == ScanType.Folders:
-                files = list(self.directories.get_folders(folderclass=core_se.fs.Folder, j=j))
+                files = list(self.directories.get_folders(folderclass=se.fs.Folder, j=j))
             else:
                 files = list(self.directories.get_files(fileclasses=self.fileclasses, j=j))
             if self.options['ignore_hardlink_matches']:
@@ -806,20 +795,20 @@ class DupeGuru(Broadcaster):
     @property
     def fileclasses(self):
         if self.app_mode == AppMode.Picture:
-            return [core_pe.photo.PLAT_SPECIFIC_PHOTO_CLASS]
+            return [pe.photo.PLAT_SPECIFIC_PHOTO_CLASS]
         elif self.app_mode == AppMode.Music:
-            return [core_me.fs.MusicFile]
+            return [me.fs.MusicFile]
         else:
-            return [core_se.fs.File]
+            return [se.fs.File]

     @property
     def SCANNER_CLASS(self):
         if self.app_mode == AppMode.Picture:
-            return core_pe.scanner.ScannerPE
+            return pe.scanner.ScannerPE
         elif self.app_mode == AppMode.Music:
-            return core_me.scanner.ScannerME
+            return me.scanner.ScannerME
         else:
-            return core_se.scanner.ScannerSE
+            return se.scanner.ScannerSE

     @property
     def METADATA_TO_READ(self):
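
The net effect of this commit: the standalone core_se, core_me and core_pe packages become core.se, core.me and core.pe. A minimal sketch of the dispatch pattern the properties above now use, assuming the post-merge layout (the standalone helper and the plain-string modes are illustrative, not the app's real AppMode values):

from core import se, me, pe  # post-merge subpackage layout

def scanner_class_for(app_mode):
    # Hypothetical helper mirroring DupeGuru.SCANNER_CLASS above.
    if app_mode == 'picture':
        return pe.scanner.ScannerPE
    elif app_mode == 'music':
        return me.scanner.ScannerME
    return se.scanner.ScannerSE

print(scanner_class_for('music'))  # <class 'core.me.scanner.ScannerME'>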

core/me/__init__.py Normal file

@@ -0,0 +1 @@
from . import fs, prioritize, result_table, scanner # noqa

core/me/fs.py Normal file

@@ -0,0 +1,104 @@
# Created By: Virgil Dupras
# Created On: 2009-10-23
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from hsaudiotag import auto
from hscommon.util import get_file_ext, format_size, format_time

from core.util import format_timestamp, format_perc, format_words, format_dupe_count
from core import fs

TAG_FIELDS = {
    'audiosize', 'duration', 'bitrate', 'samplerate', 'title', 'artist',
    'album', 'genre', 'year', 'track', 'comment'
}

class MusicFile(fs.File):
    INITIAL_INFO = fs.File.INITIAL_INFO.copy()
    INITIAL_INFO.update({
        'audiosize': 0,
        'bitrate': 0,
        'duration': 0,
        'samplerate': 0,
        'artist': '',
        'album': '',
        'title': '',
        'genre': '',
        'comment': '',
        'year': '',
        'track': 0,
    })
    __slots__ = fs.File.__slots__ + tuple(INITIAL_INFO.keys())

    @classmethod
    def can_handle(cls, path):
        if not fs.File.can_handle(path):
            return False
        return get_file_ext(path.name) in auto.EXT2CLASS

    def get_display_info(self, group, delta):
        size = self.size
        duration = self.duration
        bitrate = self.bitrate
        samplerate = self.samplerate
        mtime = self.mtime
        m = group.get_match_of(self)
        if m:
            percentage = m.percentage
            dupe_count = 0
            if delta:
                r = group.ref
                size -= r.size
                duration -= r.duration
                bitrate -= r.bitrate
                samplerate -= r.samplerate
                mtime -= r.mtime
        else:
            percentage = group.percentage
            dupe_count = len(group.dupes)
        dupe_folder_path = getattr(self, 'display_folder_path', self.folder_path)
        return {
            'name': self.name,
            'folder_path': str(dupe_folder_path),
            'size': format_size(size, 2, 2, False),
            'duration': format_time(duration, with_hours=False),
            'bitrate': str(bitrate),
            'samplerate': str(samplerate),
            'extension': self.extension,
            'mtime': format_timestamp(mtime, delta and m),
            'title': self.title,
            'artist': self.artist,
            'album': self.album,
            'genre': self.genre,
            'year': self.year,
            'track': str(self.track),
            'comment': self.comment,
            'percentage': format_perc(percentage),
            'words': format_words(self.words) if hasattr(self, 'words') else '',
            'dupe_count': format_dupe_count(dupe_count),
        }

    def _get_md5partial_offset_and_size(self):
        f = auto.File(str(self.path))
        return (f.audio_offset, f.audio_size)

    def _read_info(self, field):
        fs.File._read_info(self, field)
        if field in TAG_FIELDS:
            f = auto.File(str(self.path))
            self.audiosize = f.audio_size
            self.bitrate = f.bitrate
            self.duration = f.duration
            self.samplerate = f.sample_rate
            self.artist = f.artist
            self.album = f.album
            self.title = f.title
            self.genre = f.genre
            self.comment = f.comment
            self.year = f.year
            self.track = f.track
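
_get_md5partial_offset_and_size() is what lets the music edition hash only the audio stream, so two files whose tags differ but whose audio bytes match still hash the same. A self-contained sketch of that idea using hashlib (the offset/size values would come from hsaudiotag in the real code; here they are plain parameters):

import hashlib

def md5_of_slice(path, offset, size, chunk=1024 * 1024):
    # Hash only bytes [offset, offset+size) of a file, e.g. the audio
    # stream of an MP3, skipping the (mutable) tag regions.
    h = hashlib.md5()
    with open(path, 'rb') as fp:
        fp.seek(offset)
        remaining = size
        while remaining > 0:
            data = fp.read(min(chunk, remaining))
            if not data:
                break
            h.update(data)
            remaining -= len(data)
    return h.hexdigest()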

core/me/prioritize.py Normal file

@@ -0,0 +1,40 @@
# Created On: 2011/09/16
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from hscommon.trans import trget
from core.prioritize import (
    KindCategory, FolderCategory, FilenameCategory, NumericalCategory,
    SizeCategory, MtimeCategory
)

coltr = trget('columns')

class DurationCategory(NumericalCategory):
    NAME = coltr("Duration")

    def extract_value(self, dupe):
        return dupe.duration

class BitrateCategory(NumericalCategory):
    NAME = coltr("Bitrate")

    def extract_value(self, dupe):
        return dupe.bitrate

class SamplerateCategory(NumericalCategory):
    NAME = coltr("Samplerate")

    def extract_value(self, dupe):
        return dupe.samplerate

def all_categories():
    return [
        KindCategory, FolderCategory, FilenameCategory, SizeCategory, DurationCategory,
        BitrateCategory, SamplerateCategory, MtimeCategory
    ]
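
Each category pulls one comparable value out of a dupe; NumericalCategory (from core.prioritize) handles the ordering so that, when selected, higher values win. A sketch of extending the list with one more criterion under the same pattern (hypothetical, not part of this commit):

class TrackCategory(NumericalCategory):
    # Hypothetical extra criterion following the three-line pattern above.
    NAME = coltr("Track Number")

    def extract_value(self, dupe):
        return dupe.track  # MusicFile.track is an int, so it sorts numerically

# It would then be appended to the list returned by all_categories().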

core/me/result_table.py Normal file

@@ -0,0 +1,37 @@
# Created On: 2011-11-27
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from hscommon.gui.column import Column
from hscommon.trans import trget

from core.gui.result_table import ResultTable as ResultTableBase

coltr = trget('columns')

class ResultTable(ResultTableBase):
    COLUMNS = [
        Column('marked', ''),
        Column('name', coltr("Filename")),
        Column('folder_path', coltr("Folder"), visible=False, optional=True),
        Column('size', coltr("Size (MB)"), optional=True),
        Column('duration', coltr("Time"), optional=True),
        Column('bitrate', coltr("Bitrate"), optional=True),
        Column('samplerate', coltr("Sample Rate"), visible=False, optional=True),
        Column('extension', coltr("Kind"), optional=True),
        Column('mtime', coltr("Modification"), visible=False, optional=True),
        Column('title', coltr("Title"), visible=False, optional=True),
        Column('artist', coltr("Artist"), visible=False, optional=True),
        Column('album', coltr("Album"), visible=False, optional=True),
        Column('genre', coltr("Genre"), visible=False, optional=True),
        Column('year', coltr("Year"), visible=False, optional=True),
        Column('track', coltr("Track Number"), visible=False, optional=True),
        Column('comment', coltr("Comment"), visible=False, optional=True),
        Column('percentage', coltr("Match %"), optional=True),
        Column('words', coltr("Words Used"), visible=False, optional=True),
        Column('dupe_count', coltr("Dupe Count"), visible=False, optional=True),
    ]
    DELTA_COLUMNS = {'size', 'duration', 'bitrate', 'samplerate', 'mtime'}

core/me/scanner.py Normal file

@@ -0,0 +1,27 @@
# Copyright 2016 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from hscommon.trans import tr

from core.scanner import Scanner as ScannerBase, ScanOption, ScanType

class ScannerME(ScannerBase):
    @staticmethod
    def _key_func(dupe):
        return (-dupe.bitrate, -dupe.size)

    @staticmethod
    def get_scan_options():
        return [
            ScanOption(ScanType.Filename, tr("Filename")),
            ScanOption(ScanType.Fields, tr("Filename - Fields")),
            ScanOption(ScanType.FieldsNoOrder, tr("Filename - Fields (No Order)")),
            ScanOption(ScanType.Tag, tr("Tags")),
            ScanOption(ScanType.Contents, tr("Contents")),
            ScanOption(ScanType.ContentsAudio, tr("Audio Contents")),
        ]
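
_key_func decides which file in a duplicate group becomes the reference: ordering by (-bitrate, -size) puts the highest-bitrate, then largest, file first. A quick illustration with stand-in objects (the namedtuple is illustrative, not the real file class):

from collections import namedtuple

Dupe = namedtuple('Dupe', 'name bitrate size')  # stand-in for real file objects

dupes = [
    Dupe('a.mp3', 128, 3_000_000),
    Dupe('b.mp3', 320, 7_500_000),
    Dupe('c.mp3', 320, 7_400_000),
]
# Same key as ScannerME._key_func: the best candidate sorts first.
best = min(dupes, key=lambda d: (-d.bitrate, -d.size))
print(best.name)  # b.mp3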

core/pe/__init__.py Normal file

@@ -0,0 +1 @@
from . import block, cache, exif, iphoto_plist, matchblock, matchexif, photo, prioritize, result_table, scanner # noqa

core/pe/block.py Normal file

@@ -0,0 +1,122 @@
# Created By: Virgil Dupras
# Created On: 2006/09/01
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from ._block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2 # NOQA

# Converted to C
# def getblock(image):
#     """Returns a 3 sized tuple containing the mean color of 'image'.
#
#     image: a PIL image or crop.
#     """
#     if image.size[0]:
#         pixel_count = image.size[0] * image.size[1]
#         red = green = blue = 0
#         for r, g, b in image.getdata():
#             red += r
#             green += g
#             blue += b
#         return (red // pixel_count, green // pixel_count, blue // pixel_count)
#     else:
#         return (0, 0, 0)

# This is not used anymore
# def getblocks(image, blocksize):
#     """Returns a list of blocks (3 sized tuples).
#
#     image: A PIL image to base the blocks on.
#     blocksize: The size of the blocks to be created. This is a single integer, defining
#     both width and height (blocks are square).
#     """
#     if min(image.size) < blocksize:
#         return ()
#     result = []
#     for i in xrange(image.size[1] // blocksize):
#         for j in xrange(image.size[0] // blocksize):
#             box = (blocksize * j, blocksize * i, blocksize * (j + 1), blocksize * (i + 1))
#             crop = image.crop(box)
#             result.append(getblock(crop))
#     return result

# Converted to C
# def getblocks2(image, block_count_per_side):
#     """Returns a list of blocks (3 sized tuples).
#
#     image: A PIL image to base the blocks on.
#     block_count_per_side: This integer determines the number of blocks the function will return.
#     If it is 10, for example, 100 blocks will be returned (10 width, 10 height). The blocks will
#     not necessarily cover square areas. The area covered by each block will be proportional to
#     the image itself.
#     """
#     if not image.size[0]:
#         return []
#     width, height = image.size
#     block_width = max(width // block_count_per_side, 1)
#     block_height = max(height // block_count_per_side, 1)
#     result = []
#     for ih in range(block_count_per_side):
#         top = min(ih * block_height, height - block_height)
#         bottom = top + block_height
#         for iw in range(block_count_per_side):
#             left = min(iw * block_width, width - block_width)
#             right = left + block_width
#             box = (left, top, right, bottom)
#             crop = image.crop(box)
#             result.append(getblock(crop))
#     return result

# Converted to C
# def diff(first, second):
#     """Returns the difference between the first block and the second.
#
#     It returns an absolute sum of the 3 differences (RGB).
#     """
#     r1, g1, b1 = first
#     r2, g2, b2 = second
#     return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)

# Converted to C
# def avgdiff(first, second, limit=768, min_iterations=1):
#     """Returns the average diff between first blocks and seconds.
#
#     If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
#     iterations have been made in the blocks.
#     """
#     if len(first) != len(second):
#         raise DifferentBlockCountError
#     if not first:
#         raise NoBlocksError
#     count = len(first)
#     sum = 0
#     zipped = izip(xrange(1, count + 1), first, second)
#     for i, first, second in zipped:
#         sum += diff(first, second)
#         if sum > limit * i and i >= min_iterations:
#             return limit + 1
#     result = sum // count
#     if (not result) and sum:
#         result = 1
#     return result

# This is not used anymore
# def maxdiff(first, second, limit=768):
#     """Returns the max diff between first blocks and seconds.
#
#     If the result surpasses limit, the first max being over limit is returned.
#     """
#     if len(first) != len(second):
#         raise DifferentBlockCountError
#     if not first:
#         raise NoBlocksError
#     result = 0
#     zipped = zip(first, second)
#     for first, second in zipped:
#         result = max(result, diff(first, second))
#         if result > limit:
#             return result
#     return result
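
The commented-out reference code documents what the C _block module computes. A runnable, self-contained sketch of the same diff/avgdiff logic (pure Python, following the docstrings above; plain ValueError stands in for the module's exception classes):

def diff(first, second):
    r1, g1, b1 = first
    r2, g2, b2 = second
    return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)

def avgdiff(first, second, limit=768, min_iterations=1):
    if len(first) != len(second):
        raise ValueError("different block counts")  # DifferentBlockCountError in _block
    if not first:
        raise ValueError("no blocks")               # NoBlocksError in _block
    total = 0
    for i, (a, b) in enumerate(zip(first, second), start=1):
        total += diff(a, b)
        if total > limit * i and i >= min_iterations:
            return limit + 1  # early exit: already too different to matter
    result = total // len(first)
    if not result and total:
        result = 1  # never report 0 for images that differ at all
    return result

# Two nearly identical 2-block images -> small average difference.
print(avgdiff([(10, 20, 30), (40, 50, 60)], [(11, 20, 30), (40, 52, 60)]))  # 1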

core/pe/cache.py Normal file

@@ -0,0 +1,162 @@
# Created By: Virgil Dupras
# Created On: 2006/09/14
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

import os
import os.path as op
import logging
import sqlite3 as sqlite

from ._cache import string_to_colors

def colors_to_string(colors):
    """Transform the 3 sized tuples 'colors' into a hex string.

    [(0,100,255)] --> 0064ff
    [(1,2,3),(4,5,6)] --> 010203040506
    """
    return ''.join(['%02x%02x%02x' % (r, g, b) for r, g, b in colors])

# This function is an important bottleneck of dupeGuru PE. It has been converted to C.
# def string_to_colors(s):
#     """Transform the string 's' in a list of 3 sized tuples.
#     """
#     result = []
#     for i in xrange(0, len(s), 6):
#         number = int(s[i:i+6], 16)
#         result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
#     return result

class Cache:
    """A class to cache picture blocks.
    """
    def __init__(self, db=':memory:'):
        self.dbname = db
        self.con = None
        self._create_con()

    def __contains__(self, key):
        sql = "select count(*) from pictures where path = ?"
        result = self.con.execute(sql, [key]).fetchall()
        return result[0][0] > 0

    def __delitem__(self, key):
        if key not in self:
            raise KeyError(key)
        sql = "delete from pictures where path = ?"
        self.con.execute(sql, [key])

    # Optimized
    def __getitem__(self, key):
        if isinstance(key, int):
            sql = "select blocks from pictures where rowid = ?"
        else:
            sql = "select blocks from pictures where path = ?"
        result = self.con.execute(sql, [key]).fetchone()
        if result:
            result = string_to_colors(result[0])
            return result
        else:
            raise KeyError(key)

    def __iter__(self):
        sql = "select path from pictures"
        result = self.con.execute(sql)
        return (row[0] for row in result)

    def __len__(self):
        sql = "select count(*) from pictures"
        result = self.con.execute(sql).fetchall()
        return result[0][0]

    def __setitem__(self, path_str, blocks):
        blocks = colors_to_string(blocks)
        if op.exists(path_str):
            mtime = int(os.stat(path_str).st_mtime)
        else:
            mtime = 0
        if path_str in self:
            sql = "update pictures set blocks = ?, mtime = ? where path = ?"
        else:
            sql = "insert into pictures(blocks,mtime,path) values(?,?,?)"
        try:
            self.con.execute(sql, [blocks, mtime, path_str])
        except sqlite.OperationalError:
            logging.warning('Picture cache could not set value for key %r', path_str)
        except sqlite.DatabaseError as e:
            logging.warning('DatabaseError while setting value for key %r: %s', path_str, str(e))

    def _create_con(self, second_try=False):
        def create_tables():
            logging.debug("Creating picture cache tables.")
            self.con.execute("drop table if exists pictures")
            self.con.execute("drop index if exists idx_path")
            self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)")
            self.con.execute("create index idx_path on pictures (path)")

        self.con = sqlite.connect(self.dbname, isolation_level=None)
        try:
            self.con.execute("select path, mtime, blocks from pictures where 1=2")
        except sqlite.OperationalError: # new db
            create_tables()
        except sqlite.DatabaseError as e: # corrupted db
            if second_try:
                raise # Something really strange is happening
            logging.warning('Could not create picture cache because of an error: %s', str(e))
            self.con.close()
            os.remove(self.dbname)
            self._create_con(second_try=True)

    def clear(self):
        self.close()
        if self.dbname != ':memory:':
            os.remove(self.dbname)
        self._create_con()

    def close(self):
        if self.con is not None:
            self.con.close()
        self.con = None

    def filter(self, func):
        to_delete = [key for key in self if not func(key)]
        for key in to_delete:
            del self[key]

    def get_id(self, path):
        sql = "select rowid from pictures where path = ?"
        result = self.con.execute(sql, [path]).fetchone()
        if result:
            return result[0]
        else:
            raise ValueError(path)

    def get_multiple(self, rowids):
        sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids))
        cur = self.con.execute(sql)
        return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)

    def purge_outdated(self):
        """Go through the cache and purge outdated records.

        A record is outdated if the picture doesn't exist or if its mtime is greater than the one
        in the db.
        """
        todelete = []
        sql = "select rowid, path, mtime from pictures"
        cur = self.con.execute(sql)
        for rowid, path_str, mtime in cur:
            if mtime and op.exists(path_str):
                picture_mtime = os.stat(path_str).st_mtime
                if int(picture_mtime) <= mtime:
                    # not outdated
                    continue
            todelete.append(rowid)
        if todelete:
            sql = "delete from pictures where rowid in (%s)" % ','.join(map(str, todelete))
            self.con.execute(sql)
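
The cache stores each picture's blocks as one hex string, three bytes per block, which keeps the sqlite schema trivial. A round-trip sketch pairing colors_to_string above with a pure-Python decoder equivalent to the C string_to_colors (per the commented-out reference implementation):

def colors_to_string(colors):
    return ''.join('%02x%02x%02x' % (r, g, b) for r, g, b in colors)

def string_to_colors(s):
    # Pure-Python equivalent of the C decoder in _cache.
    result = []
    for i in range(0, len(s), 6):
        number = int(s[i:i + 6], 16)
        result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
    return result

blocks = [(0, 100, 255), (1, 2, 3)]
encoded = colors_to_string(blocks)
assert encoded == '0064ff010203'
assert string_to_colors(encoded) == blocks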

core/pe/exif.py Normal file

@@ -0,0 +1,335 @@
# Created By: Virgil Dupras
# Created On: 2011-04-20
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
# Heavily based on http://topo.math.u-psud.fr/~bousch/exifdump.py by Thierry Bousch (Public Domain)
import logging
EXIF_TAGS = {
0x0100: "ImageWidth",
0x0101: "ImageLength",
0x0102: "BitsPerSample",
0x0103: "Compression",
0x0106: "PhotometricInterpretation",
0x010A: "FillOrder",
0x010D: "DocumentName",
0x010E: "ImageDescription",
0x010F: "Make",
0x0110: "Model",
0x0111: "StripOffsets",
0x0112: "Orientation",
0x0115: "SamplesPerPixel",
0x0116: "RowsPerStrip",
0x0117: "StripByteCounts",
0x011A: "XResolution",
0x011B: "YResolution",
0x011C: "PlanarConfiguration",
0x0128: "ResolutionUnit",
0x012D: "TransferFunction",
0x0131: "Software",
0x0132: "DateTime",
0x013B: "Artist",
0x013E: "WhitePoint",
0x013F: "PrimaryChromaticities",
0x0156: "TransferRange",
0x0200: "JPEGProc",
0x0201: "JPEGInterchangeFormat",
0x0202: "JPEGInterchangeFormatLength",
0x0211: "YCbCrCoefficients",
0x0212: "YCbCrSubSampling",
0x0213: "YCbCrPositioning",
0x0214: "ReferenceBlackWhite",
0x828F: "BatteryLevel",
0x8298: "Copyright",
0x829A: "ExposureTime",
0x829D: "FNumber",
0x83BB: "IPTC/NAA",
0x8769: "ExifIFDPointer",
0x8773: "InterColorProfile",
0x8822: "ExposureProgram",
0x8824: "SpectralSensitivity",
0x8825: "GPSInfoIFDPointer",
0x8827: "ISOSpeedRatings",
0x8828: "OECF",
0x9000: "ExifVersion",
0x9003: "DateTimeOriginal",
0x9004: "DateTimeDigitized",
0x9101: "ComponentsConfiguration",
0x9102: "CompressedBitsPerPixel",
0x9201: "ShutterSpeedValue",
0x9202: "ApertureValue",
0x9203: "BrightnessValue",
0x9204: "ExposureBiasValue",
0x9205: "MaxApertureValue",
0x9206: "SubjectDistance",
0x9207: "MeteringMode",
0x9208: "LightSource",
0x9209: "Flash",
0x920A: "FocalLength",
0x9214: "SubjectArea",
0x927C: "MakerNote",
0x9286: "UserComment",
0x9290: "SubSecTime",
0x9291: "SubSecTimeOriginal",
0x9292: "SubSecTimeDigitized",
0xA000: "FlashPixVersion",
0xA001: "ColorSpace",
0xA002: "PixelXDimension",
0xA003: "PixelYDimension",
0xA004: "RelatedSoundFile",
0xA005: "InteroperabilityIFDPointer",
0xA20B: "FlashEnergy", # 0x920B in TIFF/EP
0xA20C: "SpatialFrequencyResponse", # 0x920C - -
0xA20E: "FocalPlaneXResolution", # 0x920E - -
0xA20F: "FocalPlaneYResolution", # 0x920F - -
0xA210: "FocalPlaneResolutionUnit", # 0x9210 - -
0xA214: "SubjectLocation", # 0x9214 - -
0xA215: "ExposureIndex", # 0x9215 - -
0xA217: "SensingMethod", # 0x9217 - -
0xA300: "FileSource",
0xA301: "SceneType",
0xA302: "CFAPattern", # 0x828E in TIFF/EP
0xA401: "CustomRendered",
0xA402: "ExposureMode",
0xA403: "WhiteBalance",
0xA404: "DigitalZoomRatio",
0xA405: "FocalLengthIn35mmFilm",
0xA406: "SceneCaptureType",
0xA407: "GainControl",
0xA408: "Contrast",
0xA409: "Saturation",
0xA40A: "Sharpness",
0xA40B: "DeviceSettingDescription",
0xA40C: "SubjectDistanceRange",
0xA420: "ImageUniqueID",
}
INTR_TAGS = {
0x0001: "InteroperabilityIndex",
0x0002: "InteroperabilityVersion",
0x1000: "RelatedImageFileFormat",
0x1001: "RelatedImageWidth",
0x1002: "RelatedImageLength",
}
GPS_TA0GS = {
0x00: "GPSVersionID",
0x01: "GPSLatitudeRef",
0x02: "GPSLatitude",
0x03: "GPSLongitudeRef",
0x04: "GPSLongitude",
0x05: "GPSAltitudeRef",
0x06: "GPSAltitude",
0x07: "GPSTimeStamp",
0x08: "GPSSatellites",
0x09: "GPSStatus",
0x0A: "GPSMeasureMode",
0x0B: "GPSDOP",
0x0C: "GPSSpeedRef",
0x0D: "GPSSpeed",
0x0E: "GPSTrackRef",
0x0F: "GPSTrack",
0x10: "GPSImgDirectionRef",
0x11: "GPSImgDirection",
0x12: "GPSMapDatum",
0x13: "GPSDestLatitudeRef",
0x14: "GPSDestLatitude",
0x15: "GPSDestLongitudeRef",
0x16: "GPSDestLongitude",
0x17: "GPSDestBearingRef",
0x18: "GPSDestBearing",
0x19: "GPSDestDistanceRef",
0x1A: "GPSDestDistance",
0x1B: "GPSProcessingMethod",
0x1C: "GPSAreaInformation",
0x1D: "GPSDateStamp",
0x1E: "GPSDifferential"
}
INTEL_ENDIAN = ord('I')
MOTOROLA_ENDIAN = ord('M')
# About MAX_COUNT: It's possible to have corrupted exif tags where the entry count is way too high
# and thus makes us loop, not endlessly, but for heck of a long time for nothing. Therefore, we put
# an arbitrary limit on the entry count we'll allow ourselves to read and any IFD reporting more
# entries than that will be considered corrupt.
MAX_COUNT = 0xffff
def s2n_motorola(bytes):
    x = 0
    for c in bytes:
        x = (x << 8) | c
    return x

def s2n_intel(bytes):
    x = 0
    y = 0
    for c in bytes:
        x = x | (c << y)
        y = y + 8
    return x

class Fraction:
    def __init__(self, num, den):
        self.num = num
        self.den = den

    def __repr__(self):
        return '%d/%d' % (self.num, self.den)

class TIFF_file:
    def __init__(self, data):
        self.data = data
        self.endian = data[0]
        self.s2nfunc = s2n_intel if self.endian == INTEL_ENDIAN else s2n_motorola

    def s2n(self, offset, length, signed=0, debug=False):
        slice = self.data[offset:offset+length]
        val = self.s2nfunc(slice)
        # Sign extension?
        if signed:
            msb = 1 << (8*length - 1)
            if val & msb:
                val = val - (msb << 1)
        if debug:
            logging.debug(self.endian)
            logging.debug("Slice for offset %d length %d: %r and value: %d", offset, length, slice, val)
        return val

    def first_IFD(self):
        return self.s2n(4, 4)

    def next_IFD(self, ifd):
        entries = self.s2n(ifd, 2)
        return self.s2n(ifd + 2 + 12 * entries, 4)

    def list_IFDs(self):
        i = self.first_IFD()
        a = []
        while i:
            a.append(i)
            i = self.next_IFD(i)
        return a

    def dump_IFD(self, ifd):
        entries = self.s2n(ifd, 2)
        logging.debug("Entries for IFD %d: %d", ifd, entries)
        if entries > MAX_COUNT:
            logging.debug("Probably corrupt. Aborting.")
            return []
        a = []
        for i in range(entries):
            entry = ifd + 2 + 12*i
            tag = self.s2n(entry, 2)
            type = self.s2n(entry+2, 2)
            if not 1 <= type <= 10:
                continue # not handled
            typelen = [1, 1, 2, 4, 8, 1, 1, 2, 4, 8][type-1]
            count = self.s2n(entry+4, 4)
            if count > MAX_COUNT:
                logging.debug("Probably corrupt. Aborting.")
                return []
            offset = entry+8
            if count*typelen > 4:
                offset = self.s2n(offset, 4)
            if type == 2:
                # Special case: nul-terminated ASCII string
                values = str(self.data[offset:offset+count-1], encoding='latin-1')
            else:
                values = []
                signed = (type == 6 or type >= 8)
                for j in range(count):
                    if type in {5, 10}:
                        # The type is either 5 or 10
                        value_j = Fraction(self.s2n(offset, 4, signed),
                                           self.s2n(offset+4, 4, signed))
                    else:
                        # Not a fraction
                        value_j = self.s2n(offset, typelen, signed)
                    values.append(value_j)
                    offset = offset + typelen
            # Now "values" is either a string or an array
            a.append((tag, type, values))
        return a

def read_exif_header(fp):
    # If `fp`'s first bytes are not exif, it tries to find it in the next 4kb
    def isexif(data):
        return data[0:4] == b'\377\330\377\341' and data[6:10] == b'Exif'

    data = fp.read(12)
    if isexif(data):
        return data
    # ok, not exif, try to find it
    large_data = fp.read(4096)
    try:
        index = large_data.index(b'Exif')
        data = large_data[index-6:index+6]
        # large_data omits the first 12 bytes, and the index is at the middle of the header, so we
        # must seek index + 18
        fp.seek(index+18)
        return data
    except ValueError:
        raise ValueError("Not an Exif file")

def get_fields(fp):
    data = read_exif_header(fp)
    length = data[4] * 256 + data[5]
    logging.debug("Exif header length: %d bytes", length)
    data = fp.read(length-8)
    data_format = data[0]
    logging.debug("%s format", {INTEL_ENDIAN: 'Intel', MOTOROLA_ENDIAN: 'Motorola'}[data_format])
    T = TIFF_file(data)
    # There may be more than one IFD per file, but we only read the first one because others are
    # most likely thumbnails.
    main_IFD_offset = T.first_IFD()
    result = {}

    def add_tag_to_result(tag, values):
        try:
            stag = EXIF_TAGS[tag]
        except KeyError:
            stag = '0x%04X' % tag
        if stag in result:
            return # don't overwrite data
        result[stag] = values

    logging.debug("IFD at offset %d", main_IFD_offset)
    IFD = T.dump_IFD(main_IFD_offset)
    exif_off = gps_off = 0
    for tag, type, values in IFD:
        if tag == 0x8769:
            exif_off = values[0]
            continue
        if tag == 0x8825:
            gps_off = values[0]
            continue
        add_tag_to_result(tag, values)
    if exif_off:
        logging.debug("Exif SubIFD at offset %d:", exif_off)
        IFD = T.dump_IFD(exif_off)
        # Recent digital cameras have a little subdirectory
        # here, pointed to by tag 0xA005. Apparently, it's the
        # "Interoperability IFD", defined in Exif 2.1 and DCF.
        intr_off = 0
        for tag, type, values in IFD:
            if tag == 0xA005:
                intr_off = values[0]
                continue
            add_tag_to_result(tag, values)
        if intr_off:
            logging.debug("Exif Interoperability SubSubIFD at offset %d:", intr_off)
            IFD = T.dump_IFD(intr_off)
            for tag, type, values in IFD:
                add_tag_to_result(tag, values)
    if gps_off:
        logging.debug("GPS SubIFD at offset %d:", gps_off)
        IFD = T.dump_IFD(gps_off)
        for tag, type, values in IFD:
            add_tag_to_result(tag, values)
    return result
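
How this module gets used in practice: open a photo in binary mode and read the merged tag dictionary. A small sketch, assuming the post-merge import path (the file name is hypothetical; keys are only present when the camera wrote them, and a ValueError("Not an Exif file") is raised when no EXIF header is found):

from core.pe.exif import get_fields  # post-merge import path

with open('photo.jpg', 'rb') as fp:  # path is illustrative
    fields = get_fields(fp)

print(fields.get('Orientation'))       # e.g. [6]: rotated 90 degrees CW
print(fields.get('DateTimeOriginal'))  # e.g. '2016:05:31 22:32:37'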

core/pe/iphoto_plist.py Normal file

@@ -0,0 +1,31 @@
# Created By: Virgil Dupras
# Created On: 2014-03-15
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

import plistlib

class IPhotoPlistParser(plistlib._PlistParser):
    """A parser for iPhoto plists.

    iPhoto plists tend to be malformed, so we have to subclass the built-in parser to be a bit
    more lenient.
    """
    def __init__(self):
        plistlib._PlistParser.__init__(self, use_builtin_types=True, dict_type=dict)
        # For debugging purposes, we remember the last bit of data to be analyzed so that we can
        # log it in case of an exception
        self.lastdata = ''

    def get_data(self):
        self.lastdata = plistlib._PlistParser.get_data(self)
        return self.lastdata

    def end_integer(self):
        try:
            self.add_object(int(self.get_data()))
        except ValueError:
            self.add_object(0)

core/pe/matchblock.py Normal file

@@ -0,0 +1,222 @@
# Created By: Virgil Dupras
# Created On: 2007/02/25
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

import logging
import multiprocessing
from itertools import combinations

from hscommon.util import extract, iterconsume
from hscommon.trans import tr
from hscommon.jobprogress import job

from core.engine import Match
from .block import avgdiff, DifferentBlockCountError, NoBlocksError
from .cache import Cache

# OPTIMIZATION NOTES:
# The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another
# bottleneck that shows up when a lot of pictures are involved is Disk IO's because blocks
# constantly have to be read from disks by subprocesses. This problem is especially big on CPUs
# with a lot of cores. Therefore, we must minimize Disk IOs. The best way to achieve that is to
# separate the files to scan in "chunks" and it's by chunk that blocks are read in memory and
# compared to each other. Each file in a chunk has to be compared to each other, of course, but also
# to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage
# is that instead of reading blocks from disk number_of_files**2 times, we read it
# number_of_files*number_of_chunks times.
# Determining the right chunk size is tricky, because if it's too big, too many blocks will be in
# memory at the same time and we might end up with memory trashing, which is awfully slow. So,
# because our *real* bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't
# starved by Disk IOs.

MIN_ITERATIONS = 3
BLOCK_COUNT_PER_SIDE = 15
DEFAULT_CHUNK_SIZE = 1000
MIN_CHUNK_SIZE = 100

# Enough so that we're sure that the main thread will not wait after a result.get() call
# cpucount+1 should be enough to be sure that the spawned process will not wait after the results
# collection made by the main process.
try:
    RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() + 1
except Exception:
    # I had an IOError on app launch once. It seems to be a freak occurrence. In any case, we want
    # the app to launch, so let's just put an arbitrary value.
    logging.warning("Had problems to determine cpu count on launch.")
    RESULTS_QUEUE_LIMIT = 8

def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
    # The MemoryError handlers in there use logging without first caring about whether or not
    # there is enough memory left to carry on the operation because it is assumed that the
    # MemoryError happens when trying to read an image file, which is freed from memory by the
    # time that MemoryError is raised.
    cache = Cache(cache_path)
    cache.purge_outdated()
    prepared = [] # only pictures for which there was no error getting blocks
    try:
        for picture in j.iter_with_progress(pictures, tr("Analyzed %d/%d pictures")):
            if not picture.path:
                # XXX Find the root cause of this. I've received reports of crashes where we had
                # "Analyzing picture at " (without a path) in the debug log. It was an iPhoto scan.
                # For now, I'm simply working around the crash by ignoring those, but it would be
                # interesting to know exactly why this happens. I'm suspecting a malformed
                # entry in iPhoto library.
                logging.warning("We have a picture with a null path here")
                continue
            picture.unicode_path = str(picture.path)
            logging.debug("Analyzing picture at %s", picture.unicode_path)
            if with_dimensions:
                picture.dimensions # pre-read dimensions
            try:
                if picture.unicode_path not in cache:
                    blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
                    cache[picture.unicode_path] = blocks
                prepared.append(picture)
            except (IOError, ValueError) as e:
                logging.warning(str(e))
            except MemoryError:
                logging.warning("Ran out of memory while reading %s of size %d", picture.unicode_path, picture.size)
                if picture.size < 10 * 1024 * 1024: # We're really running out of memory
                    raise
    except MemoryError:
        logging.warning('Ran out of memory while preparing pictures')
    cache.close()
    return prepared

def get_chunks(pictures):
    min_chunk_count = multiprocessing.cpu_count() * 2 # have enough chunks to feed all subprocesses
    chunk_count = len(pictures) // DEFAULT_CHUNK_SIZE
    chunk_count = max(min_chunk_count, chunk_count)
    chunk_size = (len(pictures) // chunk_count) + 1
    chunk_size = max(MIN_CHUNK_SIZE, chunk_size)
    logging.info(
        "Creating %d chunks with a chunk size of %d for %d pictures", chunk_count,
        chunk_size, len(pictures)
    )
    chunks = [pictures[i:i+chunk_size] for i in range(0, len(pictures), chunk_size)]
    return chunks

def get_match(first, second, percentage):
    if percentage < 0:
        percentage = 0
    return Match(first, second, percentage)

def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
    # The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids
    # can be None. In this case, ref_ids has to be compared with itself.
    # picinfo is a dictionary {pic_id: (dimensions, is_ref)}
    cache = Cache(dbname)
    limit = 100 - threshold
    ref_pairs = list(cache.get_multiple(ref_ids))
    if other_ids is not None:
        other_pairs = list(cache.get_multiple(other_ids))
        comparisons_to_do = [(r, o) for r in ref_pairs for o in other_pairs]
    else:
        comparisons_to_do = list(combinations(ref_pairs, 2))
    results = []
    for (ref_id, ref_blocks), (other_id, other_blocks) in comparisons_to_do:
        ref_dimensions, ref_is_ref = picinfo[ref_id]
        other_dimensions, other_is_ref = picinfo[other_id]
        if ref_is_ref and other_is_ref:
            continue
        if ref_dimensions != other_dimensions:
            continue
        try:
            diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
            percentage = 100 - diff
        except (DifferentBlockCountError, NoBlocksError):
            percentage = 0
        if percentage >= threshold:
            results.append((ref_id, other_id, percentage))
    cache.close()
    return results

def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nulljob):
    def get_picinfo(p):
        if match_scaled:
            return (None, p.is_ref)
        else:
            return (p.dimensions, p.is_ref)

    def collect_results(collect_all=False):
        # collect results and wait until the queue is small enough to accommodate new results.
        nonlocal async_results, matches, comparison_count, comparisons_to_do
        limit = 0 if collect_all else RESULTS_QUEUE_LIMIT
        while len(async_results) > limit:
            ready, working = extract(lambda r: r.ready(), async_results)
            for result in ready:
                matches += result.get()
                async_results.remove(result)
                comparison_count += 1
        # About the NOQA below: I think there's a bug in pyflakes. To investigate...
        progress_msg = tr("Performed %d/%d chunk matches") % (comparison_count, len(comparisons_to_do)) # NOQA
        j.set_progress(comparison_count, progress_msg)

    j = j.start_subjob([3, 7])
    pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)
    j = j.start_subjob([9, 1], tr("Preparing for matching"))
    cache = Cache(cache_path)
    id2picture = {}
    for picture in pictures:
        try:
            picture.cache_id = cache.get_id(picture.unicode_path)
            id2picture[picture.cache_id] = picture
        except ValueError:
            pass
    cache.close()
    pictures = [p for p in pictures if hasattr(p, 'cache_id')]
    pool = multiprocessing.Pool()
    async_results = []
    matches = []
    chunks = get_chunks(pictures)
    # We add a None element at the end of the chunk list because each chunk has to be compared
    # with itself. Thus, each chunk will show up as a ref_chunk having other_chunk set to None once.
    comparisons_to_do = list(combinations(chunks + [None], 2))
    comparison_count = 0
    j.start_job(len(comparisons_to_do))
    try:
        for ref_chunk, other_chunk in comparisons_to_do:
            picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}
            ref_ids = [p.cache_id for p in ref_chunk]
            if other_chunk is not None:
                other_ids = [p.cache_id for p in other_chunk]
                picinfo.update({p.cache_id: get_picinfo(p) for p in other_chunk})
            else:
                other_ids = None
            args = (ref_ids, other_ids, cache_path, threshold, picinfo)
            async_results.append(pool.apply_async(async_compare, args))
            collect_results()
        collect_results(collect_all=True)
    except MemoryError:
        # Rare, but possible, even in 64bit situations (ref #264). What do we do now? We free us
        # some wiggle room, log about the incident, and stop matching right here. We then process
        # the matches we have. The rest of the process doesn't allocate much and we should be
        # alright.
        del comparisons_to_do, chunks, pictures # some wiggle room for the next statements
        logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches))
        del matches[-len(matches)//3:] # some wiggle room to ensure we don't run out of memory again.
    pool.close()
    result = []
    myiter = j.iter_with_progress(
        iterconsume(matches, reverse=False),
        tr("Verified %d/%d matches"),
        every=10,
        count=len(matches),
    )
    for ref_id, other_id, percentage in myiter:
        ref = id2picture[ref_id]
        other = id2picture[other_id]
        if percentage == 100 and ref.md5 != other.md5:
            percentage = 99
        if percentage >= threshold:
            ref.dimensions # pre-read dimensions for display in results
            other.dimensions
            result.append(get_match(ref, other, percentage))
    return result

multiprocessing.freeze_support()
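
The chunking note at the top of this file is the heart of the optimization: blocks are read from disk once per (file, chunk-pair) instead of once per file-pair. A small self-contained sketch of how the chunk pairing is enumerated, including each chunk against itself via the None sentinel, mirroring getmatches() above:

from itertools import combinations

def chunk_pairs(chunks):
    # Appending None makes each chunk appear once as a ref_chunk with
    # other_chunk=None, i.e. compared against itself.
    return list(combinations(chunks + [None], 2))

chunks = ['A', 'B', 'C']
for ref, other in chunk_pairs(chunks):
    print(ref, other)
# A B / A C / A None / B C / B None / C None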

core/pe/matchexif.py Normal file

@@ -0,0 +1,31 @@
# Created By: Virgil Dupras
# Created On: 2011-04-20
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from collections import defaultdict
from itertools import combinations

from hscommon.trans import tr

from core.engine import Match

def getmatches(files, match_scaled, j):
    timestamp2pic = defaultdict(set)
    for picture in j.iter_with_progress(files, tr("Read EXIF of %d/%d pictures")):
        timestamp = picture.exif_timestamp
        if timestamp:
            timestamp2pic[timestamp].add(picture)
    if '0000:00:00 00:00:00' in timestamp2pic: # very likely false matches
        del timestamp2pic['0000:00:00 00:00:00']
    matches = []
    for pictures in timestamp2pic.values():
        for p1, p2 in combinations(pictures, 2):
            if (not match_scaled) and (p1.dimensions != p2.dimensions):
                continue
            matches.append(Match(p1, p2, 100))
    return matches
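
Grouping by EXIF timestamp keeps this scan cheap: only pictures sharing an exact DateTimeOriginal are paired, and a bucket of k pictures yields k*(k-1)/2 candidate matches. A toy illustration of the same bucketing (the (path, timestamp) records are stand-ins):

from collections import defaultdict
from itertools import combinations

pics = [('a.jpg', 't1'), ('b.jpg', 't1'), ('c.jpg', 't1'), ('d.jpg', 't2')]

timestamp2pic = defaultdict(set)
for name, ts in pics:
    timestamp2pic[ts].add(name)

pairs = [p for bucket in timestamp2pic.values()
         for p in combinations(sorted(bucket), 2)]
print(pairs)  # 3 pairs from the t1 bucket, none from the singleton t2 bucket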

core/pe/modules/block.c Normal file

@@ -0,0 +1,253 @@
/* Created By: Virgil Dupras
 * Created On: 2010-01-30
 * Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
 *
 * This software is licensed under the "BSD" License as described in the "LICENSE" file,
 * which should be included with this package. The terms are also available at
 * http://www.hardcoded.net/licenses/bsd_license
 */

#include "common.h"

/* avgdiff/maxdiff has been called with empty lists */
static PyObject *NoBlocksError;
/* avgdiff/maxdiff has been called with 2 block lists of different size. */
static PyObject *DifferentBlockCountError;

/* Returns a 3 sized tuple containing the mean color of 'image'.
 * image: a PIL image or crop.
 */
static PyObject* getblock(PyObject *image)
{
    int i, totr, totg, totb;
    Py_ssize_t pixel_count;
    PyObject *ppixels;

    totr = totg = totb = 0;
    ppixels = PyObject_CallMethod(image, "getdata", NULL);
    if (ppixels == NULL) {
        return NULL;
    }
    pixel_count = PySequence_Length(ppixels);
    for (i=0; i<pixel_count; i++) {
        PyObject *ppixel, *pr, *pg, *pb;
        int r, g, b;

        ppixel = PySequence_ITEM(ppixels, i);
        pr = PySequence_ITEM(ppixel, 0);
        pg = PySequence_ITEM(ppixel, 1);
        pb = PySequence_ITEM(ppixel, 2);
        Py_DECREF(ppixel);
        r = PyLong_AsLong(pr);
        g = PyLong_AsLong(pg);
        b = PyLong_AsLong(pb);
        Py_DECREF(pr);
        Py_DECREF(pg);
        Py_DECREF(pb);
        totr += r;
        totg += g;
        totb += b;
    }
    Py_DECREF(ppixels);
    if (pixel_count) {
        totr /= pixel_count;
        totg /= pixel_count;
        totb /= pixel_count;
    }
    return inttuple(3, totr, totg, totb);
}

/* Returns the difference between the first block and the second.
 * It returns an absolute sum of the 3 differences (RGB).
 */
static int diff(PyObject *first, PyObject *second)
{
    int r1, g1, b1, r2, b2, g2;
    PyObject *pr, *pg, *pb;

    pr = PySequence_ITEM(first, 0);
    pg = PySequence_ITEM(first, 1);
    pb = PySequence_ITEM(first, 2);
    r1 = PyLong_AsLong(pr);
    g1 = PyLong_AsLong(pg);
    b1 = PyLong_AsLong(pb);
    Py_DECREF(pr);
    Py_DECREF(pg);
    Py_DECREF(pb);
    pr = PySequence_ITEM(second, 0);
    pg = PySequence_ITEM(second, 1);
    pb = PySequence_ITEM(second, 2);
    r2 = PyLong_AsLong(pr);
    g2 = PyLong_AsLong(pg);
    b2 = PyLong_AsLong(pb);
    Py_DECREF(pr);
    Py_DECREF(pg);
    Py_DECREF(pb);
    return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2);
}

PyDoc_STRVAR(block_getblocks2_doc,
"Returns a list of blocks (3 sized tuples).\n\
\n\
image: A PIL image to base the blocks on.\n\
block_count_per_side: This integer determines the number of blocks the function will return.\n\
If it is 10, for example, 100 blocks will be returned (10 width, 10 height). The blocks will not\n\
necessarily cover square areas. The area covered by each block will be proportional to the image\n\
itself.\n");
static PyObject* block_getblocks2(PyObject *self, PyObject *args)
{
    int block_count_per_side, width, height, block_width, block_height, ih;
    PyObject *image;
    PyObject *pimage_size, *pwidth, *pheight;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "Oi", &image, &block_count_per_side)) {
        return NULL;
    }
    pimage_size = PyObject_GetAttrString(image, "size");
    pwidth = PySequence_ITEM(pimage_size, 0);
    pheight = PySequence_ITEM(pimage_size, 1);
    width = PyLong_AsLong(pwidth);
    height = PyLong_AsLong(pheight);
    Py_DECREF(pimage_size);
    Py_DECREF(pwidth);
    Py_DECREF(pheight);
    if (!(width && height)) {
        return PyList_New(0);
    }
    block_width = max(width / block_count_per_side, 1);
    block_height = max(height / block_count_per_side, 1);
    result = PyList_New(block_count_per_side * block_count_per_side);
    if (result == NULL) {
        return NULL;
    }
    for (ih=0; ih<block_count_per_side; ih++) {
        int top, bottom, iw;
        top = min(ih*block_height, height-block_height);
        bottom = top + block_height;
        for (iw=0; iw<block_count_per_side; iw++) {
            int left, right;
            PyObject *pbox;
            PyObject *pmethodname;
            PyObject *pcrop;
            PyObject *pblock;

            left = min(iw*block_width, width-block_width);
            right = left + block_width;
            pbox = inttuple(4, left, top, right, bottom);
            pmethodname = PyUnicode_FromString("crop");
            pcrop = PyObject_CallMethodObjArgs(image, pmethodname, pbox, NULL);
            Py_DECREF(pmethodname);
            Py_DECREF(pbox);
            if (pcrop == NULL) {
                Py_DECREF(result);
                return NULL;
            }
            pblock = getblock(pcrop);
            Py_DECREF(pcrop);
            if (pblock == NULL) {
                Py_DECREF(result);
                return NULL;
            }
            PyList_SET_ITEM(result, ih*block_count_per_side+iw, pblock);
        }
    }
    return result;
}

PyDoc_STRVAR(block_avgdiff_doc,
"Returns the average diff between first blocks and seconds.\n\
\n\
If the result surpasses limit, limit + 1 is returned, except if less than min_iterations\n\
iterations have been made in the blocks.\n");
static PyObject* block_avgdiff(PyObject *self, PyObject *args)
{
    PyObject *first, *second;
    int limit, min_iterations;
    Py_ssize_t count;
    int sum, i, result;

    if (!PyArg_ParseTuple(args, "OOii", &first, &second, &limit, &min_iterations)) {
        return NULL;
    }
    count = PySequence_Length(first);
    if (count != PySequence_Length(second)) {
        PyErr_SetString(DifferentBlockCountError, "");
        return NULL;
    }
    if (!count) {
        PyErr_SetString(NoBlocksError, "");
        return NULL;
    }
    sum = 0;
    for (i=0; i<count; i++) {
        int iteration_count;
        PyObject *item1, *item2;

        iteration_count = i + 1;
        item1 = PySequence_ITEM(first, i);
        item2 = PySequence_ITEM(second, i);
        sum += diff(item1, item2);
        Py_DECREF(item1);
        Py_DECREF(item2);
        if ((sum > limit*iteration_count) && (iteration_count >= min_iterations)) {
            return PyLong_FromLong(limit + 1);
        }
    }
    result = sum / count;
    if (!result && sum) {
        result = 1;
    }
    return PyLong_FromLong(result);
}

static PyMethodDef BlockMethods[] = {
    {"getblocks2", block_getblocks2, METH_VARARGS, block_getblocks2_doc},
    {"avgdiff", block_avgdiff, METH_VARARGS, block_avgdiff_doc},
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static struct PyModuleDef BlockDef = {
    PyModuleDef_HEAD_INIT,
    "_block",
    NULL,
    -1,
    BlockMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyObject *
PyInit__block(void)
{
    PyObject *m = PyModule_Create(&BlockDef);
    if (m == NULL) {
        return NULL;
    }
    NoBlocksError = PyErr_NewException("_block.NoBlocksError", NULL, NULL);
    PyModule_AddObject(m, "NoBlocksError", NoBlocksError);
    DifferentBlockCountError = PyErr_NewException("_block.DifferentBlockCountError", NULL, NULL);
    PyModule_AddObject(m, "DifferentBlockCountError", DifferentBlockCountError);
    return m;
}

core/pe/modules/block_osx.m Normal file

@@ -0,0 +1,303 @@
/* Created By: Virgil Dupras
 * Created On: 2010-02-04
 * Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
 *
 * This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
 * which should be included with this package. The terms are also available at
 * http://www.gnu.org/licenses/gpl-3.0.html
 **/

#include "common.h"
#import <Foundation/Foundation.h>

#define RADIANS( degrees ) ( degrees * M_PI / 180 )

static CFStringRef
pystring2cfstring(PyObject *pystring)
{
    PyObject *encoded;
    UInt8 *s;
    CFIndex size;
    CFStringRef result;

    if (PyUnicode_Check(pystring)) {
        encoded = PyUnicode_AsUTF8String(pystring);
        if (encoded == NULL) {
            return NULL;
        }
    } else {
        encoded = pystring;
        Py_INCREF(encoded);
    }
    s = (UInt8*)PyBytes_AS_STRING(encoded);
    size = PyBytes_GET_SIZE(encoded);
    result = CFStringCreateWithBytes(NULL, s, size, kCFStringEncodingUTF8, FALSE);
    Py_DECREF(encoded);
    return result;
}

static PyObject* block_osx_get_image_size(PyObject *self, PyObject *args)
{
    PyObject *path;
    CFStringRef image_path;
    CFURLRef image_url;
    CGImageSourceRef source;
    CGImageRef image;
    long width, height;
    PyObject *pwidth, *pheight;
    PyObject *result;

    width = 0;
    height = 0;
    if (!PyArg_ParseTuple(args, "O", &path)) {
        return NULL;
    }
    image_path = pystring2cfstring(path);
    if (image_path == NULL) {
        return PyErr_NoMemory();
    }
    image_url = CFURLCreateWithFileSystemPath(NULL, image_path, kCFURLPOSIXPathStyle, FALSE);
    CFRelease(image_path);
    source = CGImageSourceCreateWithURL(image_url, NULL);
    CFRelease(image_url);
    if (source != NULL) {
        image = CGImageSourceCreateImageAtIndex(source, 0, NULL);
        if (image != NULL) {
            width = CGImageGetWidth(image);
            height = CGImageGetHeight(image);
            CGImageRelease(image);
        }
        CFRelease(source);
    }
    pwidth = PyLong_FromLong(width);
    if (pwidth == NULL) {
        return NULL;
    }
    pheight = PyLong_FromLong(height);
    if (pheight == NULL) {
        return NULL;
    }
    result = PyTuple_Pack(2, pwidth, pheight);
    Py_DECREF(pwidth);
    Py_DECREF(pheight);
    return result;
}

static CGContextRef
MyCreateBitmapContext(int width, int height)
{
    CGContextRef context = NULL;
    CGColorSpaceRef colorSpace;
    void *bitmapData;
    int bitmapByteCount;
    int bitmapBytesPerRow;

    bitmapBytesPerRow = (width * 4);
    bitmapByteCount = (bitmapBytesPerRow * height);
    colorSpace = CGColorSpaceCreateWithName(kCGColorSpaceGenericRGB);
    // calloc() must be used to allocate bitmapData here because the buffer has to be zeroed.
    // If it's not zeroed, when images with transparency are drawn in the context, this buffer
    // will stay with undefined pixels, which means that two pictures with the same pixels will
    // most likely have different blocks (which is not supposed to happen).
    bitmapData = calloc(bitmapByteCount, 1);
    if (bitmapData == NULL) {
        fprintf(stderr, "Memory not allocated!");
        return NULL;
    }
    context = CGBitmapContextCreate(bitmapData, width, height, 8, bitmapBytesPerRow, colorSpace,
        (CGBitmapInfo)kCGImageAlphaNoneSkipLast);
    if (context == NULL) {
        free(bitmapData);
        fprintf(stderr, "Context not created!");
        return NULL;
    }
    CGColorSpaceRelease(colorSpace);
    return context;
}

static PyObject* getblock(unsigned char *imageData, int imageWidth, int imageHeight, int boxX, int boxY, int boxW, int boxH)
{
    int i, j, totalR, totalG, totalB;

    totalR = totalG = totalB = 0;
    for (i=boxY; i<boxY+boxH; i++) {
        for (j=boxX; j<boxX+boxW; j++) {
            int offset = (i * imageWidth * 4) + (j * 4);
            totalR += *(imageData + offset);
            totalG += *(imageData + offset + 1);
            totalB += *(imageData + offset + 2);
        }
    }
    int pixelCount = boxH * boxW;
    totalR /= pixelCount;
    totalG /= pixelCount;
    totalB /= pixelCount;
    return inttuple(3, totalR, totalG, totalB);
}

static PyObject* block_osx_getblocks(PyObject *self, PyObject *args)
{
    PyObject *path, *result;
    CFStringRef image_path;
    CFURLRef image_url;
    CGImageSourceRef source;
    CGImageRef image;
    size_t width, height, image_width, image_height;
    int block_count, block_width, block_height, orientation, i;

    if (!PyArg_ParseTuple(args, "Oii", &path, &block_count, &orientation)) {
        return NULL;
    }
    if (PySequence_Length(path) == 0) {
        PyErr_SetString(PyExc_ValueError, "empty path");
        return NULL;
    }
    if ((orientation > 8) || (orientation < 0)) {
        orientation = 0; // simplifies checks later since we can only have values in 0-8
    }
    image_path = pystring2cfstring(path);
    if (image_path == NULL) {
        return PyErr_NoMemory();
    }
    image_url = CFURLCreateWithFileSystemPath(NULL, image_path, kCFURLPOSIXPathStyle, FALSE);
    CFRelease(image_path);
    source = CGImageSourceCreateWithURL(image_url, NULL);
    CFRelease(image_url);
    if (source == NULL) {
        return PyErr_NoMemory();
    }
    image = CGImageSourceCreateImageAtIndex(source, 0, NULL);
    if (image == NULL) {
        CFRelease(source);
        return PyErr_NoMemory();
    }
    width = image_width = CGImageGetWidth(image);
    height = image_height = CGImageGetHeight(image);
    if (orientation >= 5) {
        // orientations 5-8 rotate the photo sideways, so we have to swap width and height
        width = image_height;
        height = image_width;
    }
    CGContextRef context = MyCreateBitmapContext(width, height);
    if (orientation == 2) {
        // Flip X
        CGContextTranslateCTM(context, width, 0);
        CGContextScaleCTM(context, -1, 1);
    }
    else if (orientation == 3) {
        // Rot 180
        CGContextTranslateCTM(context, width, height);
        CGContextRotateCTM(context, RADIANS(180));
    }
    else if (orientation == 4) {
        // Flip Y
        CGContextTranslateCTM(context, 0, height);
        CGContextScaleCTM(context, 1, -1);
    }
    else if (orientation == 5) {
        // Flip X + Rot CW 90
        CGContextTranslateCTM(context, width, 0);
        CGContextScaleCTM(context, -1, 1);
        CGContextTranslateCTM(context, 0, height);
        CGContextRotateCTM(context, RADIANS(-90));
    }
    else if (orientation == 6) {
        // Rot CW 90
        CGContextTranslateCTM(context, 0, height);
        CGContextRotateCTM(context, RADIANS(-90));
    }
    else if (orientation == 7) {
        // Rot CCW 90 + Flip X
        CGContextTranslateCTM(context, width, 0);
        CGContextScaleCTM(context, -1, 1);
        CGContextTranslateCTM(context, width, 0);
        CGContextRotateCTM(context, RADIANS(90));
    }
    else if (orientation == 8) {
        // Rot CCW 90
        CGContextTranslateCTM(context, width, 0);
        CGContextRotateCTM(context, RADIANS(90));
    }
    CGRect myBoundingBox = CGRectMake(0, 0, image_width, image_height);
    CGContextDrawImage(context, myBoundingBox, image);
    unsigned char *bitmapData = CGBitmapContextGetData(context);
    CGContextRelease(context);
    CGImageRelease(image);
    CFRelease(source);
    if (bitmapData == NULL) {
        return PyErr_NoMemory();
    }
    block_width = max(width/block_count, 1);
    block_height = max(height/block_count, 1);
    result = PyList_New(block_count * block_count);
    if (result == NULL) {
        return NULL;
    }
    for (i=0; i<block_count; i++) {
        int j, top;
        top = min(i*block_height, height-block_height);
        for (j=0; j<block_count; j++) {
            int left;
            left = min(j*block_width, width-block_width);
            PyObject *block = getblock(bitmapData, width, height, left, top, block_width, block_height);
            if (block == NULL) {
                Py_DECREF(result);
                return NULL;
            }
            PyList_SET_ITEM(result, i*block_count+j, block);
        }
    }
    free(bitmapData);
    return result;
}

static PyMethodDef BlockOsxMethods[] = {
    {"get_image_size", block_osx_get_image_size, METH_VARARGS, ""},
    {"getblocks", block_osx_getblocks, METH_VARARGS, ""},
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static struct PyModuleDef BlockOsxDef = {
    PyModuleDef_HEAD_INIT,
    "_block_osx",
    NULL,
    -1,
    BlockOsxMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyObject *
PyInit__block_osx(void)
{
    PyObject *m = PyModule_Create(&BlockOsxDef);
    if (m == NULL) {
        return NULL;
    }
    return m;
}

core/pe/modules/cache.c Normal file

@@ -0,0 +1,95 @@
/* Created By: Virgil Dupras
 * Created On: 2010-01-30
 * Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
 *
 * This software is licensed under the "BSD" License as described in the "LICENSE" file,
 * which should be included with this package. The terms are also available at
 * http://www.hardcoded.net/licenses/bsd_license
 */

#include "common.h"

/* I know that there's strtol out there, but it requires a pointer to
 * a char, which would in turn require me to buffer my chars around,
 * making the whole process slower.
 */
static long
xchar_to_long(char c)
{
    if ((c >= 48) && (c <= 57)) { /* 0-9 */
        return c - 48;
    }
    else if ((c >= 65) && (c <= 70)) { /* A-F */
        return c - 55;
    }
    else if ((c >= 97) && (c <= 102)) { /* a-f */
        return c - 87;
    }
    return 0;
}

static PyObject*
cache_string_to_colors(PyObject *self, PyObject *args)
{
    char *s;
    Py_ssize_t char_count, color_count, i;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "s#", &s, &char_count)) {
        return NULL;
    }
    color_count = (char_count / 6);
    result = PyList_New(color_count);
    if (result == NULL) {
        return NULL;
    }
    for (i=0; i<color_count; i++) {
        long r, g, b;
        Py_ssize_t ci;
        PyObject *color_tuple;

        ci = i * 6;
        r = (xchar_to_long(s[ci]) << 4) + xchar_to_long(s[ci+1]);
        g = (xchar_to_long(s[ci+2]) << 4) + xchar_to_long(s[ci+3]);
        b = (xchar_to_long(s[ci+4]) << 4) + xchar_to_long(s[ci+5]);
        color_tuple = inttuple(3, r, g, b);
        if (color_tuple == NULL) {
            Py_DECREF(result);
            return NULL;
        }
        PyList_SET_ITEM(result, i, color_tuple);
    }
    return result;
}

static PyMethodDef CacheMethods[] = {
    {"string_to_colors", cache_string_to_colors, METH_VARARGS,
     "Transform the string 's' in a list of 3 sized tuples."},
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static struct PyModuleDef CacheDef = {
    PyModuleDef_HEAD_INIT,
    "_cache",
    NULL,
    -1,
    CacheMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyObject *
PyInit__cache(void)
{
    PyObject *m = PyModule_Create(&CacheDef);
    if (m == NULL) {
        return NULL;
    }
    return m;
}

core/pe/modules/common.c Normal file

@@ -0,0 +1,45 @@
/* Created By: Virgil Dupras
 * Created On: 2010-02-04
 * Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
 *
 * This software is licensed under the "BSD" License as described in the "LICENSE" file,
 * which should be included with this package. The terms are also available at
 * http://www.hardcoded.net/licenses/bsd_license
 */

#include "common.h"

#ifndef _MSC_VER
int max(int a, int b)
{
    return b > a ? b : a;
}

int min(int a, int b)
{
    return b < a ? b : a;
}
#endif

PyObject* inttuple(int n, ...)
{
    int i;
    PyObject *pnumber;
    PyObject *result;
    va_list numbers;

    va_start(numbers, n);
    result = PyTuple_New(n);
    for (i=0; i<n; i++) {
        pnumber = PyLong_FromLong(va_arg(numbers, long));
        if (pnumber == NULL) {
            Py_DECREF(result);
            return NULL;
        }
        PyTuple_SET_ITEM(result, i, pnumber);
    }
    va_end(numbers);
    return result;
}

core/pe/modules/common.h Normal file

@@ -0,0 +1,20 @@
/* Created By: Virgil Dupras
 * Created On: 2010-02-04
 * Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
 *
 * This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
 * which should be included with this package. The terms are also available at
 * http://www.gnu.org/licenses/gpl-3.0.html
 */

#define PY_SSIZE_T_CLEAN
#include "Python.h"

/* It seems like MS VC defines min/max already */
#ifndef _MSC_VER
int max(int a, int b);
int min(int a, int b);
#endif

/* Create a tuple out of an array of integers. */
PyObject* inttuple(int n, ...);

core/pe/photo.py Normal file

@@ -0,0 +1,106 @@
# Copyright 2016 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
import logging
from hscommon.util import get_file_ext, format_size
from core.util import format_timestamp, format_perc, format_dupe_count
from core import fs
from . import exif
# This global value is set by the platform-specific subclasser of the Photo base class
PLAT_SPECIFIC_PHOTO_CLASS = None
def format_dimensions(dimensions):
return '%d x %d' % (dimensions[0], dimensions[1])
def get_delta_dimensions(value, ref_value):
return (value[0]-ref_value[0], value[1]-ref_value[1])
class Photo(fs.File):
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
INITIAL_INFO.update({
'dimensions': (0, 0),
'exif_timestamp': '',
})
__slots__ = fs.File.__slots__ + tuple(INITIAL_INFO.keys())
# These extensions are supported on all platforms
HANDLED_EXTS = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'tif'}
def _plat_get_dimensions(self):
raise NotImplementedError()
def _plat_get_blocks(self, block_count_per_side, orientation):
raise NotImplementedError()
def _get_orientation(self):
if not hasattr(self, '_cached_orientation'):
try:
with self.path.open('rb') as fp:
exifdata = exif.get_fields(fp)
# the value is a list (usually one-element) of ints
orientations = exifdata['Orientation']
self._cached_orientation = orientations[0]
except Exception: # Couldn't read EXIF data, no transforms
self._cached_orientation = 0
return self._cached_orientation
def _get_exif_timestamp(self):
try:
with self.path.open('rb') as fp:
exifdata = exif.get_fields(fp)
return exifdata['DateTimeOriginal']
except Exception:
logging.info("Couldn't read EXIF of picture: %s", self.path)
return ''
@classmethod
def can_handle(cls, path):
return fs.File.can_handle(path) and get_file_ext(path.name) in cls.HANDLED_EXTS
def get_display_info(self, group, delta):
size = self.size
mtime = self.mtime
dimensions = self.dimensions
m = group.get_match_of(self)
if m:
percentage = m.percentage
dupe_count = 0
if delta:
r = group.ref
size -= r.size
mtime -= r.mtime
dimensions = get_delta_dimensions(dimensions, r.dimensions)
else:
percentage = group.percentage
dupe_count = len(group.dupes)
dupe_folder_path = getattr(self, 'display_folder_path', self.folder_path)
return {
'name': self.name,
'folder_path': str(dupe_folder_path),
'size': format_size(size, 0, 1, False),
'extension': self.extension,
'dimensions': format_dimensions(dimensions),
'exif_timestamp': self.exif_timestamp,
'mtime': format_timestamp(mtime, delta and m),
'percentage': format_perc(percentage),
'dupe_count': format_dupe_count(dupe_count),
}
def _read_info(self, field):
fs.File._read_info(self, field)
if field == 'dimensions':
self.dimensions = self._plat_get_dimensions()
if self._get_orientation() in {5, 6, 7, 8}:
self.dimensions = (self.dimensions[1], self.dimensions[0])
elif field == 'exif_timestamp':
self.exif_timestamp = self._get_exif_timestamp()
def get_blocks(self, block_count_per_side):
return self._plat_get_blocks(block_count_per_side, self._get_orientation())

core/pe/prioritize.py Normal file
@@ -0,0 +1,31 @@
# Created On: 2011/09/16
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.trans import trget
from core.prioritize import (
KindCategory, FolderCategory, FilenameCategory, NumericalCategory,
SizeCategory, MtimeCategory
)
coltr = trget('columns')
class DimensionsCategory(NumericalCategory):
NAME = coltr("Dimensions")
def extract_value(self, dupe):
return dupe.dimensions
def invert_numerical_value(self, value):
width, height = value
return (-width, -height)
def all_categories():
return [
KindCategory, FolderCategory, FilenameCategory, SizeCategory, DimensionsCategory,
MtimeCategory
]
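A quick illustration of why invert_numerical_value negates both components: sorting the inverted pairs in ascending order puts the largest dimensions first, which is what a "highest value wins" prioritization needs (the values below are illustrative).

dims = [(1920, 1080), (800, 600), (3840, 2160)]
inverted = sorted((-w, -h) for w, h in dims)
# [(-3840, -2160), (-1920, -1080), (-800, -600)]: the biggest picture sorts first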

core/pe/result_table.py Normal file
@@ -0,0 +1,28 @@
# Created On: 2011-11-27
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.gui.column import Column
from hscommon.trans import trget
from core.gui.result_table import ResultTable as ResultTableBase
coltr = trget('columns')
class ResultTable(ResultTableBase):
COLUMNS = [
Column('marked', ''),
Column('name', coltr("Filename")),
Column('folder_path', coltr("Folder"), optional=True),
Column('size', coltr("Size (KB)"), optional=True),
Column('extension', coltr("Kind"), visible=False, optional=True),
Column('dimensions', coltr("Dimensions"), optional=True),
Column('exif_timestamp', coltr("EXIF Timestamp"), visible=False, optional=True),
Column('mtime', coltr("Modification"), visible=False, optional=True),
Column('percentage', coltr("Match %"), optional=True),
Column('dupe_count', coltr("Dupe Count"), visible=False, optional=True),
]
DELTA_COLUMNS = {'size', 'dimensions', 'mtime'}

core/pe/scanner.py Normal file
@@ -0,0 +1,32 @@
# Copyright 2016 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.trans import tr
from core.scanner import Scanner, ScanType, ScanOption
from . import matchblock, matchexif
class ScannerPE(Scanner):
cache_path = None
match_scaled = False
threshold = 75
@staticmethod
def get_scan_options():
return [
ScanOption(ScanType.FuzzyBlock, tr("Contents")),
ScanOption(ScanType.ExifTimestamp, tr("EXIF Timestamp")),
]
def _getmatches(self, files, j):
if self.scan_type == ScanType.FuzzyBlock:
return matchblock.getmatches(files, self.cache_path, self.threshold, self.match_scaled, j)
elif self.scan_type == ScanType.ExifTimestamp:
return matchexif.getmatches(files, self.match_scaled, j)
else:
raise Exception("Invalid scan type")
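A minimal usage sketch, assuming an already-collected list of Photo objects; the cache path and variable names are illustrative. get_dupe_groups comes from the Scanner base class, as the scanner tests at the end of this changeset show.

scanner = ScannerPE()
scanner.cache_path = '/tmp/dupeguru.cache'  # hypothetical location
scanner.scan_type = ScanType.FuzzyBlock
scanner.threshold = 80  # stricter than the default of 75
# groups = scanner.get_dupe_groups(photos)  # photos: list of Photo instances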

core/se/__init__.py Normal file
@@ -0,0 +1 @@
from . import fs, result_table, scanner # noqa

core/se/fs.py Normal file
@@ -0,0 +1,47 @@
# Created By: Virgil Dupras
# Created On: 2013-07-14
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.util import format_size
from core import fs
from core.util import format_timestamp, format_perc, format_words, format_dupe_count
def get_display_info(dupe, group, delta):
size = dupe.size
mtime = dupe.mtime
m = group.get_match_of(dupe)
if m:
percentage = m.percentage
dupe_count = 0
if delta:
r = group.ref
size -= r.size
mtime -= r.mtime
else:
percentage = group.percentage
dupe_count = len(group.dupes)
return {
'name': dupe.name,
'folder_path': str(dupe.folder_path),
'size': format_size(size, 0, 1, False),
'extension': dupe.extension,
'mtime': format_timestamp(mtime, delta and m),
'percentage': format_perc(percentage),
'words': format_words(dupe.words) if hasattr(dupe, 'words') else '',
'dupe_count': format_dupe_count(dupe_count),
}
class File(fs.File):
def get_display_info(self, group, delta):
return get_display_info(self, group, delta)
class Folder(fs.Folder):
def get_display_info(self, group, delta):
return get_display_info(self, group, delta)

core/se/result_table.py Normal file
@@ -0,0 +1,27 @@
# Created On: 2011-11-27
# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.gui.column import Column
from hscommon.trans import trget
from core.gui.result_table import ResultTable as ResultTableBase
coltr = trget('columns')
class ResultTable(ResultTableBase):
COLUMNS = [
Column('marked', ''),
Column('name', coltr("Filename")),
Column('folder_path', coltr("Folder"), optional=True),
Column('size', coltr("Size (KB)"), optional=True),
Column('extension', coltr("Kind"), visible=False, optional=True),
Column('mtime', coltr("Modification"), visible=False, optional=True),
Column('percentage', coltr("Match %"), optional=True),
Column('words', coltr("Words Used"), visible=False, optional=True),
Column('dupe_count', coltr("Dupe Count"), visible=False, optional=True),
]
DELTA_COLUMNS = {'size', 'mtime'}

core/se/scanner.py Normal file
@@ -0,0 +1,19 @@
# Copyright 2016 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
from hscommon.trans import tr
from core.scanner import Scanner as ScannerBase, ScanOption, ScanType
class ScannerSE(ScannerBase):
@staticmethod
def get_scan_options():
return [
ScanOption(ScanType.Filename, tr("Filename")),
ScanOption(ScanType.Contents, tr("Contents")),
ScanOption(ScanType.Folders, tr("Folders")),
]

core/tests/block_test.py Normal file
@@ -0,0 +1,312 @@
# Copyright 2016 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
# The commented-out tests are for functions that have been converted to pure C for speed
from pytest import raises, skip
from hscommon.testutil import eq_
try:
from ..pe.block import avgdiff, getblocks2, NoBlocksError, DifferentBlockCountError
except ImportError:
skip("Can't import the block module, probably hasn't been compiled.")
def my_avgdiff(first, second, limit=768, min_iter=3): # this is so I don't have to re-write every call
return avgdiff(first, second, limit, min_iter)
BLACK = (0, 0, 0)
RED = (0xff, 0, 0)
GREEN = (0, 0xff, 0)
BLUE = (0, 0, 0xff)
class FakeImage:
def __init__(self, size, data):
self.size = size
self.data = data
def getdata(self):
return self.data
def crop(self, box):
pixels = []
for i in range(box[1], box[3]):
for j in range(box[0], box[2]):
pixel = self.data[i * self.size[0] + j]
pixels.append(pixel)
return FakeImage((box[2] - box[0], box[3] - box[1]), pixels)
def empty():
return FakeImage((0, 0), [])
def single_pixel(): #one red pixel
return FakeImage((1, 1), [(0xff, 0, 0)])
def four_pixels():
pixels = [RED, (0, 0x80, 0xff), (0x80, 0, 0), (0, 0x40, 0x80)]
return FakeImage((2, 2), pixels)
class TestCasegetblock:
def test_single_pixel(self):
im = single_pixel()
[b] = getblocks2(im, 1)
eq_(RED, b)
def test_no_pixel(self):
im = empty()
eq_([], getblocks2(im, 1))
def test_four_pixels(self):
im = four_pixels()
[b] = getblocks2(im, 1)
meanred = (0xff + 0x80) // 4
meangreen = (0x80 + 0x40) // 4
meanblue = (0xff + 0x80) // 4
eq_((meanred, meangreen, meanblue), b)
# class TCdiff(unittest.TestCase):
# def test_diff(self):
# b1 = (10, 20, 30)
# b2 = (1, 2, 3)
# eq_(9 + 18 + 27, diff(b1, b2))
#
# def test_diff_negative(self):
# b1 = (10, 20, 30)
# b2 = (1, 2, 3)
# eq_(9 + 18 + 27, diff(b2, b1))
#
# def test_diff_mixed_positive_and_negative(self):
# b1 = (1, 5, 10)
# b2 = (10, 1, 15)
# eq_(9 + 4 + 5, diff(b1, b2))
#
# class TCgetblocks(unittest.TestCase):
# def test_empty_image(self):
# im = empty()
# blocks = getblocks(im, 1)
# eq_(0, len(blocks))
#
# def test_one_block_image(self):
# im = four_pixels()
# blocks = getblocks2(im, 1)
# eq_(1, len(blocks))
# block = blocks[0]
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# eq_((meanred, meangreen, meanblue), block)
#
# def test_not_enough_height_to_fit_a_block(self):
# im = FakeImage((2, 1), [BLACK, BLACK])
# blocks = getblocks(im, 2)
# eq_(0, len(blocks))
#
# def xtest_dont_include_leftovers(self):
# # this test is disabled because getblocks is not used and getblock is cdef'ed
# pixels = [
# RED,(0, 0x80, 0xff), BLACK,
# (0x80, 0, 0),(0, 0x40, 0x80), BLACK,
# BLACK, BLACK, BLACK
# ]
# im = FakeImage((3, 3), pixels)
# blocks = getblocks(im, 2)
# block = blocks[0]
# #Because the block is smaller than the image, only blocksize must be considered.
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# eq_((meanred, meangreen, meanblue), block)
#
# def xtest_two_blocks(self):
# # this test is disabled because getblocks is not used and getblock is cdef'ed
# pixels = [BLACK for i in xrange(4 * 2)]
# pixels[0] = RED
# pixels[1] = (0, 0x80, 0xff)
# pixels[4] = (0x80, 0, 0)
# pixels[5] = (0, 0x40, 0x80)
# im = FakeImage((4, 2), pixels)
# blocks = getblocks(im, 2)
# eq_(2, len(blocks))
# block = blocks[0]
# #Because the block is smaller than the image, only blocksize must be considered.
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# eq_((meanred, meangreen, meanblue), block)
# eq_(BLACK, blocks[1])
#
# def test_four_blocks(self):
# pixels = [BLACK for i in xrange(4 * 4)]
# pixels[0] = RED
# pixels[1] = (0, 0x80, 0xff)
# pixels[4] = (0x80, 0, 0)
# pixels[5] = (0, 0x40, 0x80)
# im = FakeImage((4, 4), pixels)
# blocks = getblocks2(im, 2)
# eq_(4, len(blocks))
# block = blocks[0]
# #Because the block is smaller than the image, only blocksize must be considered.
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# eq_((meanred, meangreen, meanblue), block)
# eq_(BLACK, blocks[1])
# eq_(BLACK, blocks[2])
# eq_(BLACK, blocks[3])
#
class TestCasegetblocks2:
def test_empty_image(self):
im = empty()
blocks = getblocks2(im, 1)
eq_(0, len(blocks))
def test_one_block_image(self):
im = four_pixels()
blocks = getblocks2(im, 1)
eq_(1, len(blocks))
block = blocks[0]
meanred = (0xff + 0x80) // 4
meangreen = (0x80 + 0x40) // 4
meanblue = (0xff + 0x80) // 4
eq_((meanred, meangreen, meanblue), block)
def test_four_blocks_all_black(self):
im = FakeImage((2, 2), [BLACK, BLACK, BLACK, BLACK])
blocks = getblocks2(im, 2)
eq_(4, len(blocks))
for block in blocks:
eq_(BLACK, block)
def test_two_pixels_image_horizontal(self):
pixels = [RED, BLUE]
im = FakeImage((2, 1), pixels)
blocks = getblocks2(im, 2)
eq_(4, len(blocks))
eq_(RED, blocks[0])
eq_(BLUE, blocks[1])
eq_(RED, blocks[2])
eq_(BLUE, blocks[3])
def test_two_pixels_image_vertical(self):
pixels = [RED, BLUE]
im = FakeImage((1, 2), pixels)
blocks = getblocks2(im, 2)
eq_(4, len(blocks))
eq_(RED, blocks[0])
eq_(RED, blocks[1])
eq_(BLUE, blocks[2])
eq_(BLUE, blocks[3])
class TestCaseavgdiff:
def test_empty(self):
with raises(NoBlocksError):
my_avgdiff([], [])
def test_two_blocks(self):
b1 = (5, 10, 15)
b2 = (255, 250, 245)
b3 = (0, 0, 0)
b4 = (255, 0, 255)
blocks1 = [b1, b2]
blocks2 = [b3, b4]
expected1 = 5 + 10 + 15
expected2 = 0 + 250 + 10
expected = (expected1 + expected2) // 2
eq_(expected, my_avgdiff(blocks1, blocks2))
def test_blocks_not_the_same_size(self):
b = (0, 0, 0)
with raises(DifferentBlockCountError):
my_avgdiff([b, b], [b])
def test_first_arg_is_empty_but_not_second(self):
#Don't return 0 (as when the 2 lists are empty), raise!
b = (0, 0, 0)
with raises(DifferentBlockCountError):
my_avgdiff([], [b])
def test_limit(self):
ref = (0, 0, 0)
b1 = (10, 10, 10) # diff vs ref: 30; running avg after 1 block: 30
b2 = (20, 20, 20) # diff: 60; running avg after 2 blocks: 45
b3 = (30, 30, 30) # diff: 90; running avg after 3 blocks: 60
blocks1 = [ref, ref, ref]
blocks2 = [b1, b2, b3]
eq_(45, my_avgdiff(blocks1, blocks2, 44))
def test_min_iterations(self):
ref = (0, 0, 0)
b1 = (10, 10, 10) # running avg after 1 block: 30
b2 = (20, 20, 20) # running avg after 2 blocks: 45
b3 = (10, 10, 10) # running avg after 3 blocks: 40
blocks1 = [ref, ref, ref]
blocks2 = [b1, b2, b3]
eq_(40, my_avgdiff(blocks1, blocks2, 45 - 1, 3))
# Bah, I don't know why this test fails, but I don't think it matters very much
# def test_just_over_the_limit(self):
# #A score just over the limit might return exactly the limit due to truncating. We should
# #ceil() the result in this case.
# ref = (0, 0, 0)
# b1 = (10, 0, 0)
# b2 = (11, 0, 0)
# blocks1 = [ref, ref]
# blocks2 = [b1, b2]
# eq_(11, my_avgdiff(blocks1, blocks2, 10))
#
def test_return_at_least_1_at_the_slightest_difference(self):
ref = (0, 0, 0)
b1 = (1, 0, 0)
blocks1 = [ref for i in range(250)]
blocks2 = [ref for i in range(250)]
blocks2[0] = b1
eq_(1, my_avgdiff(blocks1, blocks2))
def test_return_0_if_there_is_no_difference(self):
ref = (0, 0, 0)
blocks1 = [ref, ref]
blocks2 = [ref, ref]
eq_(0, my_avgdiff(blocks1, blocks2))
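Since the C module may not be compiled (hence the skip at the top of this file), here is a pure-Python reference of the semantics these tests pin down; a sketch under that reading, not the shipped implementation.

def reference_avgdiff(first, second, limit=768, min_iterations=3):
    if len(first) != len(second):
        raise DifferentBlockCountError()
    if not first:
        raise NoBlocksError()
    total = 0
    for n, (b1, b2) in enumerate(zip(first, second), start=1):
        total += sum(abs(c1 - c2) for c1, c2 in zip(b1, b2))
        # Early exit: once the running sum exceeds limit * n after at least
        # min_iterations blocks, limit + 1 is returned, which is why
        # test_limit expects 45 for a limit of 44.
        if total > limit * n and n >= min_iterations:
            return limit + 1
    result = total // len(first)
    # The slightest difference yields at least 1, never 0.
    return 1 if (total and not result) else result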
# class TCmaxdiff(unittest.TestCase):
# def test_empty(self):
# self.assertRaises(NoBlocksError, maxdiff,[],[])
#
# def test_two_blocks(self):
# b1 = (5, 10, 15)
# b2 = (255, 250, 245)
# b3 = (0, 0, 0)
# b4 = (255, 0, 255)
# blocks1 = [b1, b2]
# blocks2 = [b3, b4]
# expected1 = 5 + 10 + 15
# expected2 = 0 + 250 + 10
# expected = max(expected1, expected2)
# eq_(expected, maxdiff(blocks1, blocks2))
#
# def test_blocks_not_the_same_size(self):
# b = (0, 0, 0)
# self.assertRaises(DifferentBlockCountError, maxdiff,[b, b],[b])
#
# def test_first_arg_is_empty_but_not_second(self):
# #Don't return 0 (as when the 2 lists are empty), raise!
# b = (0, 0, 0)
# self.assertRaises(DifferentBlockCountError, maxdiff,[],[b])
#
# def test_limit(self):
# b1 = (5, 10, 15)
# b2 = (255, 250, 245)
# b3 = (0, 0, 0)
# b4 = (255, 0, 255)
# blocks1 = [b1, b2]
# blocks2 = [b3, b4]
# expected1 = 5 + 10 + 15
# expected2 = 0 + 250 + 10
# eq_(expected1, maxdiff(blocks1, blocks2, expected1 - 1))
#

core/tests/cache_test.py Normal file
@@ -0,0 +1,143 @@
# Copyright 2016 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html
import logging
from pytest import raises, skip
from hscommon.testutil import eq_
try:
from ..pe.cache import Cache, colors_to_string, string_to_colors
except ImportError:
skip("Can't import the cache module, probably hasn't been compiled.")
class TestCasecolors_to_string:
def test_no_color(self):
eq_('', colors_to_string([]))
def test_single_color(self):
eq_('000000', colors_to_string([(0, 0, 0)]))
eq_('010101', colors_to_string([(1, 1, 1)]))
eq_('0a141e', colors_to_string([(10, 20, 30)]))
def test_two_colors(self):
eq_('000102030405', colors_to_string([(0, 1, 2), (3, 4, 5)]))
class TestCasestring_to_colors:
def test_empty(self):
eq_([], string_to_colors(''))
def test_single_color(self):
eq_([(0, 0, 0)], string_to_colors('000000'))
eq_([(2, 3, 4)], string_to_colors('020304'))
eq_([(10, 20, 30)], string_to_colors('0a141e'))
def test_two_colors(self):
eq_([(10, 20, 30), (40, 50, 60)], string_to_colors('0a141e28323c'))
def test_incomplete_color(self):
# don't return anything if it's not a complete color
eq_([], string_to_colors('102'))
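The two helpers are inverses for complete colors; a round-trip sketch in the style of the suite (this test is an illustration, not part of the original file).

class TestCaseroundtrip:
    def test_roundtrip(self):
        colors = [(10, 20, 30), (40, 50, 60)]
        eq_(colors, string_to_colors(colors_to_string(colors)))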
class TestCaseCache:
def test_empty(self):
c = Cache()
eq_(0, len(c))
with raises(KeyError):
c['foo']
def test_set_then_retrieve_blocks(self):
c = Cache()
b = [(0, 0, 0), (1, 2, 3)]
c['foo'] = b
eq_(b, c['foo'])
def test_delitem(self):
c = Cache()
c['foo'] = ''
del c['foo']
assert 'foo' not in c
with raises(KeyError):
del c['foo']
def test_persistance(self, tmpdir):
DBNAME = tmpdir.join('hstest.db')
c = Cache(str(DBNAME))
c['foo'] = [(1, 2, 3)]
del c
c = Cache(str(DBNAME))
eq_([(1, 2, 3)], c['foo'])
def test_filter(self):
c = Cache()
c['foo'] = ''
c['bar'] = ''
c['baz'] = ''
c.filter(lambda p: p != 'bar') #only 'bar' is removed
eq_(2, len(c))
assert 'foo' in c
assert 'baz' in c
assert 'bar' not in c
def test_clear(self):
c = Cache()
c['foo'] = ''
c['bar'] = ''
c['baz'] = ''
c.clear()
eq_(0, len(c))
assert 'foo' not in c
assert 'baz' not in c
assert 'bar' not in c
def test_corrupted_db(self, tmpdir, monkeypatch):
# If we don't do this monkeypatching, we get a weird exception about trying to flush a
# closed file. I've tried setting logging level and stuff, but nothing worked. So, there we
# go, a dirty monkeypatch.
monkeypatch.setattr(logging, 'warning', lambda *args, **kw: None)
dbname = str(tmpdir.join('foo.db'))
fp = open(dbname, 'w')
fp.write('invalid sqlite content')
fp.close()
c = Cache(dbname) # should not raise a DatabaseError
c['foo'] = [(1, 2, 3)]
del c
c = Cache(dbname)
eq_(c['foo'], [(1, 2, 3)])
def test_by_id(self):
# it's possible to use the cache by referring to the files by their row_id
c = Cache()
b = [(0, 0, 0), (1, 2, 3)]
c['foo'] = b
foo_id = c.get_id('foo')
eq_(c[foo_id], b)
class TestCaseCacheSQLEscape:
def test_contains(self):
c = Cache()
assert "foo'bar" not in c
def test_getitem(self):
c = Cache()
with raises(KeyError):
c["foo'bar"]
def test_setitem(self):
c = Cache()
c["foo'bar"] = []
def test_delitem(self):
c = Cache()
c["foo'bar"] = []
try:
del c["foo'bar"]
except KeyError:
assert False


@@ -12,6 +12,7 @@ from .. import fs
from ..engine import getwords, Match
from ..ignore import IgnoreList
from ..scanner import Scanner, ScanType
from ..me.scanner import ScannerME
class NamedObject:
def __init__(self, name="foobar", size=1, path=None):
@@ -528,3 +529,13 @@ def test_dont_count_ref_files_as_discarded(fake_fileexists):
o2.is_ref = True
eq_(len(s.get_dupe_groups([o1, o2, o3])), 1)
eq_(s.discarded_file_count, 0)
def test_priorize_me(fake_fileexists):
# in ScannerME, bitrate goes first (right after is_ref) in prioritization
s = ScannerME()
o1, o2 = no('foo', path='p1'), no('foo', path='p2')
o1.bitrate = 1
o2.bitrate = 2
[group] = s.get_dupe_groups([o1, o2])
assert group.ref is o2