2009-06-18 19:42:27 +00:00
|
|
|
# Created By: Virgil Dupras
|
|
|
|
# Created On: 2007/02/25
|
2010-01-01 20:11:34 +00:00
|
|
|
# Copyright 2010 Hardcoded Software (http://www.hardcoded.net)
|
2009-08-05 08:59:46 +00:00
|
|
|
#
|
2010-09-30 10:17:41 +00:00
|
|
|
# This software is licensed under the "BSD" License as described in the "LICENSE" file,
|
2009-08-05 08:59:46 +00:00
|
|
|
# which should be included with this package. The terms are also available at
|
2010-09-30 10:17:41 +00:00
|
|
|
# http://www.hardcoded.net/licenses/bsd_license
|
2009-06-18 19:42:27 +00:00
|
|
|
|
2009-06-01 09:55:11 +00:00
|
|
|
import logging
|
|
|
|
import multiprocessing
|
2011-03-04 10:15:04 +00:00
|
|
|
from itertools import combinations
|
2009-06-01 09:55:11 +00:00
|
|
|
|
2011-03-04 10:15:04 +00:00
|
|
|
from hscommon.util import extract
|
2011-01-18 16:33:33 +00:00
|
|
|
from hscommon.trans import tr
|
2010-11-20 11:42:15 +00:00
|
|
|
from jobprogress import job
|
2009-06-01 09:55:11 +00:00
|
|
|
|
2009-12-30 10:37:57 +00:00
|
|
|
from core.engine import Match
|
2009-06-18 19:42:27 +00:00
|
|
|
from .block import avgdiff, DifferentBlockCountError, NoBlocksError
|
|
|
|
from .cache import Cache
|
2009-06-01 09:55:11 +00:00
|
|
|
|
2011-03-04 10:15:04 +00:00
|
|
|
# OPTIMIZATION NOTES:
|
|
|
|
# The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another
|
|
|
|
# bottleneck that shows up when a lot of pictures are involved is Disk IO's because blocks
|
|
|
|
# constantly have to be read from disks by subprocesses. This problem is especially big on CPUs
|
|
|
|
# with a lot of cores. Therefore, we must minimize Disk IOs. The best way to achieve that is to
|
|
|
|
# separate the files to scan in "chunks" and it's by chunk that blocks are read in memory and
|
|
|
|
# compared to each other. Each file in a chunk has to be compared to each other, of course, but also
|
|
|
|
# to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage
|
|
|
|
# is that instead of reading blocks from disk number_of_files**2 times, we read it
|
|
|
|
# number_of_files*number_of_chunks times.
|
|
|
|
# Determining the right chunk size is tricky, because if it's too big, too many blocks will be in
|
|
|
|
# memory at the same time and we might end up with memory thrashing, which is awfully slow. So,
|
|
|
|
# because our *real* bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't
|
|
|
|
# starved by Disk IOs.
|
|
|
|
|
2009-06-01 09:55:11 +00:00
|
|
|
# Passed to avgdiff() as its last argument when comparing two block lists.
MIN_ITERATIONS = 3
# Passed to picture.get_blocks() when a picture's blocks are not cached yet.
BLOCK_COUNT_PER_SIDE = 15
# Chunk sizing parameters used by get_chunks().
DEFAULT_CHUNK_SIZE = 1000
MIN_CHUNK_SIZE = 100

# Maximum number of pending async results kept around before the main process
# stops to collect them. cpucount+1 should be enough to be sure that the spawned
# processes will not wait after the results collection made by the main process.
#
# Start from an arbitrary value: determining the cpu count failed with an IOError
# on app launch once. It seems to be a freak occurrence, but in any case we want
# the app to launch.
RESULTS_QUEUE_LIMIT = 8
try:
    RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() + 1
except Exception:
    logging.warning("Had problems to determine cpu count on launch.")
|
2009-10-03 15:37:53 +00:00
|
|
|
|
2011-03-04 10:15:04 +00:00
|
|
|
def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
    """Make sure the blocks of every picture are in the cache.

    For each picture, `picture.unicode_path` is set to `str(picture.path)` and, when
    that path is not in the cache yet, its blocks are computed with
    `picture.get_blocks(BLOCK_COUNT_PER_SIDE)` and stored under that path.

    :param pictures: pictures to analyze; each must expose `path`, `size`,
                     `dimensions` and `get_blocks()`.
    :param cache_path: path handed to `Cache` to open the block cache.
    :param with_dimensions: when true, `picture.dimensions` is read for every picture
                            so that it is already loaded by the time it is needed.
    :param j: job used to report progress.
    :returns: the list of pictures for which there was no error getting blocks.
    """
    # The MemoryError handlers in there use logging without first caring about whether or not
    # there is enough memory left to carry on the operation because it is assumed that the
    # MemoryError happens when trying to read an image file, which is freed from memory by the
    # time that MemoryError is raised.
    cache = Cache(cache_path)
    prepared = []  # only pictures for which there was no error getting blocks
    try:
        for picture in j.iter_with_progress(pictures, tr("Analyzed %d/%d pictures")):
            picture.unicode_path = str(picture.path)
            logging.debug("Analyzing picture at {}".format(picture.unicode_path))
            if with_dimensions:
                picture.dimensions  # pre-read dimensions
            try:
                if picture.unicode_path not in cache:
                    blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
                    cache[picture.unicode_path] = blocks
                prepared.append(picture)
            except (IOError, ValueError) as e:
                # The picture is simply left out of the result.
                logging.warning(str(e))
            except MemoryError:
                logging.warning('Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
                if picture.size < 10 * 1024 * 1024:  # We're really running out of memory
                    raise
    except MemoryError:
        # Stop analyzing, but still return what has been prepared so far.
        logging.warning('Ran out of memory while preparing pictures')
    finally:
        # Close the cache even when an unexpected exception propagates out of the loop.
        cache.close()
    return prepared
|
|
|
|
|
2011-03-04 10:15:04 +00:00
|
|
|
def get_chunks(pictures):
    """Split `pictures` into consecutive slices ("chunks") of equal size.

    The target number of chunks is `len(pictures) // DEFAULT_CHUNK_SIZE`, but at least
    twice the cpu count. The resulting chunk size is never smaller than
    `MIN_CHUNK_SIZE`, so small inputs produce fewer chunks than the target.

    :param pictures: a sliceable sequence of pictures.
    :returns: a list of slices of `pictures`; empty when `pictures` is empty.
    """
    try:
        cpu_count = multiprocessing.cpu_count()
    except (NotImplementedError, OSError):
        # The cpu count can't always be determined (the module-level setup of
        # RESULTS_QUEUE_LIMIT guards against the same problem). Assume one core.
        cpu_count = 1
    picture_count = len(pictures)
    min_chunk_count = cpu_count * 2  # have enough chunks to feed all subprocesses
    chunk_count = max(min_chunk_count, picture_count // DEFAULT_CHUNK_SIZE)
    chunk_size = max(MIN_CHUNK_SIZE, (picture_count // chunk_count) + 1)
    chunks = [pictures[i:i+chunk_size] for i in range(0, picture_count, chunk_size)]
    # Log the number of chunks actually created: because of the MIN_CHUNK_SIZE clamp it can
    # be much lower than the target `chunk_count`.
    logging.info("Creating {} chunks with a chunk size of {} for {} pictures".format(
        len(chunks), chunk_size, picture_count))
    return chunks
|
|
|
|
|
2009-10-18 09:26:04 +00:00
|
|
|
def get_match(first, second, percentage):
    """Return a `Match` for `first` and `second`, with negative percentages raised to 0."""
    clamped = 0 if percentage < 0 else percentage
    return Match(first, second, clamped)
|
2009-06-01 09:55:11 +00:00
|
|
|
|
2011-03-04 10:15:04 +00:00
|
|
|
def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
    """Compare the cached blocks of two groups of pictures and return the matching pairs.

    The list of ids in `ref_ids` have to be compared to the list of ids in `other_ids`.
    `other_ids` can be None. In this case, `ref_ids` has to be compared with itself.

    A pair is skipped when both pictures are flagged as reference or when their
    dimensions differ. The match percentage is `100 - avgdiff(...)`, or 0 when the
    blocks can't be compared (`DifferentBlockCountError`, `NoBlocksError`).

    :param ref_ids: cache ids of the first group.
    :param other_ids: cache ids of the second group, or None.
    :param dbname: path handed to `Cache` to open the block cache.
    :param threshold: minimum percentage for a pair to be reported.
    :param picinfo: a dictionary {pic_id: (dimensions, is_ref)} covering every id involved.
    :returns: a list of `(ref_id, other_id, percentage)` tuples.
    """
    cache = Cache(dbname)
    try:
        limit = 100 - threshold
        ref_pairs = list(cache.get_multiple(ref_ids))
        # The pairs are generated lazily: they are only iterated over once, so there is no
        # need to hold the whole cross product in memory.
        if other_ids is not None:
            other_pairs = list(cache.get_multiple(other_ids))
            comparisons_to_do = ((r, o) for r in ref_pairs for o in other_pairs)
        else:
            comparisons_to_do = combinations(ref_pairs, 2)
        results = []
        for (ref_id, ref_blocks), (other_id, other_blocks) in comparisons_to_do:
            ref_dimensions, ref_is_ref = picinfo[ref_id]
            other_dimensions, other_is_ref = picinfo[other_id]
            if ref_is_ref and other_is_ref:
                continue
            if ref_dimensions != other_dimensions:
                continue
            try:
                diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
                percentage = 100 - diff
            except (DifferentBlockCountError, NoBlocksError):
                percentage = 0
            if percentage >= threshold:
                results.append((ref_id, other_id, percentage))
    finally:
        # Close the cache even if a comparison raises unexpectedly.
        cache.close()
    return results
|
|
|
|
|
2010-01-14 15:14:26 +00:00
|
|
|
def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nulljob):
    """Return the list of `Match` found among `pictures`.

    The work is done in three phases: the blocks of every picture are put in the cache
    (`prepare_pictures()`), then chunks of pictures are compared to each other in a pool
    of subprocesses (`async_compare()`), and finally the raw results are verified and
    converted to `Match` instances.

    :param pictures: the pictures to scan.
    :param cache_path: path of the block cache, also handed to the subprocesses.
    :param threshold: minimum match percentage for a pair to be returned.
    :param match_scaled: when false, only pictures with equal `dimensions` are matched;
                         when true, dimensions are ignored.
    :param j: job used to report progress.
    :returns: a list of matches built with `get_match()`.
    """
    # How long (in seconds) to block on a pending result before polling again.
    RESULT_WAIT_TIMEOUT = 0.05

    def get_picinfo(p):
        # With match_scaled, every picture gets the same (None) dimensions so that the
        # dimensions check in async_compare() never rejects a pair.
        if match_scaled:
            return (None, p.is_ref)
        else:
            return (p.dimensions, p.is_ref)

    def collect_results(collect_all=False):
        # collect results and wait until the queue is small enough to accommodate new results.
        nonlocal async_results, matches, comparison_count
        limit = 0 if collect_all else RESULTS_QUEUE_LIMIT
        while len(async_results) > limit:
            ready, working = extract(lambda r: r.ready(), async_results)
            if not ready:
                # Nothing to collect yet. Block on a pending result for a short while
                # rather than spinning, which would steal CPU from the subprocesses.
                async_results[0].wait(RESULT_WAIT_TIMEOUT)
                continue
            for result in ready:
                matches += result.get()
                async_results.remove(result)
                comparison_count += 1
        progress_msg = tr("Performed %d/%d chunk matches") % (comparison_count, len(comparisons_to_do))
        j.set_progress(comparison_count, progress_msg)

    j = j.start_subjob([3, 7])
    pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)
    j = j.start_subjob([9, 1], tr("Preparing for matching"))
    cache = Cache(cache_path)
    id2picture = {}
    try:
        for picture in pictures:
            try:
                picture.cache_id = cache.get_id(picture.unicode_path)
                id2picture[picture.cache_id] = picture
            except ValueError:
                # No id could be obtained for this picture: it gets no cache_id and is
                # filtered out below.
                pass
    finally:
        cache.close()
    pictures = [p for p in pictures if hasattr(p, 'cache_id')]
    async_results = []
    matches = []
    chunks = get_chunks(pictures)
    # We add a None element at the end of the chunk list because each chunk has to be compared
    # with itself. Thus, each chunk will show up as a ref_chunk having other_chunk set to None once.
    comparisons_to_do = list(combinations(chunks + [None], 2))
    comparison_count = 0
    j.start_job(len(comparisons_to_do))
    pool = multiprocessing.Pool()
    try:
        for ref_chunk, other_chunk in comparisons_to_do:
            picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}
            ref_ids = [p.cache_id for p in ref_chunk]
            if other_chunk is not None:
                other_ids = [p.cache_id for p in other_chunk]
                picinfo.update({p.cache_id: get_picinfo(p) for p in other_chunk})
            else:
                other_ids = None
            args = (ref_ids, other_ids, cache_path, threshold, picinfo)
            async_results.append(pool.apply_async(async_compare, args))
            collect_results()
        collect_results(collect_all=True)
    except BaseException:
        # Don't leave subprocesses working on results nobody will collect.
        pool.terminate()
        raise
    pool.close()

    result = []
    for ref_id, other_id, percentage in j.iter_with_progress(matches, tr("Verified %d/%d matches"), every=10):
        ref = id2picture[ref_id]
        other = id2picture[other_id]
        # A 100% match is reserved for pictures having the same md5.
        if percentage == 100 and ref.md5 != other.md5:
            percentage = 99
        if percentage >= threshold:
            ref.dimensions  # pre-read dimensions for display in results
            other.dimensions
            result.append(get_match(ref, other, percentage))
    return result
|
2009-06-01 09:55:11 +00:00
|
|
|
|
|
|
|
# Runs every time this module is imported. Per the multiprocessing documentation,
# freeze_support() is what lets a frozen executable start its subprocesses
# (NOTE(review): presumably needed for the packaged Windows build; confirm).
multiprocessing.freeze_support()
|