dupeguru/core_pe/matchbase.py

# Created By: Virgil Dupras
# Created On: 2007/02/25
# Copyright 2010 Hardcoded Software (http://www.hardcoded.net)
# 
# This software is licensed under the "HS" License as described in the "LICENSE" file, 
# which should be included with this package. The terms are also available at 
# http://www.hardcoded.net/licenses/hs_license

import logging
import multiprocessing
from Queue import Empty
from collections import defaultdict

from hsutil import job
from hsutil.misc import dedupe

from core.engine import Match
from .block import avgdiff, DifferentBlockCountError, NoBlocksError
from .cache import Cache

# Passed by async_compare() as the last argument of avgdiff().
# NOTE(review): presumably the minimum number of block comparisons avgdiff() performs before it
# is allowed to give up early -- confirm against block.avgdiff.
MIN_ITERATIONS = 3
# Passed by prepare_pictures() to picture.get_blocks().
# NOTE(review): presumably each picture is analyzed as a grid of
# BLOCK_COUNT_PER_SIDE x BLOCK_COUNT_PER_SIDE blocks -- confirm against get_blocks().
BLOCK_COUNT_PER_SIDE = 15

# Once more than this many async results are pending, getmatches() starts collecting the oldest
# one on each iteration of its matching loop instead of letting them all accumulate.
# Enough so that we're sure that the main thread will not wait after a result.get() call
# cpucount*2 should be enough to be sure that the spawned process will not wait after the results
# collection made by the main process.
RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() * 2

def prepare_pictures(pictures, cache_path, j=job.nulljob):
    """Make sure the blocks of every picture are stored in the cache at ``cache_path``.

    Each picture gets a ``unicode_path`` attribute (``unicode(picture.path)``). When that path
    is not in the cache yet, the picture's blocks are computed with
    ``picture.get_blocks(BLOCK_COUNT_PER_SIDE)`` and stored in the cache under that path.

    pictures: iterable of picture objects. This function reads their ``path``, ``dimensions``
        and ``size`` attributes and calls their ``get_blocks()`` method.
    cache_path: path handed to ``Cache`` to open the blocks cache.
    j: job object; its ``iter_with_progress()`` is used to report progress.

    Returns the list of pictures that were processed without error. A picture raising
    ``IOError`` is logged and skipped. A ``MemoryError`` on a picture of 10 MB or more is logged
    and the picture skipped; on a smaller picture, the whole preparation stops early and the
    pictures prepared so far are returned.

    The cache is always closed before leaving, even when an unexpected exception propagates.
    """
    # The MemoryError handlers in there use logging without first caring about whether or not
    # there is enough memory left to carry on the operation because it is assumed that the
    # MemoryError happens when trying to read an image file, which is freed from memory by the
    # time that MemoryError is raised.
    cache = Cache(cache_path)
    prepared = [] # only pictures for which there was no error getting blocks
    try:
        try:
            for picture in j.iter_with_progress(pictures, 'Analyzed %d/%d pictures'):
                # Attribute access for its side effect only. NOTE(review): presumably this
                # makes the picture read/cache its dimensions now -- confirm on the picture
                # class. Being outside the inner try, an IOError raised here is not caught.
                picture.dimensions
                picture.unicode_path = unicode(picture.path)
                try:
                    if picture.unicode_path not in cache:
                        blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
                        cache[picture.unicode_path] = blocks
                    prepared.append(picture)
                except IOError as e:
                    logging.warning(unicode(e))
                except MemoryError:
                    logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
                    if picture.size < 10 * 1024 * 1024: # We're really running out of memory
                        raise
        except MemoryError:
            logging.warning('Ran out of memory while preparing pictures')
    finally:
        # Release the cache whatever happened above (e.g. an exception raised by the job
        # object or by a picture attribute), not only on the handled error paths.
        cache.close()
    return prepared

def get_match(first, second, percentage):
    """Return a ``Match`` for ``first`` and ``second``, never with a negative percentage."""
    # A percentage below zero is replaced by 0; any other value is passed through untouched.
    floored = 0 if percentage < 0 else percentage
    return Match(first, second, floored)

def async_compare(ref_id, other_ids, dbname, threshold):
    """Compare the cached blocks of ``ref_id`` with those of each id in ``other_ids``.

    getmatches() submits this function to a ``multiprocessing.Pool``, which is why it opens its
    own ``Cache`` from ``dbname`` instead of receiving a cache object.

    ref_id: cache id of the reference picture.
    other_ids: cache ids of the pictures to compare the reference with.
    dbname: path handed to ``Cache`` to open the blocks cache.
    threshold: minimum match percentage for a pair to be reported.

    Returns a list of ``(ref_id, other_id, percentage)`` tuples, one for each pair whose
    percentage (``100 - avgdiff(...)``) is at least ``threshold``. A pair for which avgdiff()
    raises ``DifferentBlockCountError`` or ``NoBlocksError`` is given a percentage of 0.

    The cache is always closed before leaving, even when an unexpected exception propagates.
    """
    cache = Cache(dbname)
    try:
        # The biggest average difference still compatible with ``threshold``; handed to
        # avgdiff() together with MIN_ITERATIONS.
        limit = 100 - threshold
        ref_blocks = cache[ref_id]
        pairs = cache.get_multiple(other_ids)
        results = []
        # Consume ``pairs`` while the cache is still open, in case it is lazily fetched.
        for other_id, other_blocks in pairs:
            try:
                diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
                percentage = 100 - diff
            except (DifferentBlockCountError, NoBlocksError):
                percentage = 0
            if percentage >= threshold:
                results.append((ref_id, other_id, percentage))
    finally:
        cache.close()
    return results
    
def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nulljob):
    """Return the list of ``Match`` objects found among ``pictures``.

    The work is done in three phases:

    1. prepare_pictures() makes sure every picture has its blocks in the cache.
    2. Every picture is compared with the pictures that come after it, through
       async_compare() calls dispatched to a ``multiprocessing.Pool``.
    3. The resulting pairs are turned into ``Match`` objects. A pair scoring 100 whose md5
       differ is downgraded to 99.

    pictures: iterable of picture objects. Besides what prepare_pictures() needs, this function
        reads their ``dimensions``, ``is_ref`` and ``md5`` attributes and sets ``cache_id``.
    cache_path: path handed to ``Cache`` (here and in the worker processes).
    threshold: minimum match percentage for a pair to be part of the result.
    match_scaled: when false, only pictures having equal ``dimensions`` are compared together.
    j: job object used for progress reporting.

    The worker pool is closed and joined when matching is over, and terminated if matching is
    interrupted by an exception, so that no worker process is left behind.
    """
    # We limit the number of cache_ids we send for multi-processing because otherwise, we
    # might get an error saying "String or BLOB exceeded size limit"
    ARG_LIMIT = 1000

    j = j.start_subjob([3, 7])
    pictures = prepare_pictures(pictures, cache_path, j)
    j = j.start_subjob([9, 1], 'Preparing for matching')
    cache = Cache(cache_path)
    id2picture = {}
    dimensions2pictures = defaultdict(set)
    try:
        for picture in pictures:
            try:
                picture.cache_id = cache.get_id(picture.unicode_path)
                id2picture[picture.cache_id] = picture
                if not match_scaled:
                    dimensions2pictures[picture.dimensions].add(picture)
            except ValueError:
                # NOTE(review): presumably raised by get_id() for a path the cache doesn't
                # know -- confirm against Cache. Such a picture is left out of the matching.
                pass
    finally:
        cache.close()
    pictures = [p for p in pictures if hasattr(p, 'cache_id')]
    pool = multiprocessing.Pool()
    async_results = []
    matches = []
    pictures_copy = set(pictures)
    try:
        for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):
            others = pictures_copy if match_scaled else dimensions2pictures[ref.dimensions]
            # Taking ref out of the shared set is what prevents a pair from being compared a
            # second time when its other member becomes the reference.
            others.remove(ref)
            if ref.is_ref:
                # Don't spend time comparing two ref pics together.
                others = [pic for pic in others if not pic.is_ref]
            if others:
                cache_ids = [f.cache_id for f in others]
                for start in range(0, len(cache_ids), ARG_LIMIT):
                    chunk = cache_ids[start:start + ARG_LIMIT]
                    args = (ref.cache_id, chunk, cache_path, threshold)
                    async_results.append(pool.apply_async(async_compare, args))
            if len(async_results) > RESULTS_QUEUE_LIMIT:
                # Collect the oldest pending result so that they don't all pile up.
                oldest = async_results.pop(0)
                matches.extend(oldest.get())
        for pending in async_results: # process the rest of the results
            matches.extend(pending.get())
    except BaseException:
        # Something went wrong (or the job was interrupted): don't wait for queued tasks.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # join() is only allowed after close() or terminate(); both paths above ensure that.
        pool.join()

    verified = []
    for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
        ref = id2picture[ref_id]
        other = id2picture[other_id]
        if percentage == 100 and ref.md5 != other.md5:
            # Identical blocks but different file contents: not a perfect match.
            percentage = 99
        if percentage >= threshold:
            verified.append(get_match(ref, other, percentage))
    return verified

# Runs when this module is imported, not under an ``if __name__ == '__main__':`` guard.
# Per the multiprocessing documentation, freeze_support() is what lets a program frozen into a
# Windows executable spawn worker processes; it has no effect otherwise.
# NOTE(review): presumably here because getmatches() creates a multiprocessing.Pool -- confirm
# that calling it at import time (rather than in the app's entry point) is intended.
multiprocessing.freeze_support()
pe py: Adjusted the code to its move from 'base/py' --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4080 2009-06-18 19:42:27 +00:00			`# Created By: Virgil Dupras`
			`# Created On: 2007/02/25`
Changed copyright year to 2010 2010-01-01 20:11:34 +00:00			`# Copyright 2010 Hardcoded Software (http://www.hardcoded.net)`
Relicensed to HS License. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40100 2009-08-05 08:59:46 +00:00			`#`
			`# This software is licensed under the "HS" License as described in the "LICENSE" file,`
			`# which should be included with this package. The terms are also available at`
			`# http://www.hardcoded.net/licenses/hs_license`
pe py: Adjusted the code to its move from 'base/py' --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4080 2009-06-18 19:42:27 +00:00
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`import logging`
			`import multiprocessing`
			`from Queue import Empty`
			`from collections import defaultdict`

			`from hsutil import job`
py: ignore/import adjustments --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%403 2009-06-06 12:09:02 +00:00			`from hsutil.misc import dedupe`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
Changed dupeguru and dupeguru_* external references to core and core_* references. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40278 2009-12-30 10:37:57 +00:00			`from core.engine import Match`
pe py: Adjusted the code to its move from 'base/py' --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4080 2009-06-18 19:42:27 +00:00			`from .block import avgdiff, DifferentBlockCountError, NoBlocksError`
			`from .cache import Cache`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
			`MIN_ITERATIONS = 3`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`BLOCK_COUNT_PER_SIDE = 15`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
[#58 state:fixed] Moved the async results collection into the same loops as the async filler phase to avoid getting memory errors. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40174 2009-10-03 15:37:53 +00:00			`# Enough so that we're sure that the main thread will not wait after a result.get() call`
			`# cpucount*2 should be enough to be sure that the spawned process will not wait after the results`
			`# collection made by the main process.`
			`RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() * 2`

Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`def prepare_pictures(pictures, cache_path, j=job.nulljob):`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`# The MemoryError handlers in there use logging without first caring about whether or not`
			`# there is enough memory left to carry on the operation because it is assumed that the`
			`# MemoryError happens when trying to read an image file, which is freed from memory by the`
			`# time that MemoryError is raised.`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache = Cache(cache_path)`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`prepared = [] # only pictures for which there was no error getting blocks`
			`try:`
			`for picture in j.iter_with_progress(pictures, 'Analyzed %d/%d pictures'):`
			`picture.dimensions`
			`picture.unicode_path = unicode(picture.path)`
			`try:`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`if picture.unicode_path not in cache:`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache[picture.unicode_path] = blocks`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`prepared.append(picture)`
			`except IOError as e:`
			`logging.warning(unicode(e))`
			`except MemoryError:`
			`logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))`
			`if picture.size < 10 * 1024 * 1024: # We're really running out of memory`
			`raise`
			`except MemoryError:`
			`logging.warning('Ran out of memory while preparing pictures')`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache.close()`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`return prepared`

			`def get_match(first, second, percentage):`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`if percentage < 0:`
			`percentage = 0`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`return Match(first, second, percentage)`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
			`def async_compare(ref_id, other_ids, dbname, threshold):`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache = Cache(dbname)`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`limit = 100 - threshold`
			`ref_blocks = cache[ref_id]`
			`pairs = cache.get_multiple(other_ids)`
			`results = []`
			`for other_id, other_blocks in pairs:`
			`try:`
			`diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)`
			`percentage = 100 - diff`
			`except (DifferentBlockCountError, NoBlocksError):`
			`percentage = 0`
			`if percentage >= threshold:`
			`results.append((ref_id, other_id, percentage))`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache.close()`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`return results`

Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nulljob):`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`def empty_out_queue(queue, into):`
			`try:`
			`while True:`
			`into.append(queue.get(block=False))`
			`except Empty:`
			`pass`

			`j = j.start_subjob([3, 7])`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`pictures = prepare_pictures(pictures, cache_path, j)`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`j = j.start_subjob([9, 1], 'Preparing for matching')`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache = Cache(cache_path)`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`id2picture = {}`
			`dimensions2pictures = defaultdict(set)`
			`for picture in pictures:`
			`try:`
			`picture.cache_id = cache.get_id(picture.unicode_path)`
			`id2picture[picture.cache_id] = picture`
			`if not match_scaled:`
			`dimensions2pictures[picture.dimensions].add(picture)`
			`except ValueError:`
			`pass`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache.close()`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`pictures = [p for p in pictures if hasattr(p, 'cache_id')]`
			`pool = multiprocessing.Pool()`
			`async_results = []`
			`matches = []`
			`pictures_copy = set(pictures)`
			`for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):`
			`others = pictures_copy if match_scaled else dimensions2pictures[ref.dimensions]`
			`others.remove(ref)`
[#77 state:fixed] Don't spend time comparing 2 ref files together. 2010-01-13 09:04:53 +00:00			`if ref.is_ref:`
			`# Don't spend time comparing two ref pics together.`
			`others = [pic for pic in others if not pic.is_ref]`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`if others:`
			`cache_ids = [f.cache_id for f in others]`
Limit the size of arguments sent to multiprocessing because it could cause crashes. 2010-04-05 08:15:33 +00:00			`# We limit the number of cache_ids we send for multi-processing because otherwise, we`
			`# might get an error saying "String or BLOB exceeded size limit"`
			`ARG_LIMIT = 1000`
			`while cache_ids:`
			`args = (ref.cache_id, cache_ids[:ARG_LIMIT], cache_path, threshold)`
			`async_results.append(pool.apply_async(async_compare, args))`
			`cache_ids = cache_ids[ARG_LIMIT:]`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`if len(async_results) > RESULTS_QUEUE_LIMIT:`
			`result = async_results.pop(0)`
			`matches.extend(result.get())`
[#73 state:port] Fixed a bug causing some matches to be ignored in the new pe match algo. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40212 2009-10-24 13:54:57 +00:00			`for result in async_results: # process the rest of the results`
			`matches.extend(result.get())`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00
			`result = []`
			`for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):`
			`ref = id2picture[ref_id]`
			`other = id2picture[other_id]`
			`if percentage == 100 and ref.md5 != other.md5:`
			`percentage = 99`
			`if percentage >= threshold:`
			`result.append(get_match(ref, other, percentage))`
			`return result`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
			`multiprocessing.freeze_support()`