mirror of https://github.com/arsenetar/dupeguru.git
dupeguru/pe/py/matchbase.py

141 lines · 5.6 KiB · Python

# Created By: Virgil Dupras
# Created On: 2007/02/25
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import logging
import multiprocessing
from Queue import Empty
from collections import defaultdict
from hsutil import job
from hsutil.misc import dedupe
from dupeguru.engine import Match
from .block import avgdiff, DifferentBlockCountError, NoBlocksError
from .cache import Cache
MIN_ITERATIONS = 3
# Enough in-flight jobs that the main thread rarely blocks on a result.get() call;
# cpu_count() * 2 should also keep the spawned processes busy while the main process
# collects results.
RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() * 2
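# For example, with 4 CPU cores RESULTS_QUEUE_LIMIT is 8: _do_getmatches() below keeps
# at most 8 comparison jobs in flight and collects the oldest result once the limit
# is exceeded, so the workers and the main process stay busy in parallel.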

def get_match(first, second, percentage):
    if percentage < 0: # a diff above 100 would otherwise yield a negative percentage
        percentage = 0
    return Match(first, second, percentage)

class MatchFactory(object):
    cached_blocks = None # Cache instance; assigned by the application before matching
    block_count_per_side = 15
    threshold = 75
    match_scaled = False

    def _do_getmatches(self, files, j):
        raise NotImplementedError()
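
    # A minimal serial implementation might look like this (hypothetical sketch,
    # not dupeGuru code; AsyncMatchFactory below is the real implementation):
    #
    #     class SerialMatchFactory(MatchFactory):
    #         def _do_getmatches(self, files, j):
    #             matches = []
    #             for i, first in enumerate(files):
    #                 for second in files[i + 1:]:
    #                     try:
    #                         diff = avgdiff(self.cached_blocks[first.unicode_path],
    #                                        self.cached_blocks[second.unicode_path],
    #                                        100 - self.threshold, MIN_ITERATIONS)
    #                         percentage = 100 - diff
    #                     except (DifferentBlockCountError, NoBlocksError):
    #                         continue
    #                     if percentage >= self.threshold:
    #                         matches.append(get_match(first, second, percentage))
    #             return matches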

    def getmatches(self, files, j=job.nulljob):
        # The MemoryError handlers below log without first checking whether there is
        # enough memory left to do so, because the MemoryError is assumed to happen
        # while reading an image file, which has been freed from memory by the time
        # the MemoryError is raised.
        j = j.start_subjob([3, 7])
        logging.info('Preparing %d files' % len(files))
        prepared = self.prepare_files(files, j)
        logging.info('Finished preparing %d files' % len(prepared))
        return self._do_getmatches(prepared, j)

    def prepare_files(self, files, j=job.nulljob):
        prepared = [] # only files for which there was no error getting blocks
        try:
            for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
                picture.dimensions # accessed for its side effect: reads the dimensions now
                picture.unicode_path = unicode(picture.path)
                try:
                    if picture.unicode_path not in self.cached_blocks:
                        blocks = picture.get_blocks(self.block_count_per_side)
                        self.cached_blocks[picture.unicode_path] = blocks
                    prepared.append(picture)
                except IOError as e:
                    logging.warning(unicode(e))
                except MemoryError:
                    logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
                    if picture.size < 10 * 1024 * 1024: # even a small file failed: we're really out of memory
                        raise
        except MemoryError:
            logging.warning('Ran out of memory while preparing files')
        return prepared
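
# Typical wiring (an illustrative sketch; the cache path and the app-level calls
# are assumptions, not dupeGuru code):
#     factory = AsyncMatchFactory()
#     factory.cached_blocks = Cache(u'/path/to/cached_blocks.db')
#     matches = factory.getmatches(pictures)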

def async_compare(ref_id, other_ids, dbname, threshold):
    cache = Cache(dbname, threaded=False)
    limit = 100 - threshold
    ref_blocks = cache[ref_id]
    pairs = cache.get_multiple(other_ids)
    results = []
    for other_id, other_blocks in pairs:
        try:
            diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
            percentage = 100 - diff
        except (DifferentBlockCountError, NoBlocksError):
            percentage = 0
        if percentage >= threshold:
            results.append((ref_id, other_id, percentage))
    cache.con.close()
    return results
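
# async_compare() runs in a pool worker process; that is why it opens and closes
# its own Cache connection (an sqlite connection can't be shared across processes).
# The main process fans out one such job per reference picture:
#     pool.apply_async(async_compare, (ref_id, other_ids, dbname, threshold))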

class AsyncMatchFactory(MatchFactory):
    def _do_getmatches(self, pictures, j):
        def empty_out_queue(queue, into):
            # Drains a queue without blocking; not used in this code path.
            try:
                while True:
                    into.append(queue.get(block=False))
            except Empty:
                pass

        j = j.start_subjob([9, 1], 'Preparing for matching')
        cache = self.cached_blocks
        id2picture = {}
        dimensions2pictures = defaultdict(set)
        for picture in pictures:
            try:
                picture.cache_id = cache.get_id(picture.unicode_path)
                id2picture[picture.cache_id] = picture
                if not self.match_scaled:
                    dimensions2pictures[picture.dimensions].add(picture)
            except ValueError: # not in the cache (skipped during preparation)
                pass
        pictures = [p for p in pictures if hasattr(p, 'cache_id')]
        pool = multiprocessing.Pool()
        async_results = []
        matches = []
        pictures_copy = set(pictures)
        for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):
            others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
            # Removing ref from the shared set ensures each unordered pair is compared only once.
            others.remove(ref)
            if others:
                cache_ids = [f.cache_id for f in others]
                args = (ref.cache_id, cache_ids, self.cached_blocks.dbname, self.threshold)
                async_results.append(pool.apply_async(async_compare, args))
                if len(async_results) > RESULTS_QUEUE_LIMIT:
                    result = async_results.pop(0)
                    matches.extend(result.get())
        for async_result in async_results: # collect the last few results still in flight
            matches.extend(async_result.get())

        result = []
        for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
            ref = id2picture[ref_id]
            other = id2picture[other_id]
            if percentage == 100 and ref.md5 != other.md5:
                # Only bit-for-bit identical files (same md5) get a 100% match.
                percentage = 99
            if percentage >= self.threshold:
                result.append(get_match(ref, other, percentage))
        return result

multiprocessing.freeze_support() # lets multiprocessing work in frozen (e.g. py2exe) builds