dupeguru/core_pe/matchbase.py

# Created By: Virgil Dupras
# Created On: 2007/02/25
# Copyright 2010 Hardcoded Software (http://www.hardcoded.net)
# 
# This software is licensed under the "BSD" License as described in the "LICENSE" file, 
# which should be included with this package. The terms are also available at 
# http://www.hardcoded.net/licenses/bsd_license

import logging
import multiprocessing
from itertools import combinations

from hscommon.util import extract
from hscommon.trans import tr
from jobprogress import job

from core.engine import Match
from .block import avgdiff, DifferentBlockCountError, NoBlocksError
from .cache import Cache

# OPTIMIZATION NOTES:
# The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another
# bottleneck that shows up when a lot of pictures are involved is Disk IO's because blocks
# constantly have to be read from disks by subprocesses. This problem is especially big on CPUs
# with a lot of cores. Therefore, we must minimize Disk IOs. The best way to achieve that is to
# separate the files to scan in "chunks" and it's by chunk that blocks are read in memory and
# compared to each other. Each file in a chunk has to be compared to each other, of course, but also
# to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage
# is that instead of reading blocks from disk number_of_files**2 times, we read it
# number_of_files*number_of_chunks times.
# Determining the right chunk size is tricky, because if it's too big, too many blocks will be in
# memory at the same time and we might end up with memory thrashing, which is awfully slow. So,
# because our *real* bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't
# starved by Disk IOs.

# Passed as the last argument to avgdiff() when two pictures' blocks are compared.
MIN_ITERATIONS = 3
# Passed to picture.get_blocks() when a picture's blocks are not in the cache yet.
BLOCK_COUNT_PER_SIDE = 15
# Target number of pictures per chunk; get_chunks() divides the picture count by it.
DEFAULT_CHUNK_SIZE = 1000
# Lower bound that get_chunks() applies to the computed chunk size.
MIN_CHUNK_SIZE = 100

# Maximum number of pending async results that getmatches() keeps around before it stops queuing
# new comparisons and collects finished ones.
# cpucount+1 should be enough to be sure that the spawned processes never sit idle waiting for
# the main process to collect their results.
try:
    RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() + 1
except Exception:
    # I had an IOError on app launch once. It seems to be a freak occurrence. In any case, we want
    # the app to launch, so let's just put an arbitrary value.
    logging.warning("Had problems to determine cpu count on launch.")
    RESULTS_QUEUE_LIMIT = 8

def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
    # Makes sure that the blocks of every picture in `pictures` are stored in the cache located at
    # `cache_path` and returns the list of pictures for which that worked (the others are logged
    # and skipped). As a side effect, each processed picture gets a `unicode_path` attribute.
    # When `with_dimensions` is true, `picture.dimensions` is also accessed so that it is pre-read.
    # `j` is the job used for progress reporting.
    #
    # The MemoryError handlers in there use logging without first caring about whether or not
    # there is enough memory left to carry on the operation because it is assumed that the
    # MemoryError happens when trying to read an image file, which is freed from memory by the
    # time that MemoryError is raised.
    cache = Cache(cache_path)
    prepared = [] # only pictures for which there was no error getting blocks
    try:
        for picture in j.iter_with_progress(pictures, tr("Analyzed %d/%d pictures")):
            picture.unicode_path = str(picture.path)
            logging.debug("Analyzing picture at %s", picture.unicode_path)
            if with_dimensions:
                picture.dimensions # pre-read dimensions
            try:
                if picture.unicode_path not in cache:
                    blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
                    cache[picture.unicode_path] = blocks
                prepared.append(picture)
            except (IOError, ValueError) as e:
                logging.warning(str(e))
            except MemoryError:
                logging.warning(
                    'Ran out of memory while reading %s of size %d',
                    picture.unicode_path, picture.size
                )
                if picture.size < 10 * 1024 * 1024: # We're really running out of memory
                    raise
    except MemoryError:
        # Give up on the remaining pictures but keep what has been prepared so far.
        logging.warning('Ran out of memory while preparing pictures')
    finally:
        # Close the cache even when something other than MemoryError (a cancelled job, for
        # instance) propagates out of the loop.
        cache.close()
    return prepared

def get_chunks(pictures):
    # Splits `pictures` into a list of consecutive slices ("chunks") and returns that list.
    # The chunk size aims at DEFAULT_CHUNK_SIZE pictures per chunk, while trying to have at least
    # two chunks per CPU, but never goes below MIN_CHUNK_SIZE. An empty `pictures` yields [].
    try:
        cpu_count = multiprocessing.cpu_count()
    except Exception:
        # cpu_count() has been seen failing (see the guard around RESULTS_QUEUE_LIMIT). Fall back
        # on the value that guard settled on rather than aborting the whole scan.
        cpu_count = max(1, RESULTS_QUEUE_LIMIT - 1)
    min_chunk_count = cpu_count * 2 # have enough chunks to feed all subprocesses
    chunk_count = max(min_chunk_count, len(pictures) // DEFAULT_CHUNK_SIZE)
    chunk_size = max(MIN_CHUNK_SIZE, (len(pictures) // chunk_count) + 1)
    chunks = [pictures[i:i+chunk_size] for i in range(0, len(pictures), chunk_size)]
    # Log the number of chunks actually created: because of MIN_CHUNK_SIZE, it can be lower than
    # chunk_count.
    logging.info("Creating {} chunks with a chunk size of {} for {} pictures".format(
        len(chunks), chunk_size, len(pictures)))
    return chunks

def get_match(first, second, percentage):
    # Builds the Match for `first` and `second`, flooring negative percentages at 0.
    return Match(first, second, max(percentage, 0))

def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
    # The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids
    # can be None. In this case, ref_ids has to be compared with itself
    # picinfo is a dictionary {pic_id: (dimensions, is_ref)}
    # dbname is the path of the blocks cache to read from.
    # Returns a list of (ref_id, other_id, percentage) for every compared pair whose match
    # percentage is at least `threshold`. Pairs where both pictures are references, or where
    # dimensions differ, are skipped without being compared.
    cache = Cache(dbname)
    try:
        ref_pairs = list(cache.get_multiple(ref_ids))
        if other_ids is not None:
            other_pairs = list(cache.get_multiple(other_ids))
            # Lazy on purpose: two full chunks produce chunk_size**2 pairs, which we only need
            # to walk through once.
            comparisons_to_do = ((r, o) for r in ref_pairs for o in other_pairs)
        else:
            comparisons_to_do = combinations(ref_pairs, 2)
        limit = 100 - threshold
        results = []
        for (ref_id, ref_blocks), (other_id, other_blocks) in comparisons_to_do:
            ref_dimensions, ref_is_ref = picinfo[ref_id]
            other_dimensions, other_is_ref = picinfo[other_id]
            if ref_is_ref and other_is_ref:
                continue
            if ref_dimensions != other_dimensions:
                continue
            try:
                diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
                percentage = 100 - diff
            except (DifferentBlockCountError, NoBlocksError):
                # Blocks that can't be compared count as "no match at all".
                percentage = 0
            if percentage >= threshold:
                results.append((ref_id, other_id, percentage))
        return results
    finally:
        # Don't leave the cache open in the worker process if something goes wrong.
        cache.close()
    
def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nulljob):
    # Compares `pictures` with each other and returns a list of Match objects for every pair
    # whose match percentage is at least `threshold`.
    # cache_path is the path of the blocks cache, which is filled by prepare_pictures().
    # When match_scaled is false, only pictures with the same dimensions are compared.
    # j is the job used for progress reporting.
    # The comparisons themselves are performed chunk by chunk in a multiprocessing pool.
    def get_picinfo(p):
        # (dimensions, is_ref) as expected by async_compare(). Dimensions are left out (None) when
        # scaled pictures are allowed to match, so that they are never a reason to skip a pair.
        dimensions = None if match_scaled else p.dimensions
        return (dimensions, p.is_ref)
    
    def collect_results(collect_all=False):
        # collect results and wait until the queue is small enough to accommodate a new result.
        nonlocal comparison_count
        limit = 0 if collect_all else RESULTS_QUEUE_LIMIT
        while len(async_results) > limit:
            ready = [r for r in async_results if r.ready()]
            if not ready:
                # Nothing finished yet. Block for a short while on a pending result instead of
                # spinning, so that the main process doesn't steal CPU time from the workers.
                async_results[0].wait(0.1)
                continue
            for result in ready:
                matches.extend(result.get())
                async_results.remove(result)
                comparison_count += 1
        progress_msg = tr("Performed %d/%d chunk matches") % (comparison_count, len(comparisons_to_do))
        j.set_progress(comparison_count, progress_msg)
    
    j = j.start_subjob([3, 7])
    pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)
    j = j.start_subjob([9, 1], tr("Preparing for matching"))
    cache = Cache(cache_path)
    id2picture = {}
    matchable = []
    try:
        for picture in pictures:
            try:
                picture.cache_id = cache.get_id(picture.unicode_path)
            except ValueError:
                # No id for this picture: leave it out. We track the kept pictures explicitly
                # rather than testing for a `cache_id` attribute, which could be a leftover.
                continue
            id2picture[picture.cache_id] = picture
            matchable.append(picture)
    finally:
        cache.close()
    pictures = matchable
    async_results = []
    matches = []
    chunks = get_chunks(pictures)
    # We add a None element at the end of the chunk list because each chunk has to be compared
    # with itself. Thus, each chunk will show up as a ref_chunk having other_chunk set to None once.
    comparisons_to_do = list(combinations(chunks + [None], 2))
    comparison_count = 0
    pool = multiprocessing.Pool()
    try:
        j.start_job(len(comparisons_to_do))
        for ref_chunk, other_chunk in comparisons_to_do:
            picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}
            ref_ids = [p.cache_id for p in ref_chunk]
            if other_chunk is not None:
                other_ids = [p.cache_id for p in other_chunk]
                picinfo.update({p.cache_id: get_picinfo(p) for p in other_chunk})
            else:
                other_ids = None
            args = (ref_ids, other_ids, cache_path, threshold, picinfo)
            async_results.append(pool.apply_async(async_compare, args))
            collect_results()
        collect_results(collect_all=True)
        pool.close()
    except BaseException:
        # Error or cancellation: don't let the workers chew through the remaining comparisons.
        pool.terminate()
        raise
    finally:
        # close() or terminate() has been called at this point, so this just reaps the workers.
        pool.join()
    
    result = []
    for ref_id, other_id, percentage in j.iter_with_progress(matches, tr("Verified %d/%d matches"), every=10):
        ref = id2picture[ref_id]
        other = id2picture[other_id]
        # Only files with identical contents are allowed to be reported as 100% matches.
        if percentage == 100 and ref.md5 != other.md5:
            percentage = 99
        if percentage >= threshold:
            ref.dimensions # pre-read dimensions for display in results
            other.dimensions
            result.append(get_match(ref, other, percentage))
    return result

# Per the multiprocessing documentation, this is needed for programs using multiprocessing that
# are frozen into a Windows executable; it is documented as having no effect otherwise.
multiprocessing.freeze_support()
pe py: Adjusted the code to its move from 'base/py' --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4080 2009-06-18 19:42:27 +00:00			`# Created By: Virgil Dupras`
			`# Created On: 2007/02/25`
Changed copyright year to 2010 2010-01-01 20:11:34 +00:00			`# Copyright 2010 Hardcoded Software (http://www.hardcoded.net)`
Relicensed to HS License. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40100 2009-08-05 08:59:46 +00:00			`#`
Re-licensed to BSD 2010-09-30 10:17:41 +00:00			`# This software is licensed under the "BSD" License as described in the "LICENSE" file,`
Relicensed to HS License. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40100 2009-08-05 08:59:46 +00:00			`# which should be included with this package. The terms are also available at`
Re-licensed to BSD 2010-09-30 10:17:41 +00:00			`# http://www.hardcoded.net/licenses/bsd_license`
pe py: Adjusted the code to its move from 'base/py' --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4080 2009-06-18 19:42:27 +00:00
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`import logging`
			`import multiprocessing`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`from itertools import combinations`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`from hscommon.util import extract`
[#32] Internationalized the core and localized it to french. 2011-01-18 16:33:33 +00:00			`from hscommon.trans import tr`
Adapted to the job-related code moving to the 'jobprogress' package. 2010-11-20 11:42:15 +00:00			`from jobprogress import job`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
Changed dupeguru and dupeguru_* external references to core and core_* references. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40278 2009-12-30 10:37:57 +00:00			`from core.engine import Match`
pe py: Adjusted the code to its move from 'base/py' --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4080 2009-06-18 19:42:27 +00:00			`from .block import avgdiff, DifferentBlockCountError, NoBlocksError`
			`from .cache import Cache`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`# OPTIMIZATION NOTES:`
			`# The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another`
			`# bottleneck that shows up when a lot of pictures are involved is Disk IO's because blocks`
			`# constantly have to be read from disks by subprocesses. This problem is especially big on CPUs`
			`# with a lot of cores. Therefore, we must minimize Disk IOs. The best way to achieve that is to`
			`# separate the files to scan in "chunks" and it's by chunk that blocks are read in memory and`
			`# compared to each other. Each file in a chunk has to be compared to each other, of course, but also`
			`# to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage`
			`# is that instead of reading blocks from disk number_of_files**2 times, we read it`
			`# number_of_files*number_of_chunks times.`
			`# Determining the right chunk size is tricky, bceause if it's too big, too many blocks will be in`
			`# memory at the same time and we might end up with memory trashing, which is awfully slow. So,`
			`# because our real bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't`
			`# starved by Disk IOs.`

Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`MIN_ITERATIONS = 3`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`BLOCK_COUNT_PER_SIDE = 15`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`DEFAULT_CHUNK_SIZE = 1000`
			`MIN_CHUNK_SIZE = 100`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
[#58 state:fixed] Moved the async results collection into the same loops as the async filler phase to avoid getting memory errors. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40174 2009-10-03 15:37:53 +00:00			`# Enough so that we're sure that the main thread will not wait after a result.get() call`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`# cpucount+1 should be enough to be sure that the spawned process will not wait after the results`
[#58 state:fixed] Moved the async results collection into the same loops as the async filler phase to avoid getting memory errors. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40174 2009-10-03 15:37:53 +00:00			`# collection made by the main process.`
Catch a random occurrence of an exception on multiprocessing.cpu_count() so that dupeGuru doesn't crash because of that. 2011-03-05 12:02:11 +00:00			`try:`
			`RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() + 1`
			`except Exception:`
			`# I had an IOError on app launch once. It seems to be a freak occurrence. In any case, we want`
			`# the app to launch, so let's just put an arbitrary value.`
			`logging.warning("Had problems to determine cpu count on launch.")`
			`RESULTS_QUEUE_LIMIT = 8`
[#58 state:fixed] Moved the async results collection into the same loops as the async filler phase to avoid getting memory errors. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40174 2009-10-03 15:37:53 +00:00
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`# The MemoryError handlers in there use logging without first caring about whether or not`
			`# there is enough memory left to carry on the operation because it is assumed that the`
			`# MemoryError happens when trying to read an image file, which is freed from memory by the`
			`# time that MemoryError is raised.`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache = Cache(cache_path)`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`prepared = [] # only pictures for which there was no error getting blocks`
			`try:`
[#32] Internationalized the core and localized it to french. 2011-01-18 16:33:33 +00:00			`for picture in j.iter_with_progress(pictures, tr("Analyzed %d/%d pictures")):`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`picture.unicode_path = str(picture.path)`
Changed error logging in core_pe.cache because it would sometimes result in huge logs of no value. Also, added debug logging during the analysis of pictures. 2011-01-29 10:31:17 +00:00			`logging.debug("Analyzing picture at {}".format(picture.unicode_path))`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`if with_dimensions:`
			`picture.dimensions # pre-read dimensions`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`try:`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`if picture.unicode_path not in cache:`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache[picture.unicode_path] = blocks`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`prepared.append(picture)`
[#96 state:fixed] Fixed a hard crash on calling get_blocks() with an empty path. 2010-07-14 07:36:35 +00:00			`except (IOError, ValueError) as e:`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`logging.warning(str(e))`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`except MemoryError:`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`logging.warning('Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`if picture.size < 10 * 1024 * 1024: # We're really running out of memory`
			`raise`
			`except MemoryError:`
			`logging.warning('Ran out of memory while preparing pictures')`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache.close()`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`return prepared`

Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`def get_chunks(pictures):`
			`min_chunk_count = multiprocessing.cpu_count() * 2 # have enough chunks to feed all subprocesses`
			`chunk_count = len(pictures) // DEFAULT_CHUNK_SIZE`
			`chunk_count = max(min_chunk_count, chunk_count)`
			`chunk_size = (len(pictures) // chunk_count) + 1`
			`chunk_size = max(MIN_CHUNK_SIZE, chunk_size)`
			`logging.info("Creating {} chunks with a chunk size of {} for {} pictures".format(`
			`chunk_count, chunk_size, len(pictures)))`
			`chunks = [pictures[i:i+chunk_size] for i in range(0, len(pictures), chunk_size)]`
			`return chunks`

dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`def get_match(first, second, percentage):`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`if percentage < 0:`
			`percentage = 0`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`return Match(first, second, percentage)`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):`
			`# The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids`
			`# can be None. In this case, ref_ids has to be compared with itself`
			`# picinfo is a dictionary {pic_id: (dimensions, is_ref)}`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache = Cache(dbname)`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`limit = 100 - threshold`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`ref_pairs = list(cache.get_multiple(ref_ids))`
			`if other_ids is not None:`
			`other_pairs = list(cache.get_multiple(other_ids))`
			`comparisons_to_do = [(r, o) for r in ref_pairs for o in other_pairs]`
			`else:`
			`comparisons_to_do = list(combinations(ref_pairs, 2))`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`results = []`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`for (ref_id, ref_blocks), (other_id, other_blocks) in comparisons_to_do:`
			`ref_dimensions, ref_is_ref = picinfo[ref_id]`
			`other_dimensions, other_is_ref = picinfo[other_id]`
			`if ref_is_ref and other_is_ref:`
			`continue`
			`if ref_dimensions != other_dimensions:`
			`continue`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`try:`
			`diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)`
			`percentage = 100 - diff`
			`except (DifferentBlockCountError, NoBlocksError):`
			`percentage = 0`
			`if percentage >= threshold:`
			`results.append((ref_id, other_id, percentage))`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache.close()`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`return results`

Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nulljob):`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`def get_picinfo(p):`
			`if match_scaled:`
			`return (None, p.is_ref)`
			`else:`
			`return (p.dimensions, p.is_ref)`

			`def collect_results(collect_all=False):`
			`# collect results and wait until the queue is small enough to accomodate a new results.`
			`nonlocal async_results, matches, comparison_count`
			`limit = 0 if collect_all else RESULTS_QUEUE_LIMIT`
			`while len(async_results) > limit:`
			`ready, working = extract(lambda r: r.ready(), async_results)`
			`for result in ready:`
			`matches += result.get()`
			`async_results.remove(result)`
Fixed bug causing PE progress report to be (very) wrong during matching. 2011-03-07 10:55:37 +00:00			`comparison_count += 1`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`progress_msg = tr("Performed %d/%d chunk matches") % (comparison_count, len(comparisons_to_do))`
			`j.set_progress(comparison_count, progress_msg)`

dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`j = j.start_subjob([3, 7])`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)`
[#32] Internationalized the core and localized it to french. 2011-01-18 16:33:33 +00:00			`j = j.start_subjob([9, 1], tr("Preparing for matching"))`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache = Cache(cache_path)`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`id2picture = {}`
			`for picture in pictures:`
			`try:`
			`picture.cache_id = cache.get_id(picture.unicode_path)`
			`id2picture[picture.cache_id] = picture`
			`except ValueError:`
			`pass`
Straightened out the blocks cache. Instead of having a single global threaded block cache in the app, there's just a cache path, and non-threaded caches are created when needed. Also, made Cache.clear() more robust (it will clear the cache even if the db is corrupted). 2010-01-14 15:14:26 +00:00			`cache.close()`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`pictures = [p for p in pictures if hasattr(p, 'cache_id')]`
			`pool = multiprocessing.Pool()`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`async_results = []`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`matches = []`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`chunks = get_chunks(pictures)`
			`# We add a None element at the end of the chunk list because each chunk has to be compared`
			`# with itself. Thus, each chunk will show up as a ref_chunk having other_chunk set to None once.`
			`comparisons_to_do = list(combinations(chunks + [None], 2))`
			`comparison_count = 0`
			`j.start_job(len(comparisons_to_do))`
			`for ref_chunk, other_chunk in comparisons_to_do:`
			`picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}`
			`ref_ids = [p.cache_id for p in ref_chunk]`
			`if other_chunk is not None:`
			`other_ids = [p.cache_id for p in other_chunk]`
			`picinfo.update({p.cache_id: get_picinfo(p) for p in other_chunk})`
			`else:`
			`other_ids = None`
			`args = (ref_ids, other_ids, cache_path, threshold, picinfo)`
			`async_results.append(pool.apply_async(async_compare, args))`
			`collect_results()`
			`collect_results(collect_all=True)`
			`pool.close()`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00
			`result = []`
[#32] Internationalized the core and localized it to french. 2011-01-18 16:33:33 +00:00			`for ref_id, other_id, percentage in j.iter_with_progress(matches, tr("Verified %d/%d matches"), every=10):`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`ref = id2picture[ref_id]`
			`other = id2picture[other_id]`
			`if percentage == 100 and ref.md5 != other.md5:`
			`percentage = 99`
			`if percentage >= threshold:`
Optimized the scanning process in PE. 2011-03-04 10:15:04 +00:00			`ref.dimensions # pre-read dimensions for display in results`
			`other.dimensions`
dgpe: adjusted to the MatchFactory removal. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193 2009-10-18 09:26:04 +00:00			`result.append(get_match(ref, other, percentage))`
			`return result`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
			`multiprocessing.freeze_support()`