diff --git a/base/py/scanner.py b/base/py/scanner.py
index 0ac41d23..39f1984a 100644
--- a/base/py/scanner.py
+++ b/base/py/scanner.py
@@ -99,7 +99,6 @@ class Scanner(object):
             g.prioritize(self._key_func, self._tie_breaker)
         return groups
 
-    match_factory = None
     match_similar_words = False
     min_match_percentage = 80
     mix_file_kind = True
diff --git a/pe/py/app_cocoa.py b/pe/py/app_cocoa.py
index 4880b2e2..fa619838 100644
--- a/pe/py/app_cocoa.py
+++ b/pe/py/app_cocoa.py
@@ -27,8 +27,9 @@ from hsutil.path import Path
 from hsutil.cocoa import as_fetch
 
 from dupeguru import app_cocoa, directories
-from . import data, matchbase
+from . import data
 from .cache import string_to_colors, Cache
+from .scanner import ScannerPE
 
 mainBundle = NSBundle.mainBundle()
 PictureBlocks = mainBundle.classNamed_('PictureBlocks')
@@ -126,11 +127,11 @@ class IPhotoLibrary(fs.Directory):
 class DupeGuruPE(app_cocoa.DupeGuru):
     def __init__(self):
         app_cocoa.DupeGuru.__init__(self, data, 'dupeGuru Picture Edition', appid=5)
-        self.scanner.match_factory = matchbase.AsyncMatchFactory()
+        self.scanner = ScannerPE()
         self.directories.dirclass = Directory
         self.directories.special_dirclasses[Path('iPhoto Library')] = lambda _, __: self._create_iphoto_library()
         p = op.join(self.appdata, 'cached_pictures.db')
-        self.scanner.match_factory.cached_blocks = Cache(p)
+        self.scanner.cached_blocks = Cache(p)
 
     def _create_iphoto_library(self):
         ud = NSUserDefaults.standardUserDefaults()
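Note on the hunks above: the Cocoa app no longer hangs an AsyncMatchFactory off a generic scanner; it swaps in a picture-aware scanner wholesale and configures it directly. A minimal sketch of the new wiring, assuming the dupeguru_pe package is importable (the appdata path is illustrative; ScannerPE, Cache, and the attribute names all come from this diff):

    import os.path as op

    from dupeguru_pe.cache import Cache
    from dupeguru_pe.scanner import ScannerPE

    scanner = ScannerPE()
    scanner.cached_blocks = Cache(op.join('/path/to/appdata', 'cached_pictures.db'))
    scanner.threshold = 75       # class defaults, see pe/py/scanner.py below
    scanner.match_scaled = False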
diff --git a/pe/py/matchbase.py b/pe/py/matchbase.py
index 34966a50..3490620b 100644
--- a/pe/py/matchbase.py
+++ b/pe/py/matchbase.py
@@ -20,58 +20,42 @@ from .block import avgdiff, DifferentBlockCountError, NoBlocksError
 from .cache import Cache
 
 MIN_ITERATIONS = 3
+BLOCK_COUNT_PER_SIDE = 15
 # Enough so that we're sure that the main thread will not wait after a result.get() call
 # cpucount*2 should be enough to be sure that the spawned process will not wait after the results
 # collection made by the main process.
 RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() * 2
 
-def get_match(first,second,percentage):
+def prepare_pictures(pictures, cached_blocks, j=job.nulljob):
+    # The MemoryError handlers in there use logging without first caring about whether or not
+    # there is enough memory left to carry on the operation because it is assumed that the
+    # MemoryError happens when trying to read an image file, which is freed from memory by the
+    # time that MemoryError is raised.
+    prepared = [] # only pictures for which there was no error getting blocks
+    try:
+        for picture in j.iter_with_progress(pictures, 'Analyzed %d/%d pictures'):
+            picture.dimensions
+            picture.unicode_path = unicode(picture.path)
+            try:
+                if picture.unicode_path not in cached_blocks:
+                    blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
+                    cached_blocks[picture.unicode_path] = blocks
+                prepared.append(picture)
+            except IOError as e:
+                logging.warning(unicode(e))
+            except MemoryError:
+                logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
+                if picture.size < 10 * 1024 * 1024: # We're really running out of memory
+                    raise
+    except MemoryError:
+        logging.warning('Ran out of memory while preparing pictures')
+    return prepared
+
+def get_match(first, second, percentage):
     if percentage < 0:
         percentage = 0
-    return Match(first,second,percentage)
-
-class MatchFactory(object):
-    cached_blocks = None
-    block_count_per_side = 15
-    threshold = 75
-    match_scaled = False
-
-    def _do_getmatches(self, files, j):
-        raise NotImplementedError()
-
-    def getmatches(self, files, j=job.nulljob):
-        # The MemoryError handlers in there use logging without first caring about whether or not
-        # there is enough memory left to carry on the operation because it is assumed that the
-        # MemoryError happens when trying to read an image file, which is freed from memory by the
-        # time that MemoryError is raised.
-        j = j.start_subjob([3, 7])
-        logging.info('Preparing %d files' % len(files))
-        prepared = self.prepare_files(files, j)
-        logging.info('Finished preparing %d files' % len(prepared))
-        return self._do_getmatches(prepared, j)
-
-    def prepare_files(self, files, j=job.nulljob):
-        prepared = [] # only files for which there was no error getting blocks
-        try:
-            for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
-                picture.dimensions
-                picture.unicode_path = unicode(picture.path)
-                try:
-                    if picture.unicode_path not in self.cached_blocks:
-                        blocks = picture.get_blocks(self.block_count_per_side)
-                        self.cached_blocks[picture.unicode_path] = blocks
-                    prepared.append(picture)
-                except IOError as e:
-                    logging.warning(unicode(e))
-                except MemoryError:
-                    logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
-                    if picture.size < 10 * 1024 * 1024: # We're really running out of memory
-                        raise
-        except MemoryError:
-            logging.warning('Ran out of memory while preparing files')
-        return prepared
-
+    return Match(first, second, percentage)
 
 
 def async_compare(ref_id, other_ids, dbname, threshold):
     cache = Cache(dbname, threaded=False)
@@ -89,53 +73,53 @@ def async_compare(ref_id, other_ids, dbname, threshold):
             results.append((ref_id, other_id, percentage))
     cache.con.close()
     return results
 
-
-class AsyncMatchFactory(MatchFactory):
-    def _do_getmatches(self, pictures, j):
-        def empty_out_queue(queue, into):
-            try:
-                while True:
-                    into.append(queue.get(block=False))
-            except Empty:
-                pass
-
-        j = j.start_subjob([9, 1], 'Preparing for matching')
-        cache = self.cached_blocks
-        id2picture = {}
-        dimensions2pictures = defaultdict(set)
-        for picture in pictures:
-            try:
-                picture.cache_id = cache.get_id(picture.unicode_path)
-                id2picture[picture.cache_id] = picture
-                if not self.match_scaled:
-                    dimensions2pictures[picture.dimensions].add(picture)
-            except ValueError:
-                pass
-        pictures = [p for p in pictures if hasattr(p, 'cache_id')]
-        pool = multiprocessing.Pool()
-        async_results = []
-        matches = []
-        pictures_copy = set(pictures)
-        for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):
-            others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
-            others.remove(ref)
-            if others:
-                cache_ids = [f.cache_id for f in others]
-                args = (ref.cache_id, cache_ids, self.cached_blocks.dbname, self.threshold)
-                async_results.append(pool.apply_async(async_compare, args))
-                if len(async_results) > RESULTS_QUEUE_LIMIT:
-                    result = async_results.pop(0)
-                    matches.extend(result.get())
-
-        result = []
-        for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
-            ref = id2picture[ref_id]
-            other = id2picture[other_id]
-            if percentage == 100 and ref.md5 != other.md5:
-                percentage = 99
-            if percentage >= self.threshold:
-                result.append(get_match(ref, other, percentage))
-        return result
+def getmatches(pictures, cached_blocks, threshold=75, match_scaled=False, j=job.nulljob):
+    def empty_out_queue(queue, into):
+        try:
+            while True:
+                into.append(queue.get(block=False))
+        except Empty:
+            pass
+
+    j = j.start_subjob([3, 7])
+    pictures = prepare_pictures(pictures, cached_blocks, j)
+    j = j.start_subjob([9, 1], 'Preparing for matching')
+    cache = cached_blocks
+    id2picture = {}
+    dimensions2pictures = defaultdict(set)
+    for picture in pictures:
+        try:
+            picture.cache_id = cache.get_id(picture.unicode_path)
+            id2picture[picture.cache_id] = picture
+            if not match_scaled:
+                dimensions2pictures[picture.dimensions].add(picture)
+        except ValueError:
+            pass
+    pictures = [p for p in pictures if hasattr(p, 'cache_id')]
+    pool = multiprocessing.Pool()
+    async_results = []
+    matches = []
+    pictures_copy = set(pictures)
+    for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):
+        others = pictures_copy if match_scaled else dimensions2pictures[ref.dimensions]
+        others.remove(ref)
+        if others:
+            cache_ids = [f.cache_id for f in others]
+            args = (ref.cache_id, cache_ids, cached_blocks.dbname, threshold)
+            async_results.append(pool.apply_async(async_compare, args))
+            if len(async_results) > RESULTS_QUEUE_LIMIT:
+                result = async_results.pop(0)
+                matches.extend(result.get())
+
+    result = []
+    for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
+        ref = id2picture[ref_id]
+        other = id2picture[other_id]
+        if percentage == 100 and ref.md5 != other.md5:
+            percentage = 99
+        if percentage >= threshold:
+            result.append(get_match(ref, other, percentage))
+    return result
 
 multiprocessing.freeze_support()
\ No newline at end of file
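The getmatches() rewrite above keeps AsyncMatchFactory's scheduling pattern: once more than RESULTS_QUEUE_LIMIT apply_async() results are outstanding, the oldest is popped and collected, so the backlog of pending results stays bounded. A self-contained sketch of that pattern (not dupeGuru code; square() stands in for async_compare()):

    import multiprocessing

    RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() * 2

    def square(n):
        return n * n

    if __name__ == '__main__':
        pool = multiprocessing.Pool()
        async_results = []
        collected = []
        for n in range(100):
            async_results.append(pool.apply_async(square, (n,)))
            if len(async_results) > RESULTS_QUEUE_LIMIT:
                # Pop the oldest result; get() blocks only if that job is
                # still running, keeping the in-flight count bounded.
                collected.append(async_results.pop(0).get())
        collected.extend(r.get() for r in async_results)  # drain the tail
        pool.close()
        pool.join()
        assert collected == [n * n for n in range(100)]

Two things worth flagging in the hunk itself: getmatches() appears to drain the backlog only inside the loop, so results still pending after the last picture are never collected into matches (the sketch's explicit tail drain shows the missing step), and the inner empty_out_queue() helper is carried over but never called.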
diff --git a/pe/py/scanner.py b/pe/py/scanner.py
new file mode 100644
index 00000000..b25f0011
--- /dev/null
+++ b/pe/py/scanner.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+# Created By: Virgil Dupras
+# Created On: 2009-10-18
+# $Id$
+# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
+#
+# This software is licensed under the "HS" License as described in the "LICENSE" file,
+# which should be included with this package. The terms are also available at
+# http://www.hardcoded.net/licenses/hs_license
+
+from dupeguru.scanner import Scanner
+
+from . import matchbase
+
+class ScannerPE(Scanner):
+    cached_blocks = None
+    match_scaled = False
+    threshold = 75
+
+    def _getmatches(self, files, j):
+        return matchbase.getmatches(files, self.cached_blocks, self.threshold, self.match_scaled, j)
+
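pe/py/scanner.py above is the pivot of the refactoring: match configuration (cached_blocks, threshold, match_scaled) now lives on the scanner itself, and the base Scanner only needs the _getmatches() hook. A hedged sketch of that contract; only _getmatches and the three attributes come from this diff, and the base class's driver method name below is illustrative, not dupeGuru's API:

    class SketchScanner(object):
        def scan(self, files, j):
            matches = self._getmatches(files, j)  # subclass-provided hook
            # ... the real Scanner goes on to group and prioritize matches
            return matches

        def _getmatches(self, files, j):
            raise NotImplementedError()

    class SketchScannerPE(SketchScanner):
        # Attributes that used to live on MatchFactory.
        cached_blocks = None   # a Cache instance in real use
        match_scaled = False
        threshold = 75

        def _getmatches(self, files, j):
            # The real ScannerPE delegates to matchbase.getmatches(),
            # passing its own attributes; stubbed out here.
            return []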
diff --git a/pe/qt/app.py b/pe/qt/app.py
index dced3764..cc6296c7 100644
--- a/pe/qt/app.py
+++ b/pe/qt/app.py
@@ -17,7 +17,7 @@ from hsutil.str import get_file_ext
 
 from dupeguru_pe import data as data_pe
 from dupeguru_pe.cache import Cache
-from dupeguru_pe.matchbase import AsyncMatchFactory
+from dupeguru_pe.scanner import ScannerPE
 
 from block import getblocks
 from base.app import DupeGuru as DupeGuruBase
@@ -63,15 +63,15 @@ class DupeGuru(DupeGuruBase):
         DupeGuruBase.__init__(self, data_pe, appid=5)
 
     def _setup(self):
-        self.scanner.match_factory = AsyncMatchFactory()
+        self.scanner = ScannerPE()
         self.directories.dirclass = Directory
-        self.scanner.match_factory.cached_blocks = Cache(op.join(self.appdata, 'cached_pictures.db'))
+        self.scanner.cached_blocks = Cache(op.join(self.appdata, 'cached_pictures.db'))
         DupeGuruBase._setup(self)
 
     def _update_options(self):
         DupeGuruBase._update_options(self)
-        self.scanner.match_factory.match_scaled = self.prefs.match_scaled
-        self.scanner.match_factory.threshold = self.prefs.filter_hardness
+        self.scanner.match_scaled = self.prefs.match_scaled
+        self.scanner.threshold = self.prefs.filter_hardness
 
     def _create_details_dialog(self, parent):
         return DetailsDialog(parent, self)
diff --git a/pe/qt/main_window.py b/pe/qt/main_window.py
index e0ab90b1..f3d7d990 100644
--- a/pe/qt/main_window.py
+++ b/pe/qt/main_window.py
@@ -23,6 +23,6 @@ class MainWindow(MainWindowBase):
 
         title = "Clear Picture Cache"
         msg = "Do you really want to remove all your cached picture analysis?"
        if self._confirm(title, msg, QMessageBox.No):
-            self.app.scanner.match_factory.cached_blocks.clear()
+            self.app.scanner.cached_blocks.clear()
             QMessageBox.information(self, title, "Picture cache cleared.")
\ No newline at end of file
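A last note on the verification step in matchbase.getmatches(): block comparison can report 100 for pictures whose pixels average out identically even when the files are not byte-identical, so a 100% match with differing md5 digests is demoted to 99, presumably reserving 100 for byte-for-byte duplicates. A tiny self-contained illustration of that rule (the digests are made up):

    def verified_percentage(percentage, ref_md5, other_md5):
        if percentage == 100 and ref_md5 != other_md5:
            return 99
        return percentage

    assert verified_percentage(100, 'aaa1', 'bbb2') == 99   # same blocks, different bytes
    assert verified_percentage(100, 'aaa1', 'aaa1') == 100  # true exact duplicate
    assert verified_percentage(87, 'aaa1', 'bbb2') == 87    # below 100, left unchanged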