Moving the 'py' folder into 'base'.

--HG-- rename : py/__init__.py => base/py/__init__.py rename : py/app.py => base/py/app.py rename : py/app_cocoa.py => base/py/app_cocoa.py rename : py/app_me_cocoa.py => base/py/app_me_cocoa.py rename : py/app_pe_cocoa.py => base/py/app_pe_cocoa.py rename : py/app_se_cocoa.py => base/py/app_se_cocoa.py rename : py/data.py => base/py/data.py rename : py/data_me.py => base/py/data_me.py rename : py/data_pe.py => base/py/data_pe.py rename : py/directories.py => base/py/directories.py rename : py/engine.py => base/py/engine.py rename : py/export.py => base/py/export.py rename : py/gen.py => base/py/gen.py rename : py/ignore.py => base/py/ignore.py rename : py/modules/block/block.pyx => base/py/modules/block/block.pyx rename : py/modules/block/setup.py => base/py/modules/block/setup.py rename : py/modules/cache/cache.pyx => base/py/modules/cache/cache.pyx rename : py/modules/cache/setup.py => base/py/modules/cache/setup.py rename : py/picture/__init__.py => base/py/picture/__init__.py rename : py/picture/block.py => base/py/picture/block.py rename : py/picture/cache.py => base/py/picture/cache.py rename : py/picture/matchbase.py => base/py/picture/matchbase.py rename : py/results.py => base/py/results.py rename : py/scanner.py => base/py/scanner.py rename : py/tests/__init__.py => base/py/tests/__init__.py rename : py/tests/app_cocoa_test.py => base/py/tests/app_cocoa_test.py rename : py/tests/app_test.py => base/py/tests/app_test.py rename : py/tests/block_test.py => base/py/tests/block_test.py rename : py/tests/cache_test.py => base/py/tests/cache_test.py rename : py/tests/directories_test.py => base/py/tests/directories_test.py rename : py/tests/engine_test.py => base/py/tests/engine_test.py rename : py/tests/export_test.py => base/py/tests/export_test.py rename : py/tests/ignore_test.py => base/py/tests/ignore_test.py rename : py/tests/results_test.py => base/py/tests/results_test.py rename : py/tests/scanner_test.py => base/py/tests/scanner_test.py extra : 
convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4074
2026-04-26 22:21:37 +00:00 · 2009-06-18 18:55:00 +00:00
parent 87a56bb537
commit d1f460b091
35 changed files with 0 additions and 0 deletions
--- a/base/py/picture/__init__.py
+++ b/base/py/picture/__init__.py
--- a/base/py/picture/block.py
+++ b/base/py/picture/block.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+"""
+Unit Name: hs.picture.block
+Created By: Virgil Dupras
+Created On: 2006/09/01
+Last modified by:$Author: virgil $
+Last modified on:$Date: 2009-05-26 18:12:39 +0200 (Tue, 26 May 2009) $
+                 $Revision: 4365 $
+Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
+"""
+from _block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2
+
+# Converted to Cython
+# def getblock(image):
+#     """Returns a 3 sized tuple containing the mean color of 'image'.
+#     
+#     image: a PIL image or crop.
+#     """
+#     if image.size[0]:
+#         pixel_count = image.size[0] * image.size[1]
+#         red = green = blue = 0
+#         for r,g,b in image.getdata():
+#             red += r
+#             green += g
+#             blue += b
+#         return (red // pixel_count, green // pixel_count, blue // pixel_count)
+#     else:
+#         return (0,0,0)
+
+# This is not used anymore
+# def getblocks(image,blocksize):
+#     """Returns a list of blocks (3 sized tuples).
+#     
+#     image: A PIL image to base the blocks on.
+#     blocksize: The size of the blocks to be created. This is a single integer, defining
+#         both width and height (blocks are square).
+#     """
+#     if min(image.size) < blocksize:
+#         return ()
+#     result = []
+#     for i in xrange(image.size[1] // blocksize):
+#         for j in xrange(image.size[0] // blocksize):
+#             box = (blocksize * j, blocksize * i, blocksize * (j + 1), blocksize * (i + 1))
+#             crop = image.crop(box)
+#             result.append(getblock(crop))
+#     return result
+
+# Converted to Cython
+# def getblocks2(image,block_count_per_side):
+#     """Returns a list of blocks (3 sized tuples).
+#     
+#     image: A PIL image to base the blocks on.
+#     block_count_per_side: This integer determines the number of blocks the function will return.
+#     If it is 10, for example, 100 blocks will be returned (10 width, 10 height). The blocks will not
+#     necessarily cover square areas. The area covered by each block will be proportional to the image
+#     itself.
+#     """
+#     if not image.size[0]:
+#         return []
+#     width,height = image.size
+#     block_width = max(width // block_count_per_side,1)
+#     block_height = max(height // block_count_per_side,1)
+#     result = []
+#     for ih in range(block_count_per_side):
+#         top = min(ih * block_height, height - block_height)
+#         bottom = top + block_height
+#         for iw in range(block_count_per_side):
+#             left = min(iw * block_width, width - block_width)
+#             right = left + block_width
+#             box = (left,top,right,bottom)
+#             crop = image.crop(box)
+#             result.append(getblock(crop))
+#     return result
+
+# Converted to Cython
+# def diff(first, second):
+#     """Returns the difference between the first block and the second.
+#     
+#     It returns an absolute sum of the 3 differences (RGB).
+#     """
+#     r1, g1, b1 = first
+#     r2, g2, b2 = second
+#     return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)
+
+# Converted to Cython
+# def avgdiff(first, second, limit=768, min_iterations=1):
+#     """Returns the average diff between first blocks and seconds.
+#     
+#     If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
+#     iterations have been made in the blocks.
+#     """
+#     if len(first) != len(second):
+#         raise DifferentBlockCountError
+#     if not first:
+#         raise NoBlocksError
+#     count = len(first)
+#     sum = 0
+#     zipped = izip(xrange(1, count + 1), first, second)
+#     for i, first, second in zipped:
+#         sum += diff(first, second)
+#         if sum > limit * i and i >= min_iterations:
+#             return limit + 1
+#     result = sum // count
+#     if (not result) and sum:
+#         result = 1
+#     return result
+
+# This is not used anymore
+# def maxdiff(first,second,limit=768):
+#     """Returns the max diff between first blocks and seconds.
+#     
+#     If the result surpasses limit, the first max being over limit is returned.
+#     """
+#     if len(first) != len(second):
+#         raise DifferentBlockCountError
+#     if not first:
+#         raise NoBlocksError
+#     result = 0
+#     zipped = zip(first,second)
+#     for first,second in zipped:
+#         result = max(result,diff(first,second))
+#         if result > limit:
+#             return result
+#     return result
--- a/base/py/picture/cache.py
+++ b/base/py/picture/cache.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+"""
+Unit Name: hs.picture.cache
+Created By: Virgil Dupras
+Created On: 2006/09/14
+Last modified by:$Author: virgil $
+Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
+                 $Revision: 4392 $
+Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
+"""
+import os
+import logging
+import sqlite3 as sqlite
+
+import hsutil.sqlite
+
+from _cache import string_to_colors
+
+def colors_to_string(colors):
+    """Transform the 3 sized tuples 'colors' into a hex string.
+    
+    [(0,100,255)] --> 0064ff
+    [(1,2,3),(4,5,6)] --> 010203040506
+    """
+    return ''.join(['%02x%02x%02x' % (r,g,b) for r,g,b in colors])
+
+# This function is an important bottleneck of dupeGuru PE. It has been converted to Cython.
+# def string_to_colors(s):
+#     """Transform the string 's' in a list of 3 sized tuples.
+#     """
+#     result = []
+#     for i in xrange(0, len(s), 6):
+#         number = int(s[i:i+6], 16)
+#         result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
+#     return result
+
+class Cache(object):
+    """A class to cache picture blocks.
+    """
+    def __init__(self, db=':memory:', threaded=True):
+        def create_tables():
+            sql = "create table pictures(path TEXT, blocks TEXT)"
+            self.con.execute(sql);
+            sql = "create index idx_path on pictures (path)"
+            self.con.execute(sql)
+        
+        self.dbname = db
+        if threaded:
+            self.con = hsutil.sqlite.ThreadedConn(db, True)
+        else:
+            self.con = sqlite.connect(db, isolation_level=None)
+        try:
+            self.con.execute("select * from pictures where 1=2")
+        except sqlite.OperationalError: # new db
+            create_tables()
+        except sqlite.DatabaseError, e: # corrupted db
+            logging.warning('Could not create picture cache because of an error: %s', str(e))
+            self.con.close()
+            os.remove(db)
+            if threaded:
+                self.con = hsutil.sqlite.ThreadedConn(db, True)
+            else:
+                self.con = sqlite.connect(db, isolation_level=None)
+            create_tables()
+    
+    def __contains__(self, key):
+        sql = "select count(*) from pictures where path = ?"
+        result = self.con.execute(sql, [key]).fetchall()
+        return result[0][0] > 0
+    
+    def __delitem__(self, key):
+        if key not in self:
+            raise KeyError(key)
+        sql = "delete from pictures where path = ?"
+        self.con.execute(sql, [key])
+    
+    # Optimized
+    def __getitem__(self, key):
+        if isinstance(key, int):
+            sql = "select blocks from pictures where rowid = ?"
+        else:
+            sql = "select blocks from pictures where path = ?"
+        result = self.con.execute(sql, [key]).fetchone()
+        if result:
+            result = string_to_colors(result[0])
+            return result
+        else:
+            raise KeyError(key)
+    
+    def __iter__(self):
+        sql = "select path from pictures"
+        result = self.con.execute(sql)
+        return (row[0] for row in result)
+    
+    def __len__(self):
+        sql = "select count(*) from pictures"
+        result = self.con.execute(sql).fetchall()
+        return result[0][0]
+    
+    def __setitem__(self, key, value):
+        value = colors_to_string(value)
+        if key in self:
+            sql = "update pictures set blocks = ? where path = ?"
+        else:
+            sql = "insert into pictures(blocks,path) values(?,?)"
+        try:
+            self.con.execute(sql, [value, key])
+        except sqlite.OperationalError:
+            logging.warning('Picture cache could not set %r for key %r', value, key)
+        except sqlite.DatabaseError, e:
+            logging.warning('DatabaseError while setting %r for key %r: %s', value, key, str(e))
+    
+    def clear(self):
+        sql = "delete from pictures"
+        self.con.execute(sql)
+    
+    def filter(self, func):
+        to_delete = [key for key in self if not func(key)]
+        for key in to_delete:
+            del self[key]
+    
+    def get_id(self, path):
+        sql = "select rowid from pictures where path = ?"
+        result = self.con.execute(sql, [path]).fetchone()
+        if result:
+            return result[0]
+        else:
+            raise ValueError(path)
+    
+    def get_multiple(self, rowids):
+        sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids))
+        cur = self.con.execute(sql)
+        return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
+    
--- a/base/py/picture/matchbase.py
+++ b/base/py/picture/matchbase.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+"""
+Unit Name: hs.picture._match
+Created By: Virgil Dupras
+Created On: 2007/02/25
+Last modified by:$Author: virgil $
+Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
+                 $Revision: 4388 $
+Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
+"""
+import logging
+import multiprocessing
+from Queue import Empty
+from collections import defaultdict
+
+from hsutil import job
+from hsutil.misc import dedupe
+
+from dupeguru.engine import Match
+from block import avgdiff, DifferentBlockCountError, NoBlocksError
+from cache import Cache
+
+# Passed to avgdiff() as its min_iterations argument (see async_compare).
+MIN_ITERATIONS = 3
+
+def get_match(first,second,percentage):
+    if percentage < 0:
+        percentage = 0
+    return Match(first,second,percentage)
+
+class MatchFactory(object):
+    cached_blocks = None
+    block_count_per_side = 15
+    threshold = 75
+    match_scaled = False
+    
+    def _do_getmatches(self, files, j):
+        raise NotImplementedError()
+    
+    def getmatches(self, files, j=job.nulljob):
+        # The MemoryError handlers in there use logging without first caring about whether or not
+        # there is enough memory left to carry on the operation because it is assumed that the
+        # MemoryError happens when trying to read an image file, which is freed from memory by the
+        # time that MemoryError is raised.
+        j = j.start_subjob([2, 8])
+        logging.info('Preparing %d files' % len(files))
+        prepared = self.prepare_files(files, j)
+        logging.info('Finished preparing %d files' % len(prepared))
+        return self._do_getmatches(prepared, j)
+    
+    def prepare_files(self, files, j=job.nulljob):
+        prepared = [] # only files for which there was no error getting blocks
+        try:
+            for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
+                picture.dimensions
+                picture.unicode_path = unicode(picture.path)
+                try:
+                    if picture.unicode_path not in self.cached_blocks:
+                        blocks = picture.get_blocks(self.block_count_per_side)
+                        self.cached_blocks[picture.unicode_path] = blocks
+                    prepared.append(picture)
+                except IOError as e:
+                    logging.warning(unicode(e))
+                except MemoryError:
+                    logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
+                    if picture.size < 10 * 1024 * 1024: # We're really running out of memory
+                        raise
+        except MemoryError:
+            logging.warning('Ran out of memory while preparing files')
+        return prepared
+    
+
+def async_compare(ref_id, other_ids, dbname, threshold):
+    cache = Cache(dbname, threaded=False)
+    limit = 100 - threshold
+    ref_blocks = cache[ref_id]
+    pairs = cache.get_multiple(other_ids)
+    results = []
+    for other_id, other_blocks in pairs:
+        try:
+            diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
+            percentage = 100 - diff
+        except (DifferentBlockCountError, NoBlocksError):
+            percentage = 0
+        if percentage >= threshold:
+            results.append((ref_id, other_id, percentage))
+    cache.con.close()
+    return results
+
+class AsyncMatchFactory(MatchFactory):
+    def _do_getmatches(self, pictures, j):
+        def empty_out_queue(queue, into):
+            try:
+                while True:
+                    into.append(queue.get(block=False))
+            except Empty:
+                pass
+        
+        j = j.start_subjob([1, 8, 1], 'Preparing for matching')
+        cache = self.cached_blocks
+        id2picture = {}
+        dimensions2pictures = defaultdict(set)
+        for picture in pictures[:]:
+            try:
+                picture.cache_id = cache.get_id(picture.unicode_path)
+                id2picture[picture.cache_id] = picture
+            except ValueError:
+                pictures.remove(picture)
+            if not self.match_scaled:
+                dimensions2pictures[picture.dimensions].add(picture)
+        pool = multiprocessing.Pool()
+        async_results = []
+        pictures_copy = set(pictures)
+        for ref in j.iter_with_progress(pictures):
+            others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
+            others.remove(ref)
+            if others:
+                cache_ids = [f.cache_id for f in others]
+                args = (ref.cache_id, cache_ids, self.cached_blocks.dbname, self.threshold)
+                async_results.append(pool.apply_async(async_compare, args))
+        
+        matches = []
+        for result in j.iter_with_progress(async_results, 'Matched %d/%d pictures'):
+            matches.extend(result.get())
+        
+        result = []
+        for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
+            ref = id2picture[ref_id]
+            other = id2picture[other_id]
+            if percentage == 100 and ref.md5 != other.md5:
+                percentage = 99
+            if percentage >= self.threshold:
+                result.append(get_match(ref, other, percentage))
+        return result
+    
+
+multiprocessing.freeze_support()