1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2026-02-24 20:51:38 +00:00

Moving the 'py' folder into 'base'.

--HG--
rename : py/__init__.py => base/py/__init__.py
rename : py/app.py => base/py/app.py
rename : py/app_cocoa.py => base/py/app_cocoa.py
rename : py/app_me_cocoa.py => base/py/app_me_cocoa.py
rename : py/app_pe_cocoa.py => base/py/app_pe_cocoa.py
rename : py/app_se_cocoa.py => base/py/app_se_cocoa.py
rename : py/data.py => base/py/data.py
rename : py/data_me.py => base/py/data_me.py
rename : py/data_pe.py => base/py/data_pe.py
rename : py/directories.py => base/py/directories.py
rename : py/engine.py => base/py/engine.py
rename : py/export.py => base/py/export.py
rename : py/gen.py => base/py/gen.py
rename : py/ignore.py => base/py/ignore.py
rename : py/modules/block/block.pyx => base/py/modules/block/block.pyx
rename : py/modules/block/setup.py => base/py/modules/block/setup.py
rename : py/modules/cache/cache.pyx => base/py/modules/cache/cache.pyx
rename : py/modules/cache/setup.py => base/py/modules/cache/setup.py
rename : py/picture/__init__.py => base/py/picture/__init__.py
rename : py/picture/block.py => base/py/picture/block.py
rename : py/picture/cache.py => base/py/picture/cache.py
rename : py/picture/matchbase.py => base/py/picture/matchbase.py
rename : py/results.py => base/py/results.py
rename : py/scanner.py => base/py/scanner.py
rename : py/tests/__init__.py => base/py/tests/__init__.py
rename : py/tests/app_cocoa_test.py => base/py/tests/app_cocoa_test.py
rename : py/tests/app_test.py => base/py/tests/app_test.py
rename : py/tests/block_test.py => base/py/tests/block_test.py
rename : py/tests/cache_test.py => base/py/tests/cache_test.py
rename : py/tests/directories_test.py => base/py/tests/directories_test.py
rename : py/tests/engine_test.py => base/py/tests/engine_test.py
rename : py/tests/export_test.py => base/py/tests/export_test.py
rename : py/tests/ignore_test.py => base/py/tests/ignore_test.py
rename : py/tests/results_test.py => base/py/tests/results_test.py
rename : py/tests/scanner_test.py => base/py/tests/scanner_test.py
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4074
This commit is contained in:
hsoft
2009-06-18 18:55:00 +00:00
parent 87a56bb537
commit d1f460b091
35 changed files with 0 additions and 0 deletions

View File

124
base/py/picture/block.py Normal file
View File

@@ -0,0 +1,124 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture.block
Created By: Virgil Dupras
Created On: 2006/09/01
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-26 18:12:39 +0200 (Tue, 26 May 2009) $
$Revision: 4365 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
from _block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2
# Converted to Cython
# def getblock(image):
# """Returns a 3 sized tuple containing the mean color of 'image'.
#
# image: a PIL image or crop.
# """
# if image.size[0]:
# pixel_count = image.size[0] * image.size[1]
# red = green = blue = 0
# for r,g,b in image.getdata():
# red += r
# green += g
# blue += b
# return (red // pixel_count, green // pixel_count, blue // pixel_count)
# else:
# return (0,0,0)
# This is not used anymore
# def getblocks(image,blocksize):
# """Returns a list of blocks (3 sized tuples).
#
# image: A PIL image to base the blocks on.
# blocksize: The size of the blocks to be created. This is a single integer, defining
# both width and height (blocks are square).
# """
# if min(image.size) < blocksize:
# return ()
# result = []
# for i in xrange(image.size[1] // blocksize):
# for j in xrange(image.size[0] // blocksize):
# box = (blocksize * j, blocksize * i, blocksize * (j + 1), blocksize * (i + 1))
# crop = image.crop(box)
# result.append(getblock(crop))
# return result
# Converted to Cython
# def getblocks2(image,block_count_per_side):
# """Returns a list of blocks (3 sized tuples).
#
# image: A PIL image to base the blocks on.
# block_count_per_side: This integer determines the number of blocks the function will return.
# If it is 10, for example, 100 blocks will be returned (10 width, 10 height). The blocks will not
# necessarily cover square areas. The area covered by each block will be proportional to the image
# itself.
# """
# if not image.size[0]:
# return []
# width,height = image.size
# block_width = max(width // block_count_per_side,1)
# block_height = max(height // block_count_per_side,1)
# result = []
# for ih in range(block_count_per_side):
# top = min(ih * block_height, height - block_height)
# bottom = top + block_height
# for iw in range(block_count_per_side):
# left = min(iw * block_width, width - block_width)
# right = left + block_width
# box = (left,top,right,bottom)
# crop = image.crop(box)
# result.append(getblock(crop))
# return result
# Converted to Cython
# def diff(first, second):
# """Returns the difference between the first block and the second.
#
# It returns an absolute sum of the 3 differences (RGB).
# """
# r1, g1, b1 = first
# r2, g2, b2 = second
# return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)
# Converted to Cython
# def avgdiff(first, second, limit=768, min_iterations=1):
# """Returns the average diff between first blocks and seconds.
#
# If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
# iterations have been made in the blocks.
# """
# if len(first) != len(second):
# raise DifferentBlockCountError
# if not first:
# raise NoBlocksError
# count = len(first)
# sum = 0
# zipped = izip(xrange(1, count + 1), first, second)
# for i, first, second in zipped:
# sum += diff(first, second)
# if sum > limit * i and i >= min_iterations:
# return limit + 1
# result = sum // count
# if (not result) and sum:
# result = 1
# return result
# This is not used anymore
# def maxdiff(first,second,limit=768):
# """Returns the max diff between first blocks and seconds.
#
# If the result surpasses limit, the first max being over limit is returned.
# """
# if len(first) != len(second):
# raise DifferentBlockCountError
# if not first:
# raise NoBlocksError
# result = 0
# zipped = zip(first,second)
# for first,second in zipped:
# result = max(result,diff(first,second))
# if result > limit:
# return result
# return result

134
base/py/picture/cache.py Normal file
View File

@@ -0,0 +1,134 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture.cache
Created By: Virgil Dupras
Created On: 2006/09/14
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import os
import logging
import sqlite3 as sqlite
import hsutil.sqlite
from _cache import string_to_colors
def colors_to_string(colors):
    """Transform the 3 sized tuples 'colors' into a hex string.

    Each (r, g, b) tuple contributes six lowercase hex digits, two per
    component, in the order the tuples appear. An empty iterable gives ''.

    >>> colors_to_string([(0, 100, 255)])
    '0064ff'
    >>> colors_to_string([(1, 2, 3), (4, 5, 6)])
    '010203040506'
    """
    return ''.join('%02x%02x%02x' % (r, g, b) for r, g, b in colors)
# This function is an important bottleneck of dupeGuru PE. It has been converted to Cython.
# def string_to_colors(s):
# """Transform the string 's' in a list of 3 sized tuples.
# """
# result = []
# for i in xrange(0, len(s), 6):
# number = int(s[i:i+6], 16)
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
# return result
class Cache(object):
    """A class to cache picture blocks.

    Blocks are stored in an SQLite table ``pictures(path TEXT, blocks TEXT)``
    indexed on ``path``. The object behaves like a mapping from a picture
    path to its list of colour blocks; values are serialised with
    ``colors_to_string`` on write and decoded with ``string_to_colors`` on
    read.
    """

    def __init__(self, db=':memory:', threaded=True):
        """Open (or create) the cache database.

        db: path of the SQLite file, or ':memory:'.
        threaded: when true the connection is a hsutil.sqlite.ThreadedConn,
            otherwise a plain sqlite3 connection.

        A database lacking the ``pictures`` table gets it created. A database
        that cannot be read at all is logged, deleted from disk and recreated.
        """
        self.dbname = db
        self.con = self._connect(db, threaded)
        try:
            # Cheap probe: returns no rows, fails if the table is missing.
            self.con.execute("select * from pictures where 1=2")
        except sqlite.OperationalError: # new db
            self._create_tables()
        except sqlite.DatabaseError as e: # corrupted db
            logging.warning('Could not create picture cache because of an error: %s', str(e))
            self.con.close()
            os.remove(db)
            self.con = self._connect(db, threaded)
            self._create_tables()

    @staticmethod
    def _connect(db, threaded):
        """Return a new connection to 'db' of the kind selected by 'threaded'."""
        if threaded:
            # NOTE(review): the meaning of ThreadedConn's second argument is
            # not visible from this file -- confirm against hsutil.sqlite.
            return hsutil.sqlite.ThreadedConn(db, True)
        # isolation_level=None puts the sqlite3 connection in autocommit mode.
        return sqlite.connect(db, isolation_level=None)

    def _create_tables(self):
        """Create the ``pictures`` table and its index on ``path``."""
        self.con.execute("create table pictures(path TEXT, blocks TEXT)")
        self.con.execute("create index idx_path on pictures (path)")

    def __contains__(self, key):
        """Return True if a row exists whose path equals 'key'."""
        sql = "select count(*) from pictures where path = ?"
        result = self.con.execute(sql, [key]).fetchall()
        return result[0][0] > 0

    def __delitem__(self, key):
        """Delete the row(s) for path 'key'; raise KeyError if there is none."""
        if key not in self:
            raise KeyError(key)
        sql = "delete from pictures where path = ?"
        self.con.execute(sql, [key])

    # Optimized
    def __getitem__(self, key):
        """Return the decoded blocks for 'key'.

        An int key is looked up by rowid, any other key by path.
        Raises KeyError when no row matches.
        """
        if isinstance(key, int):
            sql = "select blocks from pictures where rowid = ?"
        else:
            sql = "select blocks from pictures where path = ?"
        row = self.con.execute(sql, [key]).fetchone()
        if not row:
            raise KeyError(key)
        return string_to_colors(row[0])

    def __iter__(self):
        """Iterate over every stored path."""
        sql = "select path from pictures"
        result = self.con.execute(sql)
        return (row[0] for row in result)

    def __len__(self):
        """Return the number of rows in the cache."""
        sql = "select count(*) from pictures"
        result = self.con.execute(sql).fetchall()
        return result[0][0]

    def __setitem__(self, key, value):
        """Store the colour tuples 'value' under path 'key'.

        An existing row is updated, otherwise one is inserted. Database
        errors are logged as warnings and swallowed: a failed write must not
        abort the caller.
        """
        value = colors_to_string(value)
        if key in self:
            sql = "update pictures set blocks = ? where path = ?"
        else:
            sql = "insert into pictures(blocks,path) values(?,?)"
        try:
            self.con.execute(sql, [value, key])
        except sqlite.OperationalError:
            # Must come first: OperationalError is a DatabaseError subclass.
            logging.warning('Picture cache could not set %r for key %r', value, key)
        except sqlite.DatabaseError as e:
            logging.warning('DatabaseError while setting %r for key %r: %s', value, key, str(e))

    def clear(self):
        """Remove every row from the cache."""
        sql = "delete from pictures"
        self.con.execute(sql)

    def filter(self, func):
        """Delete every path for which func(path) is false."""
        # Collect first so rows are not deleted while the select is iterated.
        to_delete = [key for key in self if not func(key)]
        for key in to_delete:
            del self[key]

    def get_id(self, path):
        """Return the rowid of 'path'; raise ValueError if it is not cached."""
        sql = "select rowid from pictures where path = ?"
        result = self.con.execute(sql, [path]).fetchone()
        if result:
            return result[0]
        else:
            raise ValueError(path)

    def get_multiple(self, rowids):
        """Return a generator of (rowid, blocks) for the given rowids.

        The ids are formatted straight into the SQL text with str(), so they
        must be integers obtained from this cache (e.g. through get_id()),
        never externally supplied strings.
        """
        sql = "select rowid, blocks from pictures where rowid in (%s)" % ','.join(map(str, rowids))
        cur = self.con.execute(sql)
        return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture._match
Created By: Virgil Dupras
Created On: 2007/02/25
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
$Revision: 4388 $
Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
"""
import logging
import multiprocessing
from Queue import Empty
from collections import defaultdict
from hsutil import job
from hsutil.misc import dedupe
from dupeguru.engine import Match
from block import avgdiff, DifferentBlockCountError, NoBlocksError
from cache import Cache
# Passed by async_compare() as the fourth argument of avgdiff(). Going by the
# commented-out reference implementation of avgdiff, this is the number of
# blocks that must be compared before the "over the limit" early exit may fire
# -- confirm against the Cython version actually imported.
MIN_ITERATIONS = 3
def get_match(first, second, percentage):
    """Return a Match for 'first' and 'second'.

    A negative 'percentage' is replaced by 0; any other value is passed to
    Match untouched.
    """
    clamped = 0 if percentage < 0 else percentage
    return Match(first, second, clamped)
class MatchFactory(object):
    """Base class for picture matchers.

    getmatches() makes sure every picture has its blocks in 'cached_blocks'
    and then delegates the actual comparison to _do_getmatches(), which
    subclasses must implement.
    """
    # Mapping from unicode path to blocks; prepare_files() uses `in` and item
    # assignment on it, so it must be set before getmatches() is called.
    cached_blocks = None
    # Argument given to picture.get_blocks() when blocks are not cached yet.
    block_count_per_side = 15
    # Minimum percentage a pair must reach to be reported by subclasses.
    threshold = 75
    # When false, subclasses only compare pictures with equal dimensions.
    match_scaled = False

    def _do_getmatches(self, files, j):
        """Compare the prepared 'files' and return the matches (abstract)."""
        raise NotImplementedError()

    def getmatches(self, files, j=job.nulljob):
        """Prepare 'files', then return what _do_getmatches() yields for them.

        files: picture objects to compare.
        j: job object used for progress reporting.
        """
        # The MemoryError handlers in there use logging without first caring about whether or not
        # there is enough memory left to carry on the operation because it is assumed that the
        # MemoryError happens when trying to read an image file, which is freed from memory by the
        # time that MemoryError is raised.
        j = j.start_subjob([2, 8])
        logging.info('Preparing %d files', len(files))
        prepared = self.prepare_files(files, j)
        logging.info('Finished preparing %d files', len(prepared))
        return self._do_getmatches(prepared, j)

    def prepare_files(self, files, j=job.nulljob):
        """Return the pictures of 'files' whose blocks are available in the cache.

        Blocks are computed and cached for pictures not cached yet. A picture
        that raises IOError is logged and left out. A MemoryError on a
        picture of 10 MB or more is logged and that picture is left out; on a
        smaller picture the whole preparation stops and what has been
        prepared so far is returned.
        """
        prepared = [] # only files for which there was no error getting blocks
        try:
            for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
                # Result discarded on purpose -- presumably forces the
                # dimensions to be read up front; confirm on the picture class.
                picture.dimensions
                picture.unicode_path = unicode(picture.path)
                try:
                    if picture.unicode_path not in self.cached_blocks:
                        blocks = picture.get_blocks(self.block_count_per_side)
                        self.cached_blocks[picture.unicode_path] = blocks
                    prepared.append(picture)
                except IOError as e:
                    logging.warning(unicode(e))
                except MemoryError:
                    logging.warning(u'Ran out of memory while reading %s of size %d',
                                    picture.unicode_path, picture.size)
                    if picture.size < 10 * 1024 * 1024: # We're really running out of memory
                        raise
        except MemoryError:
            logging.warning('Ran out of memory while preparing files')
        return prepared
def async_compare(ref_id, other_ids, dbname, threshold):
    """Compare one cached picture against several others.

    Meant to be run through multiprocessing: it opens its own non-threaded
    Cache on 'dbname' and always closes it before returning or raising.

    ref_id: cache rowid of the reference picture.
    other_ids: cache rowids of the pictures to compare it with.
    dbname: path of the cache database.
    threshold: minimum percentage for a pair to be reported.

    Returns a list of (ref_id, other_id, percentage) tuples for every pair
    reaching 'threshold'. Raises KeyError if 'ref_id' is not in the cache.
    """
    cache = Cache(dbname, threaded=False)
    try:
        limit = 100 - threshold
        ref_blocks = cache[ref_id]
        results = []
        for other_id, other_blocks in cache.get_multiple(other_ids):
            try:
                diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
                percentage = 100 - diff
            except (DifferentBlockCountError, NoBlocksError):
                # Pictures whose blocks cannot be compared count as no match.
                percentage = 0
            if percentage >= threshold:
                results.append((ref_id, other_id, percentage))
        return results
    finally:
        # Release the connection even when a lookup or comparison fails.
        cache.con.close()
class AsyncMatchFactory(MatchFactory):
    """MatchFactory that compares pictures in a multiprocessing pool."""

    def _do_getmatches(self, pictures, j):
        """Return the list of matches found among 'pictures'.

        pictures: prepared pictures (each has 'unicode_path'). Pictures that
            have no entry in the cache are removed from this list in place.
        j: job object used for progress reporting.

        Each picture is compared, through async_compare(), with the pictures
        that come after it; unless 'match_scaled' is set, only with those of
        identical dimensions. A 100% block match between files whose md5
        differ is downgraded to 99%.
        """
        j = j.start_subjob([1, 8, 1], 'Preparing for matching')
        cache = self.cached_blocks
        id2picture = {}
        dimensions2pictures = defaultdict(set)
        for picture in pictures[:]:
            try:
                picture.cache_id = cache.get_id(picture.unicode_path)
            except ValueError:
                # Not in the cache: drop it, and keep it out of the dimension
                # groups too since it has no cache_id to compare with.
                pictures.remove(picture)
                continue
            id2picture[picture.cache_id] = picture
            if not self.match_scaled:
                dimensions2pictures[picture.dimensions].add(picture)
        pool = multiprocessing.Pool()
        try:
            async_results = []
            pictures_copy = set(pictures)
            for ref in j.iter_with_progress(pictures):
                others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
                # Removing the reference from the shared set makes sure each
                # pair is only submitted once.
                others.remove(ref)
                if others:
                    cache_ids = [f.cache_id for f in others]
                    args = (ref.cache_id, cache_ids, cache.dbname, self.threshold)
                    async_results.append(pool.apply_async(async_compare, args))
            matches = []
            for async_result in j.iter_with_progress(async_results, 'Matched %d/%d pictures'):
                matches.extend(async_result.get())
        finally:
            # Every result has been collected (or we are bailing out on an
            # error), so the worker processes can be stopped and reaped.
            pool.terminate()
            pool.join()
        verified = []
        for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
            ref = id2picture[ref_id]
            other = id2picture[other_id]
            if percentage == 100 and ref.md5 != other.md5:
                percentage = 99
            if percentage >= self.threshold:
                verified.append(get_match(ref, other, percentage))
        return verified
# Runs at import time. Per the multiprocessing documentation this call is what
# lets a frozen Windows executable spawn worker processes, and it has no
# effect otherwise.
multiprocessing.freeze_support()