1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2026-02-01 03:01:39 +00:00

Moved dupeguru.picture into 'pe/py'

--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4076
This commit is contained in:
hsoft
2009-06-18 19:11:10 +00:00
parent e5b93a74fa
commit 137c33439d
9 changed files with 4 additions and 4 deletions

0
pe/py/__init__.py Normal file
View File

124
pe/py/block.py Normal file
View File

@@ -0,0 +1,124 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture.block
Created By: Virgil Dupras
Created On: 2006/09/01
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-26 18:12:39 +0200 (Tue, 26 May 2009) $
$Revision: 4365 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
from _block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2
# Converted to Cython
# def getblock(image):
# """Returns a 3 sized tuple containing the mean color of 'image'.
#
# image: a PIL image or crop.
# """
# if image.size[0]:
# pixel_count = image.size[0] * image.size[1]
# red = green = blue = 0
# for r,g,b in image.getdata():
# red += r
# green += g
# blue += b
# return (red // pixel_count, green // pixel_count, blue // pixel_count)
# else:
# return (0,0,0)
# This is not used anymore
# def getblocks(image,blocksize):
# """Returns a list of blocks (3 sized tuples).
#
# image: A PIL image to base the blocks on.
# blocksize: The size of the blocks to be created. This is a single integer, defining
# both width and height (blocks are square).
# """
# if min(image.size) < blocksize:
# return ()
# result = []
# for i in xrange(image.size[1] // blocksize):
# for j in xrange(image.size[0] // blocksize):
# box = (blocksize * j, blocksize * i, blocksize * (j + 1), blocksize * (i + 1))
# crop = image.crop(box)
# result.append(getblock(crop))
# return result
# Converted to Cython
# def getblocks2(image,block_count_per_side):
# """Returns a list of blocks (3 sized tuples).
#
# image: A PIL image to base the blocks on.
# block_count_per_side: This integer determines the number of blocks the function will return.
# If it is 10, for example, 100 blocks will be returned (10 wide, 10 high). The blocks will not
# necessarily cover square areas. The area covered by each block will be proportional to the image
# itself.
# """
# if not image.size[0]:
# return []
# width,height = image.size
# block_width = max(width // block_count_per_side,1)
# block_height = max(height // block_count_per_side,1)
# result = []
# for ih in range(block_count_per_side):
# top = min(ih * block_height, height - block_height)
# bottom = top + block_height
# for iw in range(block_count_per_side):
# left = min(iw * block_width, width - block_width)
# right = left + block_width
# box = (left,top,right,bottom)
# crop = image.crop(box)
# result.append(getblock(crop))
# return result
# Converted to Cython
# def diff(first, second):
# """Returns the difference between the first block and the second.
#
# It returns an absolute sum of the 3 differences (RGB).
# """
# r1, g1, b1 = first
# r2, g2, b2 = second
# return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)
# Converted to Cython
# def avgdiff(first, second, limit=768, min_iterations=1):
# """Returns the average diff between first blocks and seconds.
#
# If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
# iterations have been made in the blocks.
# """
# if len(first) != len(second):
# raise DifferentBlockCountError
# if not first:
# raise NoBlocksError
# count = len(first)
# sum = 0
# zipped = izip(xrange(1, count + 1), first, second)
# for i, first, second in zipped:
# sum += diff(first, second)
# if sum > limit * i and i >= min_iterations:
# return limit + 1
# result = sum // count
# if (not result) and sum:
# result = 1
# return result
# This is not used anymore
# def maxdiff(first,second,limit=768):
# """Returns the max diff between first blocks and seconds.
#
# If the result surpasses limit, the first max being over limit is returned.
# """
# if len(first) != len(second):
# raise DifferentBlockCountError
# if not first:
# raise NoBlocksError
# result = 0
# zipped = zip(first,second)
# for first,second in zipped:
# result = max(result,diff(first,second))
# if result > limit:
# return result
# return result

134
pe/py/cache.py Normal file
View File

@@ -0,0 +1,134 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture.cache
Created By: Virgil Dupras
Created On: 2006/09/14
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import os
import logging
import sqlite3 as sqlite
import hsutil.sqlite
from _cache import string_to_colors
def colors_to_string(colors):
    """Encode the 3 sized tuples in 'colors' as a single hex string.

    Each color component is written as two lowercase hex digits, in order:
    [(0,100,255)] --> 0064ff
    [(1,2,3),(4,5,6)] --> 010203040506
    """
    encoded = []
    for red, green, blue in colors:
        encoded.append('%02x%02x%02x' % (red, green, blue))
    return ''.join(encoded)
# This function is an important bottleneck of dupeGuru PE. It has been converted to Cython.
# def string_to_colors(s):
# """Transform the string 's' in a list of 3 sized tuples.
# """
# result = []
# for i in xrange(0, len(s), 6):
# number = int(s[i:i+6], 16)
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
# return result
class Cache(object):
    """A class to cache picture blocks.

    Blocks are stored in the ``pictures`` table of a sqlite database, keyed by picture path and
    serialized with colors_to_string(). The instance behaves like a mapping from path to blocks;
    rows can also be addressed through their sqlite rowid (see __getitem__, get_id and
    get_multiple).
    """
    def __init__(self, db=':memory:', threaded=True):
        """Open the cache database, creating its schema when needed.

        db: path of the sqlite database file, or ':memory:'.
        threaded: if True, the connection is a hsutil.sqlite.ThreadedConn, otherwise a plain
                  sqlite3 connection opened with isolation_level=None.

        A database file that sqlite reports as corrupted is deleted and recreated empty.
        """
        def create_tables():
            self.con.execute("create table pictures(path TEXT, blocks TEXT)")
            self.con.execute("create index idx_path on pictures (path)")

        self.dbname = db
        self.con = self._connect(db, threaded)
        try:
            # Cheap probe: returns no row, but fails if the table is missing or the file is bad.
            self.con.execute("select * from pictures where 1=2")
        except sqlite.OperationalError: # new db
            create_tables()
        except sqlite.DatabaseError as e: # corrupted db
            logging.warning('Could not create picture cache because of an error: %s', str(e))
            self.con.close()
            os.remove(db)
            self.con = self._connect(db, threaded)
            create_tables()

    @staticmethod
    def _connect(db, threaded):
        """Return a new connection to 'db' of the kind selected by 'threaded'."""
        if threaded:
            return hsutil.sqlite.ThreadedConn(db, True)
        return sqlite.connect(db, isolation_level=None)

    def __contains__(self, key):
        """Return True if a picture with path 'key' is in the cache."""
        sql = "select count(*) from pictures where path = ?"
        result = self.con.execute(sql, [key]).fetchall()
        return result[0][0] > 0

    def __delitem__(self, key):
        """Remove the picture with path 'key'. Raises KeyError if it is not cached."""
        if key not in self:
            raise KeyError(key)
        sql = "delete from pictures where path = ?"
        self.con.execute(sql, [key])

    # Optimized
    def __getitem__(self, key):
        """Return the blocks stored for 'key'.

        An int key is interpreted as a rowid, anything else as a picture path.
        Raises KeyError when there is no matching row.
        """
        if isinstance(key, int):
            sql = "select blocks from pictures where rowid = ?"
        else:
            sql = "select blocks from pictures where path = ?"
        row = self.con.execute(sql, [key]).fetchone()
        if not row:
            raise KeyError(key)
        return string_to_colors(row[0])

    def __iter__(self):
        """Iterate over the paths of all cached pictures."""
        sql = "select path from pictures"
        result = self.con.execute(sql)
        return (row[0] for row in result)

    def __len__(self):
        """Return the number of cached pictures."""
        sql = "select count(*) from pictures"
        result = self.con.execute(sql).fetchall()
        return result[0][0]

    def __setitem__(self, key, value):
        """Store the blocks 'value' (3 sized tuples) under the path 'key'.

        Database errors are logged as warnings and otherwise ignored: failing to cache a picture
        does not raise.
        """
        value = colors_to_string(value)
        if key in self:
            sql = "update pictures set blocks = ? where path = ?"
        else:
            sql = "insert into pictures(blocks,path) values(?,?)"
        try:
            self.con.execute(sql, [value, key])
        except sqlite.OperationalError:
            logging.warning('Picture cache could not set %r for key %r', value, key)
        except sqlite.DatabaseError as e:
            logging.warning('DatabaseError while setting %r for key %r: %s', value, key, str(e))

    def clear(self):
        """Remove every picture from the cache."""
        sql = "delete from pictures"
        self.con.execute(sql)

    def filter(self, func):
        """Remove every cached path for which func(path) is false."""
        # The paths are collected first so that rows are not deleted while being iterated.
        to_delete = [key for key in self if not func(key)]
        for key in to_delete:
            del self[key]

    def get_id(self, path):
        """Return the rowid of the picture stored under 'path'. Raises ValueError if absent."""
        sql = "select rowid from pictures where path = ?"
        result = self.con.execute(sql, [path]).fetchone()
        if result:
            return result[0]
        else:
            raise ValueError(path)

    def get_multiple(self, rowids):
        """Return an iterator of (rowid, blocks) pairs for the rows whose id is in 'rowids'.

        Raises ValueError/TypeError if an element of 'rowids' is not convertible to an integer.
        """
        # The ids are inlined in the statement, so each one is forced through int() to make sure
        # that nothing but integer literals can end up in the SQL text.
        id_list = ','.join(str(int(rowid)) for rowid in rowids)
        sql = "select rowid, blocks from pictures where rowid in (%s)" % id_list
        cur = self.con.execute(sql)
        return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)

28
pe/py/gen.py Normal file
View File

@@ -0,0 +1,28 @@
#!/usr/bin/env python
# Unit Name: gen
# Created By: Virgil Dupras
# Created On: 2009-05-26
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
import os
import os.path as op
def move(src, dst):
    """Move the file 'src' to 'dst', replacing whatever file is already there.

    src: path of the file to move. If it does not exist, nothing happens.
    dst: destination file path, or an existing directory, in which case the file keeps its name
         and is moved inside that directory.
    """
    if not op.exists(src):
        return
    if op.isdir(dst):
        # A directory (such as '.') can neither be removed with os.remove() nor be the target of
        # the rename, so aim at the file inside it instead.
        dst = op.join(dst, op.basename(src))
    if op.exists(dst):
        os.remove(dst)
    print('Moving %s --> %s' % (src, dst))
    os.rename(src, dst)
# Build each Cython extension in place, from inside its own directory (setup.py refers to its
# .pyx file with a relative path), then come back to the starting directory.
for module in ('block', 'cache'):
    os.chdir(op.join('modules', module))
    os.system('python setup.py build_ext --inplace')
    os.chdir(op.join('..', '..'))
# Bring whichever binary was produced (.so or .pyd) next to this script. The destination is
# spelled out as a file name: passing the directory '.' would make move() try to delete it.
for module in ('block', 'cache'):
    for extension in ('.so', '.pyd'):
        binary = '_%s%s' % (module, extension)
        move(op.join('modules', module, binary), binary)

136
pe/py/matchbase.py Normal file
View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture._match
Created By: Virgil Dupras
Created On: 2007/02/25
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
$Revision: 4388 $
Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
"""
import logging
import multiprocessing
from Queue import Empty
from collections import defaultdict
from hsutil import job
from hsutil.misc import dedupe
from dupeguru.engine import Match
from block import avgdiff, DifferentBlockCountError, NoBlocksError
from cache import Cache
# Passed to avgdiff() as 'min_iterations': the number of block pairs that must have been compared
# before avgdiff() is allowed to give up early because the difference limit was exceeded.
MIN_ITERATIONS = 3
def get_match(first,second,percentage):
    """Return a Match of 'first' and 'second'; a negative 'percentage' is reported as 0."""
    return Match(first, second, 0 if percentage < 0 else percentage)
class MatchFactory(object):
    """Base class of picture matchers.

    getmatches() prepares the pictures (making sure their blocks are in 'cached_blocks') and then
    delegates the comparison itself to _do_getmatches(), which subclasses must implement.
    """
    # Block cache used by prepare_files(), indexed by unicode path. It is None by default and has
    # to be assigned before files are prepared.
    cached_blocks = None
    # Argument given to picture.get_blocks() for pictures that are not in the cache yet.
    block_count_per_side = 15
    # Minimum percentage, used by AsyncMatchFactory to decide which pairs are kept.
    threshold = 75
    # When False, AsyncMatchFactory only compares pictures that have the same dimensions.
    match_scaled = False

    def _do_getmatches(self, files, j):
        """Compare the prepared 'files' and return the matches. Implemented by subclasses."""
        raise NotImplementedError()

    def getmatches(self, files, j=job.nulljob):
        """Prepare 'files', then return whatever _do_getmatches() yields for the usable ones."""
        # The MemoryError handlers in there use logging without first caring about whether or not
        # there is enough memory left to carry on the operation because it is assumed that the
        # MemoryError happens when trying to read an image file, which is freed from memory by the
        # time that MemoryError is raised.
        j = j.start_subjob([2, 8])
        logging.info('Preparing %d files' % len(files))
        usable = self.prepare_files(files, j)
        logging.info('Finished preparing %d files' % len(usable))
        return self._do_getmatches(usable, j)

    def prepare_files(self, files, j=job.nulljob):
        """Make sure the blocks of every picture in 'files' are cached.

        Returns the list of pictures for which that worked. Each picture also gets a
        'unicode_path' attribute. IOError on a picture is logged and the picture is skipped; so
        is MemoryError when the picture is 10 MB or larger. A MemoryError on a smaller picture
        stops the whole preparation, and what was prepared so far is returned.
        """
        large_picture_size = 10 * 1024 * 1024
        ready = [] # only files for which there was no error getting blocks
        try:
            for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
                picture.dimensions
                picture.unicode_path = unicode(picture.path)
                try:
                    if picture.unicode_path not in self.cached_blocks:
                        self.cached_blocks[picture.unicode_path] = picture.get_blocks(self.block_count_per_side)
                    ready.append(picture)
                except IOError as e:
                    logging.warning(unicode(e))
                except MemoryError:
                    logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
                    if picture.size < large_picture_size: # We're really running out of memory
                        raise
        except MemoryError:
            logging.warning('Ran out of memory while preparing files')
        return ready
def async_compare(ref_id, other_ids, dbname, threshold):
    """Compare the blocks of one cached picture with those of several others.

    This is the function AsyncMatchFactory submits to its process pool, so it opens its own
    non-threaded connection to the cache database 'dbname'.

    ref_id: cache rowid of the reference picture.
    other_ids: cache rowids of the pictures to compare it with.
    threshold: minimum percentage for a pair to be reported.

    Returns a list of (ref_id, other_id, percentage) tuples for the pairs reaching 'threshold'.
    Raises KeyError if 'ref_id' is not in the cache.
    """
    cache = Cache(dbname, threaded=False)
    try:
        limit = 100 - threshold
        ref_blocks = cache[ref_id]
        results = []
        for other_id, other_blocks in cache.get_multiple(other_ids):
            try:
                diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
                percentage = 100 - diff
            except (DifferentBlockCountError, NoBlocksError):
                # Block lists that cannot be compared are treated as completely different.
                percentage = 0
            if percentage >= threshold:
                results.append((ref_id, other_id, percentage))
        return results
    finally:
        # Close the connection even when the lookup or the comparison raised.
        cache.con.close()
class AsyncMatchFactory(MatchFactory):
    """MatchFactory that spreads the block comparisons over a multiprocessing pool."""
    def _do_getmatches(self, pictures, j):
        """Return the list of matches found among 'pictures'.

        pictures: prepared pictures (they must have a 'unicode_path' attribute). Pictures that
                  are not in the block cache are removed from this list, which is thus modified
                  in place.
        j: job used to report progress.

        Every picture is compared with each other one only once: with all the others when
        'match_scaled' is set, otherwise only with the pictures having the same dimensions.
        """
        j = j.start_subjob([1, 8, 1], 'Preparing for matching')
        cache = self.cached_blocks
        id2picture = {}
        dimensions2pictures = defaultdict(set)
        for picture in pictures[:]:
            try:
                picture.cache_id = cache.get_id(picture.unicode_path)
            except ValueError:
                # Not in the cache: the picture can't be compared. It must stay out of the
                # dimension buckets as well, since it has no cache_id to send to the workers.
                pictures.remove(picture)
                continue
            id2picture[picture.cache_id] = picture
            if not self.match_scaled:
                dimensions2pictures[picture.dimensions].add(picture)
        pool = multiprocessing.Pool()
        try:
            async_results = []
            pictures_copy = set(pictures)
            for ref in j.iter_with_progress(pictures):
                others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
                # Taking ref out of the set guarantees that a pair is never compared twice.
                others.remove(ref)
                if others:
                    cache_ids = [f.cache_id for f in others]
                    args = (ref.cache_id, cache_ids, cache.dbname, self.threshold)
                    async_results.append(pool.apply_async(async_compare, args))
            matches = []
            for async_result in j.iter_with_progress(async_results, 'Matched %d/%d pictures'):
                matches.extend(async_result.get())
        finally:
            # Every result has been fetched at this point (or an error occurred), so the workers
            # can be stopped right away instead of being left behind.
            pool.terminate()
            pool.join()
        result = []
        for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
            ref = id2picture[ref_id]
            other = id2picture[other_id]
            # Identical blocks are only reported as 100% when the md5 of the files agree too.
            if percentage == 100 and ref.md5 != other.md5:
                percentage = 99
            if percentage >= self.threshold:
                result.append(get_match(ref, other, percentage))
        return result
# Support for frozen Windows executables that use multiprocessing (a Pool is created by
# AsyncMatchFactory); according to the multiprocessing documentation the call has no effect in
# other situations.
# NOTE(review): the documentation expects this call right after the main module's
# "if __name__ == '__main__'" line -- confirm that calling it at import time here is intended.
multiprocessing.freeze_support()

View File

@@ -0,0 +1,93 @@
# Created By: Virgil Dupras
# Created On: 2009-04-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
cdef extern from "stdlib.h":
int abs(int n) # required so that abs() is applied on ints, not python objects
class NoBlocksError(Exception):
    """avgdiff has been called with empty block lists."""
class DifferentBlockCountError(Exception):
    """avgdiff has been called with 2 block lists of different size."""
cdef object getblock(object image):
    """Returns a 3 sized tuple containing the mean color of 'image'.

    image: a PIL image or crop.

    An image without any pixel (zero width or zero height) yields (0, 0, 0).
    """
    # The sums are kept in 64-bit integers so that a very large block cannot overflow them.
    cdef long long pixel_count, red, green, blue
    cdef int r, g, b
    pixel_count = image.size[0] * image.size[1]
    if not pixel_count:
        # Checking the pixel count rather than the width alone also covers zero-height images,
        # which would otherwise end up in a division by zero below.
        return (0, 0, 0)
    red = green = blue = 0
    for r, g, b in image.getdata():
        red += r
        green += g
        blue += b
    return (red // pixel_count, green // pixel_count, blue // pixel_count)
def getblocks2(image, int block_count_per_side):
    """Returns the list of mean colors (3 sized tuples) of a grid of areas of 'image'.

    image: A PIL image to base the blocks on.
    block_count_per_side: The number of rows and of columns of the grid. If it is 10, for
        example, 100 blocks are returned, row by row. The areas are not necessarily square:
        their size is proportional to the size of the image, with a minimum of 1 pixel per side.

    An image with a width of 0 yields an empty list.
    """
    cdef int width, height, block_width, block_height, row, column, top, left
    if not image.size[0]:
        return []
    width, height = image.size
    block_width = max(width // block_count_per_side, 1)
    block_height = max(height // block_count_per_side, 1)
    blocks = []
    for row in range(block_count_per_side):
        # Clamped so that the area never extends past the bottom (or right) edge of the image.
        top = min(row * block_height, height - block_height)
        for column in range(block_count_per_side):
            left = min(column * block_width, width - block_width)
            area = image.crop((left, top, left + block_width, top + block_height))
            blocks.append(getblock(area))
    return blocks
cdef int diff(first, second) except -1:
    """Returns the difference between the first block and the second.

    It returns an absolute sum of the 3 differences (RGB).

    The result is never negative, which makes -1 available as the error return value: an
    exception raised while unpacking the blocks is propagated to the caller rather than ignored.
    """
    cdef int r1, g1, b1, r2, g2, b2
    r1, g1, b1 = first
    r2, g2, b2 = second
    return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)
def avgdiff(first, second, int limit=768, int min_iterations=1):
    """Returns the average diff between first blocks and seconds.

    first, second: block lists (3 sized tuples) of the same, non-zero, length.
    limit: If the running average surpasses limit, limit + 1 is returned, except if less than
        min_iterations iterations have been made in the blocks.
    min_iterations: minimum number of block pairs to compare before giving up early.

    A non-zero total difference never results in an average of 0: it is rounded up to 1.

    Raises DifferentBlockCountError if the lists have different lengths and NoBlocksError if
    they are empty.
    """
    cdef int count, total, i, iteration_count
    count = len(first)
    if count != len(second):
        raise DifferentBlockCountError()
    if not count:
        raise NoBlocksError()
    total = 0
    for i in range(count):
        iteration_count = i + 1
        total += diff(first[i], second[i])
        # Same as "average so far > limit", written without a division.
        if total > limit * iteration_count and iteration_count >= min_iterations:
            return limit + 1
    result = total // count
    if (not result) and total:
        result = 1
    return result

View File

@@ -0,0 +1,14 @@
#!/usr/bin/env python
# Created By: Virgil Dupras
# Created On: 2009-04-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
# Compile block.pyx into an extension module named "_block", with Cython's build_ext command.
setup(
    cmdclass = {'build_ext': build_ext},
    ext_modules = [Extension("_block", ["block.pyx"])]
)

34
pe/py/modules/cache/cache.pyx vendored Normal file
View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python
# Created By: Virgil Dupras
# Created On: 2009-04-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
# ok, this is hacky and stuff, but I don't know C well enough to play with char buffers, copy
# them around and stuff
cdef int xchar_to_int(char c):
    """Returns the value (0-15) of the ASCII hex digit 'c', or 0 if 'c' is not a hex digit."""
    if c >= 97 and c <= 102: # a-f
        return c - 87
    if c >= 65 and c <= 70: # A-F
        return c - 55
    if c >= 48 and c <= 57: # 0-9
        return c - 48
    return 0
def string_to_colors(s):
    """Transform the hex string 's' into a list of 3 sized tuples.

    Every group of 6 characters gives one (r, g, b) tuple, 2 hex digits per component. Trailing
    characters that do not make a complete group are ignored.
    """
    cdef int offset, usable_length, r, g, b
    cdef char* chars
    colors = []
    usable_length = len(s)
    usable_length = usable_length - (usable_length % 6)
    chars = s
    for offset in range(0, usable_length, 6):
        r = (xchar_to_int(chars[offset]) << 4) + xchar_to_int(chars[offset+1])
        g = (xchar_to_int(chars[offset+2]) << 4) + xchar_to_int(chars[offset+3])
        b = (xchar_to_int(chars[offset+4]) << 4) + xchar_to_int(chars[offset+5])
        colors.append((r, g, b))
    return colors

14
pe/py/modules/cache/setup.py vendored Normal file
View File

@@ -0,0 +1,14 @@
#!/usr/bin/env python
# Created By: Virgil Dupras
# Created On: 2009-04-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
# Compile cache.pyx into an extension module named "_cache", with Cython's build_ext command.
setup(
    cmdclass = {'build_ext': build_ext},
    ext_modules = [Extension("_cache", ["cache.pyx"])]
)