1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2026-01-23 07:01:39 +00:00

Initial commit.

--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402
This commit is contained in:
hsoft
2009-06-01 09:55:11 +00:00
parent 4f197ffd5a
commit e9a97afdf8
354 changed files with 38083 additions and 0 deletions

0
py/picture/__init__.py Normal file
View File

124
py/picture/block.py Normal file
View File

@@ -0,0 +1,124 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture.block
Created By: Virgil Dupras
Created On: 2006/09/01
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-26 18:12:39 +0200 (Tue, 26 May 2009) $
$Revision: 4365 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
from _block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2
# Converted to Cython
# def getblock(image):
# """Returns a 3 sized tuple containing the mean color of 'image'.
#
# image: a PIL image or crop.
# """
# if image.size[0]:
# pixel_count = image.size[0] * image.size[1]
# red = green = blue = 0
# for r,g,b in image.getdata():
# red += r
# green += g
# blue += b
# return (red // pixel_count, green // pixel_count, blue // pixel_count)
# else:
# return (0,0,0)
# This is not used anymore
# def getblocks(image,blocksize):
# """Returns a list of blocks (3 sized tuples).
#
# image: A PIL image to base the blocks on.
# blocksize: The size of the blocks to be created. This is a single integer, defining
# both width and height (blocks are square).
# """
# if min(image.size) < blocksize:
# return ()
# result = []
# for i in xrange(image.size[1] // blocksize):
# for j in xrange(image.size[0] // blocksize):
# box = (blocksize * j, blocksize * i, blocksize * (j + 1), blocksize * (i + 1))
# crop = image.crop(box)
# result.append(getblock(crop))
# return result
# Converted to Cython
# def getblocks2(image,block_count_per_side):
# """Returns a list of blocks (3 sized tuples).
#
# image: A PIL image to base the blocks on.
# block_count_per_side: This integer determines the number of blocks the function will return.
# If it is 10, for example, 100 blocks will be returned (10 width, 10 height). The blocks will not
# necessarily cover square areas. The area covered by each block will be proportional to the image
# itself.
# """
# if not image.size[0]:
# return []
# width,height = image.size
# block_width = max(width // block_count_per_side,1)
# block_height = max(height // block_count_per_side,1)
# result = []
# for ih in range(block_count_per_side):
# top = min(ih * block_height, height - block_height)
# bottom = top + block_height
# for iw in range(block_count_per_side):
# left = min(iw * block_width, width - block_width)
# right = left + block_width
# box = (left,top,right,bottom)
# crop = image.crop(box)
# result.append(getblock(crop))
# return result
# Converted to Cython
# def diff(first, second):
# """Returns the difference between the first block and the second.
#
# It returns an absolute sum of the 3 differences (RGB).
# """
# r1, g1, b1 = first
# r2, g2, b2 = second
# return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)
# Converted to Cython
# def avgdiff(first, second, limit=768, min_iterations=1):
# """Returns the average diff between first blocks and seconds.
#
# If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
# iterations have been made in the blocks.
# """
# if len(first) != len(second):
# raise DifferentBlockCountError
# if not first:
# raise NoBlocksError
# count = len(first)
# sum = 0
# zipped = izip(xrange(1, count + 1), first, second)
# for i, first, second in zipped:
# sum += diff(first, second)
# if sum > limit * i and i >= min_iterations:
# return limit + 1
# result = sum // count
# if (not result) and sum:
# result = 1
# return result
# This is not used anymore
# def maxdiff(first,second,limit=768):
# """Returns the max diff between first blocks and seconds.
#
# If the result surpasses limit, the first max being over limit is returned.
# """
# if len(first) != len(second):
# raise DifferentBlockCountError
# if not first:
# raise NoBlocksError
# result = 0
# zipped = zip(first,second)
# for first,second in zipped:
# result = max(result,diff(first,second))
# if result > limit:
# return result
# return result

313
py/picture/block_test.py Normal file
View File

@@ -0,0 +1,313 @@
#!/usr/bin/env python
"""
Unit Name: tests.picture.block
Created By: Virgil Dupras
Created On: 2006/09/01
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
# The commented-out tests are tests for functions that have been converted to pure C for speed
import unittest
from .block import *
def my_avgdiff(first, second, limit=768, min_iter=3):
    """Call avgdiff() with the defaults used throughout this test module.

    This wrapper exists so that the tests do not have to spell out the limit
    and the minimum iteration count on every call.
    """
    call_args = (first, second, limit, min_iter)
    return avgdiff(*call_args)
# Reference (red, green, blue) triples used as pixel and block values in the tests below.
BLACK = (0, 0, 0)
RED = (255, 0, 0)
GREEN = (0, 255, 0)
BLUE = (0, 0, 255)
class FakeImage(object):
    """Minimal in-memory stand-in for an image object, used by the tests.

    size: a (width, height) pair.
    data: a flat sequence of pixels, stored row after row.
    """
    def __init__(self, size, data):
        self.size = size
        self.data = data

    def getdata(self):
        """Return the flat pixel sequence given at construction."""
        return self.data

    def crop(self, box):
        """Return a new FakeImage holding the pixels inside 'box'.

        box: a (left, top, right, bottom) sequence; right and bottom are exclusive.
        """
        left, top, right, bottom = box[0], box[1], box[2], box[3]
        row_length = self.size[0]
        cropped = [
            self.data[row * row_length + column]
            for row in range(top, bottom)
            for column in range(left, right)
        ]
        return FakeImage((right - left, bottom - top), cropped)
def empty():
    """Return a FakeImage of size (0, 0) without any pixel."""
    no_pixels = []
    return FakeImage((0, 0), no_pixels)
def single_pixel():
    """Return a 1x1 FakeImage whose only pixel is pure red."""
    red_pixel = (0xff, 0, 0)
    return FakeImage((1, 1), [red_pixel])
def four_pixels():
    """Return a 2x2 FakeImage made of four different colors."""
    top_row = [RED, (0, 0x80, 0xff)]
    bottom_row = [(0x80, 0, 0), (0, 0x40, 0x80)]
    return FakeImage((2, 2), top_row + bottom_row)
class TCgetblock(unittest.TestCase):
    """Mean color of an image, checked by asking getblocks2() for a single block."""
    def test_single_pixel(self):
        [block] = getblocks2(single_pixel(), 1)
        self.assertEqual(RED, block)

    def test_no_pixel(self):
        blocks = getblocks2(empty(), 1)
        self.assertEqual([], blocks)

    def test_four_pixels(self):
        [block] = getblocks2(four_pixels(), 1)
        # Each channel is the floored mean over the four pixels.
        expected = (
            (0xff + 0x80) // 4,
            (0x80 + 0x40) // 4,
            (0xff + 0x80) // 4,
        )
        self.assertEqual(expected, block)
# class TCdiff(unittest.TestCase):
# def test_diff(self):
# b1 = (10, 20, 30)
# b2 = (1, 2, 3)
# self.assertEqual(9 + 18 + 27,diff(b1,b2))
#
# def test_diff_negative(self):
# b1 = (10, 20, 30)
# b2 = (1, 2, 3)
# self.assertEqual(9 + 18 + 27,diff(b2,b1))
#
# def test_diff_mixed_positive_and_negative(self):
# b1 = (1, 5, 10)
# b2 = (10, 1, 15)
# self.assertEqual(9 + 4 + 5,diff(b1,b2))
#
# class TCgetblocks(unittest.TestCase):
# def test_empty_image(self):
# im = empty()
# blocks = getblocks(im,1)
# self.assertEqual(0,len(blocks))
#
# def test_one_block_image(self):
# im = four_pixels()
# blocks = getblocks2(im, 1)
# self.assertEqual(1,len(blocks))
# block = blocks[0]
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# self.assertEqual((meanred,meangreen,meanblue),block)
#
# def test_not_enough_height_to_fit_a_block(self):
# im = FakeImage((2,1), [BLACK, BLACK])
# blocks = getblocks(im,2)
# self.assertEqual(0,len(blocks))
#
# def xtest_dont_include_leftovers(self):
# # this test is disabled because getblocks is not used and getblock is cdef'ed
# pixels = [
# RED,(0,0x80,0xff),BLACK,
# (0x80,0,0),(0,0x40,0x80),BLACK,
# BLACK,BLACK,BLACK
# ]
# im = FakeImage((3,3), pixels)
# blocks = getblocks(im,2)
# block = blocks[0]
# #Because the block is smaller than the image, only blocksize must be considered.
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# self.assertEqual((meanred,meangreen,meanblue),block)
#
# def xtest_two_blocks(self):
# # this test is disabled because getblocks is not used and getblock is cdef'ed
# pixels = [BLACK for i in xrange(4 * 2)]
# pixels[0] = RED
# pixels[1] = (0,0x80,0xff)
# pixels[4] = (0x80,0,0)
# pixels[5] = (0,0x40,0x80)
# im = FakeImage((4, 2), pixels)
# blocks = getblocks(im,2)
# self.assertEqual(2,len(blocks))
# block = blocks[0]
# #Because the block is smaller than the image, only blocksize must be considered.
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# self.assertEqual((meanred,meangreen,meanblue),block)
# self.assertEqual(BLACK,blocks[1])
#
# def test_four_blocks(self):
# pixels = [BLACK for i in xrange(4 * 4)]
# pixels[0] = RED
# pixels[1] = (0,0x80,0xff)
# pixels[4] = (0x80,0,0)
# pixels[5] = (0,0x40,0x80)
# im = FakeImage((4, 4), pixels)
# blocks = getblocks2(im, 2)
# self.assertEqual(4,len(blocks))
# block = blocks[0]
# #Because the block is smaller than the image, only blocksize must be considered.
# meanred = (0xff + 0x80) // 4
# meangreen = (0x80 + 0x40) // 4
# meanblue = (0xff + 0x80) // 4
# self.assertEqual((meanred,meangreen,meanblue),block)
# self.assertEqual(BLACK,blocks[1])
# self.assertEqual(BLACK,blocks[2])
# self.assertEqual(BLACK,blocks[3])
#
class TCgetblocks2(unittest.TestCase):
    """getblocks2(image, n) is expected to return n * n blocks, row after row."""
    def test_empty_image(self):
        blocks = getblocks2(empty(), 1)
        self.assertEqual(0, len(blocks))

    def test_one_block_image(self):
        blocks = getblocks2(four_pixels(), 1)
        self.assertEqual(1, len(blocks))
        expected = (
            (0xff + 0x80) // 4,
            (0x80 + 0x40) // 4,
            (0xff + 0x80) // 4,
        )
        self.assertEqual(expected, blocks[0])

    def test_four_blocks_all_black(self):
        all_black = FakeImage((2, 2), [BLACK] * 4)
        blocks = getblocks2(all_black, 2)
        self.assertEqual(4, len(blocks))
        for block in blocks:
            self.assertEqual(BLACK, block)

    def test_two_pixels_image_horizontal(self):
        # One row of two pixels: the single row is used for both block rows.
        image = FakeImage((2, 1), [RED, BLUE])
        blocks = getblocks2(image, 2)
        self.assertEqual(4, len(blocks))
        for expected, block in zip([RED, BLUE, RED, BLUE], blocks):
            self.assertEqual(expected, block)

    def test_two_pixels_image_vertical(self):
        # One column of two pixels: the single column is used for both block columns.
        image = FakeImage((1, 2), [RED, BLUE])
        blocks = getblocks2(image, 2)
        self.assertEqual(4, len(blocks))
        for expected, block in zip([RED, RED, BLUE, BLUE], blocks):
            self.assertEqual(expected, block)
class TCavgdiff(unittest.TestCase):
    """Tests for avgdiff(), called through my_avgdiff (limit=768, min_iter=3 by default)."""
    def test_empty(self):
        self.assertRaises(NoBlocksError, my_avgdiff, [], [])

    def test_two_blocks(self):
        b1 = (5,10,15)
        b2 = (255,250,245)
        b3 = (0,0,0)
        b4 = (255,0,255)
        blocks1 = [b1,b2]
        blocks2 = [b3,b4]
        # The diff of two blocks is the sum of the absolute differences of their channels.
        expected1 = 5 + 10 + 15
        expected2 = 0 + 250 + 10
        expected = (expected1 + expected2) // 2
        self.assertEqual(expected, my_avgdiff(blocks1, blocks2))

    def test_blocks_not_the_same_size(self):
        b = (0,0,0)
        self.assertRaises(DifferentBlockCountError,my_avgdiff,[b,b],[b])

    def test_first_arg_is_empty_but_not_second(self):
        #Don't return 0 (as when the 2 lists are empty), raise!
        b = (0,0,0)
        self.assertRaises(DifferentBlockCountError,my_avgdiff,[],[b])

    def test_limit(self):
        # The "avg" comments give the running average after each block.
        ref = (0,0,0)
        b1 = (10,10,10) #avg 30
        b2 = (20,20,20) #avg 45
        b3 = (30,30,30) #avg 60
        blocks1 = [ref,ref,ref]
        blocks2 = [b1,b2,b3]
        # Going over the limit yields limit + 1.
        self.assertEqual(45,my_avgdiff(blocks1,blocks2,44))

    def test_min_iterations(self):
        # The running average is over the limit after 2 blocks, but the limit is only enforced
        # from the 3rd iteration on, at which point the average is back under it.
        ref = (0,0,0)
        b1 = (10,10,10) #avg 30
        b2 = (20,20,20) #avg 45
        b3 = (10,10,10) #avg 40
        blocks1 = [ref,ref,ref]
        blocks2 = [b1,b2,b3]
        self.assertEqual(40,my_avgdiff(blocks1,blocks2,45 - 1,3))

    # Bah, I don't know why this test fails, but I don't think it matters very much
    # def test_just_over_the_limit(self):
    #     #A score just over the limit might return exactly the limit due to truncating. We should
    #     #ceil() the result in this case.
    #     ref = (0,0,0)
    #     b1 = (10,0,0)
    #     b2 = (11,0,0)
    #     blocks1 = [ref,ref]
    #     blocks2 = [b1,b2]
    #     self.assertEqual(11,my_avgdiff(blocks1,blocks2,10))
    #
    def test_return_at_least_1_at_the_slightest_difference(self):
        ref = (0,0,0)
        b1 = (1,0,0)
        # Two separate lists of 250 identical blocks; only the first block of blocks2 differs.
        blocks1 = [ref] * 250
        blocks2 = [ref] * 250
        blocks2[0] = b1
        self.assertEqual(1,my_avgdiff(blocks1,blocks2))

    def test_return_0_if_there_is_no_difference(self):
        ref = (0,0,0)
        blocks1 = [ref,ref]
        blocks2 = [ref,ref]
        self.assertEqual(0,my_avgdiff(blocks1,blocks2))
# class TCmaxdiff(unittest.TestCase):
# def test_empty(self):
# self.assertRaises(NoBlocksError,maxdiff,[],[])
#
# def test_two_blocks(self):
# b1 = (5,10,15)
# b2 = (255,250,245)
# b3 = (0,0,0)
# b4 = (255,0,255)
# blocks1 = [b1,b2]
# blocks2 = [b3,b4]
# expected1 = 5 + 10 + 15
# expected2 = 0 + 250 + 10
# expected = max(expected1,expected2)
# self.assertEqual(expected,maxdiff(blocks1,blocks2))
#
# def test_blocks_not_the_same_size(self):
# b = (0,0,0)
# self.assertRaises(DifferentBlockCountError,maxdiff,[b,b],[b])
#
# def test_first_arg_is_empty_but_not_second(self):
# #Don't return 0 (as when the 2 lists are empty), raise!
# b = (0,0,0)
# self.assertRaises(DifferentBlockCountError,maxdiff,[],[b])
#
# def test_limit(self):
# b1 = (5,10,15)
# b2 = (255,250,245)
# b3 = (0,0,0)
# b4 = (255,0,255)
# blocks1 = [b1,b2]
# blocks2 = [b3,b4]
# expected1 = 5 + 10 + 15
# expected2 = 0 + 250 + 10
# self.assertEqual(expected1,maxdiff(blocks1,blocks2,expected1 - 1))
#
# Run the tests of this module when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

134
py/picture/cache.py Normal file
View File

@@ -0,0 +1,134 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture.cache
Created By: Virgil Dupras
Created On: 2006/09/14
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
$Revision: 4392 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import os
import logging
import sqlite3 as sqlite
import hsutil.sqlite
from _cache import string_to_colors
def colors_to_string(colors):
    """Serialize 3 sized color tuples as one hex string, 6 lowercase digits per color.

    [(0,100,255)] --> 0064ff
    [(1,2,3),(4,5,6)] --> 010203040506
    """
    hex_color = '%02x%02x%02x'
    return ''.join(hex_color % (red, green, blue) for red, green, blue in colors)
# This function is an important bottleneck of dupeGuru PE. It has been converted to Cython.
# def string_to_colors(s):
# """Transform the string 's' in a list of 3 sized tuples.
# """
# result = []
# for i in xrange(0, len(s), 6):
# number = int(s[i:i+6], 16)
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
# return result
class Cache(object):
    """A class to cache picture blocks.

    Blocks are kept in a sqlite table, pictures(path, blocks), where they are stored as the hex
    string produced by colors_to_string(). An entry can be read by path or by integer rowid.
    """
    def __init__(self, db=':memory:', threaded=True):
        """Open the cache database 'db', creating its tables if needed.

        db: sqlite database name. The default is an in-memory database.
        threaded: if true, the connection is a hsutil.sqlite.ThreadedConn. Otherwise, it's a
                  plain sqlite connection in autocommit mode.

        A database that sqlite can't read is deleted and re-created empty.
        """
        self.dbname = db
        self.con = self._connect(db, threaded)
        try:
            self.con.execute("select * from pictures where 1=2")
        except sqlite.OperationalError: # new db
            self._create_tables()
        except sqlite.DatabaseError as e: # corrupted db
            logging.warning('Could not create picture cache because of an error: %s', str(e))
            self.con.close()
            os.remove(db)
            self.con = self._connect(db, threaded)
            self._create_tables()

    @staticmethod
    def _connect(db, threaded):
        """Return a new connection to 'db', of the kind selected by 'threaded'."""
        if threaded:
            return hsutil.sqlite.ThreadedConn(db, True)
        return sqlite.connect(db, isolation_level=None)

    def _create_tables(self):
        """Create the pictures table and its index on path."""
        self.con.execute("create table pictures(path TEXT, blocks TEXT)")
        self.con.execute("create index idx_path on pictures (path)")

    def __contains__(self, key):
        sql = "select count(*) from pictures where path = ?"
        result = self.con.execute(sql, [key]).fetchall()
        return result[0][0] > 0

    def __delitem__(self, key):
        """Remove the entry for path 'key'. Raises KeyError if there is none."""
        if key not in self:
            raise KeyError(key)
        sql = "delete from pictures where path = ?"
        self.con.execute(sql, [key])

    # Optimized
    def __getitem__(self, key):
        """Return the blocks stored for 'key'. Raises KeyError if there is none.

        key: a rowid if it is an int, a path otherwise.
        """
        if isinstance(key, int):
            sql = "select blocks from pictures where rowid = ?"
        else:
            sql = "select blocks from pictures where path = ?"
        result = self.con.execute(sql, [key]).fetchone()
        if result:
            result = string_to_colors(result[0])
            return result
        else:
            raise KeyError(key)

    def __iter__(self):
        """Iterate over the cached paths."""
        sql = "select path from pictures"
        result = self.con.execute(sql)
        return (row[0] for row in result)

    def __len__(self):
        sql = "select count(*) from pictures"
        result = self.con.execute(sql).fetchall()
        return result[0][0]

    def __setitem__(self, key, value):
        """Store the blocks 'value' under path 'key', replacing any previous entry.

        Database errors are logged rather than raised: failing to cache is not fatal.
        """
        value = colors_to_string(value)
        if key in self:
            sql = "update pictures set blocks = ? where path = ?"
        else:
            sql = "insert into pictures(blocks,path) values(?,?)"
        try:
            self.con.execute(sql, [value, key])
        except sqlite.OperationalError:
            logging.warning('Picture cache could not set %r for key %r', value, key)
        except sqlite.DatabaseError as e:
            logging.warning('DatabaseError while setting %r for key %r: %s', value, key, str(e))

    def clear(self):
        """Remove every entry from the cache."""
        sql = "delete from pictures"
        self.con.execute(sql)

    def filter(self, func):
        """Remove every entry whose path makes func(path) false."""
        # The paths are collected first so that rows are not deleted while being iterated.
        to_delete = [key for key in self if not func(key)]
        for key in to_delete:
            del self[key]

    def get_id(self, path):
        """Return the rowid of the entry for 'path'. Raises ValueError if there is none."""
        sql = "select rowid from pictures where path = ?"
        result = self.con.execute(sql, [path]).fetchone()
        if result:
            return result[0]
        else:
            raise ValueError(path)

    def get_multiple(self, rowids):
        """Return an iterator of (rowid, blocks) for the entries having one of 'rowids'.

        Raises ValueError (or TypeError) if one of 'rowids' can't be converted to an int.
        """
        # The ids are written into the statement itself (sqlite limits the number of bound
        # parameters per statement), so each of them is forced through int() first to make
        # sure that nothing but numbers ends up in the SQL.
        id_list = ','.join(str(int(rowid)) for rowid in rowids)
        sql = "select rowid, blocks from pictures where rowid in (%s)" % id_list
        cur = self.con.execute(sql)
        return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)

159
py/picture/cache_test.py Normal file
View File

@@ -0,0 +1,159 @@
#!/usr/bin/env python
"""
Unit Name: tests.picture.cache
Created By: Virgil Dupras
Created On: 2006/09/14
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
$Revision: 4385 $
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
"""
import unittest
from StringIO import StringIO
import os.path as op
import os
import threading
from hsutil.testcase import TestCase
from .cache import *
class TCcolors_to_string(unittest.TestCase):
    """colors_to_string() turns 3 sized tuples into a hex string."""
    def test_no_color(self):
        serialized = colors_to_string([])
        self.assertEqual('', serialized)

    def test_single_color(self):
        cases = [
            ('000000', (0, 0, 0)),
            ('010101', (1, 1, 1)),
            ('0a141e', (10, 20, 30)),
        ]
        for expected, color in cases:
            self.assertEqual(expected, colors_to_string([color]))

    def test_two_colors(self):
        serialized = colors_to_string([(0, 1, 2), (3, 4, 5)])
        self.assertEqual('000102030405', serialized)
class TCstring_to_colors(unittest.TestCase):
    """string_to_colors() turns a hex string back into 3 sized tuples."""
    def test_empty(self):
        colors = string_to_colors('')
        self.assertEqual([], colors)

    def test_single_color(self):
        cases = [
            ((0, 0, 0), '000000'),
            ((2, 3, 4), '020304'),
            ((10, 20, 30), '0a141e'),
        ]
        for expected, serialized in cases:
            self.assertEqual([expected], string_to_colors(serialized))

    def test_two_colors(self):
        colors = string_to_colors('0a141e28323c')
        self.assertEqual([(10, 20, 30), (40, 50, 60)], colors)

    def test_incomplete_color(self):
        # don't return anything if it's not a complete color
        colors = string_to_colors('102')
        self.assertEqual([], colors)
class TCCache(TestCase):
    """Behavior of Cache as a mapping of picture paths to blocks."""
    # NOTE(review): assertTrue() assumes hsutil.testcase.TestCase derives from
    # unittest.TestCase, as its use of assertEqual/assertRaises suggests. Confirm.
    def test_empty(self):
        c = Cache()
        self.assertEqual(0,len(c))
        self.assertRaises(KeyError,c.__getitem__,'foo')

    def test_set_then_retrieve_blocks(self):
        c = Cache()
        b = [(0,0,0),(1,2,3)]
        c['foo'] = b
        self.assertEqual(b,c['foo'])

    def test_delitem(self):
        c = Cache()
        c['foo'] = ''
        del c['foo']
        self.assertTrue('foo' not in c)
        self.assertRaises(KeyError,c.__delitem__,'foo')

    def test_persistance(self):
        DBNAME = op.join(self.tmpdir(), 'hstest.db')
        c = Cache(DBNAME)
        c['foo'] = [(1,2,3)]
        del c
        # A second Cache on the same file must see what the first one stored.
        c = Cache(DBNAME)
        self.assertEqual([(1,2,3)],c['foo'])
        del c
        os.remove(DBNAME)

    def test_filter(self):
        c = Cache()
        c['foo'] = ''
        c['bar'] = ''
        c['baz'] = ''
        c.filter(lambda p:p != 'bar') #only 'bar' is removed
        self.assertEqual(2,len(c))
        self.assertTrue('foo' in c)
        self.assertTrue('baz' in c)
        self.assertTrue('bar' not in c)

    def test_clear(self):
        c = Cache()
        c['foo'] = ''
        c['bar'] = ''
        c['baz'] = ''
        c.clear()
        self.assertEqual(0,len(c))
        self.assertTrue('foo' not in c)
        self.assertTrue('baz' not in c)
        self.assertTrue('bar' not in c)

    def test_corrupted_db(self):
        dbname = op.join(self.tmpdir(), 'foo.db')
        # The file is closed even if the write fails.
        with open(dbname, 'w') as fp:
            fp.write('invalid sqlite content')
        c = Cache(dbname) # should not raise a DatabaseError
        c['foo'] = [(1, 2, 3)]
        del c
        c = Cache(dbname)
        self.assertEqual(c['foo'], [(1, 2, 3)])

    def test_by_id(self):
        # it's possible to use the cache by referring to the files by their row_id
        c = Cache()
        b = [(0,0,0),(1,2,3)]
        c['foo'] = b
        foo_id = c.get_id('foo')
        self.assertEqual(c[foo_id], b)
class TCCacheSQLEscape(unittest.TestCase):
    """Keys containing a quote must go through every Cache operation without breaking the SQL."""
    def test_contains(self):
        c = Cache()
        self.assertTrue("foo'bar" not in c)

    def test_getitem(self):
        c = Cache()
        self.assertRaises(KeyError, c.__getitem__, "foo'bar")

    def test_setitem(self):
        c = Cache()
        c["foo'bar"] = []

    def test_delitem(self):
        c = Cache()
        c["foo'bar"] = []
        # A KeyError here would mean that the key set just above could not be found again.
        try:
            del c["foo'bar"]
        except KeyError:
            self.fail()
class TCCacheThreaded(unittest.TestCase):
    """A Cache created in one thread must be usable from another one."""
    def test_access_cache(self):
        # An assertion failing inside the worker thread would never reach the test runner, so
        # the thread only records the error and the check is made once it has been joined.
        errors = []
        def thread_run():
            try:
                c['foo'] = [(1,2,3)]
            except sqlite.ProgrammingError as e:
                errors.append(e)

        c = Cache()
        t = threading.Thread(target=thread_run)
        t.start()
        t.join()
        self.assertEqual([], errors)
        self.assertEqual([(1,2,3)], c['foo'])
# Run the tests of this module when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

136
py/picture/matchbase.py Normal file
View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
"""
Unit Name: hs.picture._match
Created By: Virgil Dupras
Created On: 2007/02/25
Last modified by:$Author: virgil $
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
$Revision: 4388 $
Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
"""
import logging
import multiprocessing
from Queue import Empty
from collections import defaultdict
from hsutil import job
from hs.utils.misc import dedupe
from dupeguru.engine import Match
from block import avgdiff, DifferentBlockCountError, NoBlocksError
from cache import Cache
# Passed by async_compare() to avgdiff() as its minimum iteration count.
# NOTE(review): presumably the number of blocks compared before the limit can cut the
# comparison short -- confirm against the avgdiff implementation.
MIN_ITERATIONS = 3
def get_match(first,second,percentage):
    """Return a Match for 'first' and 'second', a negative 'percentage' being replaced by 0."""
    clamped = max(percentage, 0)
    return Match(first, second, clamped)
class MatchFactory(object):
    """Base class for picture matchers.

    getmatches() makes sure the blocks of every picture are in the cache, then hands the
    pictures over to _do_getmatches(), which subclasses must implement.
    """
    # Mapping of picture paths to blocks. It is None here and has to be set before
    # prepare_files() is called.
    cached_blocks = None
    # Passed to picture.get_blocks() when the blocks of a picture are not cached yet.
    block_count_per_side = 15
    # Read by AsyncMatchFactory as the minimum percentage for a pair to be a match.
    threshold = 75
    # Read by AsyncMatchFactory: when false, only pictures having the same dimensions are
    # compared together.
    match_scaled = False
    def _do_getmatches(self, files, j):
        """Return the matches among 'files'. Must be overridden."""
        raise NotImplementedError()

    def getmatches(self, files, j=job.nulljob):
        """Prepare 'files', then return what _do_getmatches() returns for the prepared ones.

        files: the pictures to match.
        j: the job used to report progress.
        """
        # The MemoryError handlers in there use logging without first caring about whether or not
        # there is enough memory left to carry on the operation because it is assumed that the
        # MemoryError happens when trying to read an image file, which is freed from memory by the
        # time that MemoryError is raised.
        j = j.start_subjob([2, 8])
        logging.info('Preparing %d files' % len(files))
        prepared = self.prepare_files(files, j)
        logging.info('Finished preparing %d files' % len(prepared))
        return self._do_getmatches(prepared, j)

    def prepare_files(self, files, j=job.nulljob):
        """Cache the blocks of every picture in 'files' and return the usable pictures.

        A picture raising IOError, or MemoryError while being at least 10 MB in size, is logged
        and left out. A MemoryError on a smaller picture stops the whole preparation: what has
        been prepared so far is returned.
        """
        prepared = [] # only files for which there was no error getting blocks
        try:
            for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
                # NOTE(review): the value is discarded; this looks like a way to have the
                # dimensions read (and kept by the picture) at this point -- confirm.
                picture.dimensions
                picture.unicode_path = unicode(picture.path)
                try:
                    if picture.unicode_path not in self.cached_blocks:
                        blocks = picture.get_blocks(self.block_count_per_side)
                        self.cached_blocks[picture.unicode_path] = blocks
                    # Pictures that were already in the cache are prepared as well.
                    prepared.append(picture)
                except IOError as e:
                    logging.warning(unicode(e))
                except MemoryError:
                    logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
                    if picture.size < 10 * 1024 * 1024: # We're really running out of memory
                        raise
        except MemoryError:
            logging.warning('Ran out of memory while preparing files')
        return prepared
def async_compare(ref_id, other_ids, dbname, threshold):
    """Compare a cached picture with others and return the pairs that match well enough.

    It is submitted to a multiprocessing pool by AsyncMatchFactory, which is why it opens its
    own, non-threaded, connection to the cache.

    ref_id: rowid of the reference picture in the cache.
    other_ids: rowids of the pictures to compare the reference with.
    dbname: name of the cache database.
    threshold: minimum percentage for a pair to be returned.

    Returns a list of (ref_id, other_id, percentage) tuples. Raises KeyError if 'ref_id' is not
    in the cache.
    """
    cache = Cache(dbname, threaded=False)
    try:
        limit = 100 - threshold
        ref_blocks = cache[ref_id]
        results = []
        for other_id, other_blocks in cache.get_multiple(other_ids):
            try:
                diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
                percentage = 100 - diff
            except (DifferentBlockCountError, NoBlocksError):
                # Blocks that can't be compared never match.
                percentage = 0
            if percentage >= threshold:
                results.append((ref_id, other_id, percentage))
        return results
    finally:
        # Close the connection even when a lookup or a comparison raises.
        cache.con.close()
class AsyncMatchFactory(MatchFactory):
    """MatchFactory comparing the pictures in a pool of worker processes."""
    def _do_getmatches(self, pictures, j):
        """Return the matches found among 'pictures'.

        pictures: the prepared pictures. Those that have no entry in the cache are removed from
                  this list, in place.
        j: the job used to report progress.

        Every picture is compared, through async_compare(), with the pictures that come after
        it: all of them if match_scaled is true, those with the same dimensions otherwise.
        """
        j = j.start_subjob([1, 8, 1], 'Preparing for matching')
        cache = self.cached_blocks
        id2picture = {}
        dimensions2pictures = defaultdict(set)
        for picture in pictures[:]:
            try:
                picture.cache_id = cache.get_id(picture.unicode_path)
            except ValueError:
                # Not in the cache: the picture takes no part in the matching at all.
                pictures.remove(picture)
                continue
            id2picture[picture.cache_id] = picture
            if not self.match_scaled:
                dimensions2pictures[picture.dimensions].add(picture)
        pool = multiprocessing.Pool()
        try:
            async_results = []
            pictures_copy = set(pictures)
            for ref in j.iter_with_progress(pictures):
                others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
                # Taking ref out of the set makes each pair be compared only once. discard()
                # tolerates a picture that appears more than once in 'pictures'.
                others.discard(ref)
                if others:
                    cache_ids = [f.cache_id for f in others]
                    args = (ref.cache_id, cache_ids, self.cached_blocks.dbname, self.threshold)
                    async_results.append(pool.apply_async(async_compare, args))
            matches = []
            for result in j.iter_with_progress(async_results, 'Matched %d/%d pictures'):
                matches.extend(result.get())
        finally:
            # All the results have been fetched (or an error makes them useless): don't leave
            # the worker processes behind.
            pool.terminate()
            pool.join()
        result = []
        for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
            ref = id2picture[ref_id]
            other = id2picture[other_id]
            # 100% is kept for files that have the same md5.
            if percentage == 100 and ref.md5 != other.md5:
                percentage = 99
            if percentage >= self.threshold:
                result.append(get_match(ref, other, percentage))
        return result
# Needed for multiprocessing to work in a frozen Windows executable; has no effect otherwise.
multiprocessing.freeze_support()