mirror of
https://github.com/arsenetar/dupeguru.git
synced 2026-01-23 07:01:39 +00:00
Initial commit.
--HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402
This commit is contained in:
0
py/picture/__init__.py
Normal file
0
py/picture/__init__.py
Normal file
124
py/picture/block.py
Normal file
124
py/picture/block.py
Normal file
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: hs.picture.block
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/09/01
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-26 18:12:39 +0200 (Tue, 26 May 2009) $
|
||||
$Revision: 4365 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
from _block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2
|
||||
|
||||
# Converted to Cython
|
||||
# def getblock(image):
|
||||
# """Returns a 3 sized tuple containing the mean color of 'image'.
|
||||
#
|
||||
# image: a PIL image or crop.
|
||||
# """
|
||||
# if image.size[0]:
|
||||
# pixel_count = image.size[0] * image.size[1]
|
||||
# red = green = blue = 0
|
||||
# for r,g,b in image.getdata():
|
||||
# red += r
|
||||
# green += g
|
||||
# blue += b
|
||||
# return (red // pixel_count, green // pixel_count, blue // pixel_count)
|
||||
# else:
|
||||
# return (0,0,0)
|
||||
|
||||
# This is not used anymore
|
||||
# def getblocks(image,blocksize):
|
||||
# """Returns a list of blocks (3 sized tuples).
|
||||
#
|
||||
# image: A PIL image to base the blocks on.
|
||||
# blocksize: The size of the blocks to be created. This is a single integer, defining
|
||||
# both width and height (blocks are square).
|
||||
# """
|
||||
# if min(image.size) < blocksize:
|
||||
# return ()
|
||||
# result = []
|
||||
# for i in xrange(image.size[1] // blocksize):
|
||||
# for j in xrange(image.size[0] // blocksize):
|
||||
# box = (blocksize * j, blocksize * i, blocksize * (j + 1), blocksize * (i + 1))
|
||||
# crop = image.crop(box)
|
||||
# result.append(getblock(crop))
|
||||
# return result
|
||||
|
||||
# Converted to Cython
|
||||
# def getblocks2(image,block_count_per_side):
|
||||
# """Returns a list of blocks (3 sized tuples).
|
||||
#
|
||||
# image: A PIL image to base the blocks on.
|
||||
# block_count_per_side: This integer determines the number of blocks the function will return.
|
||||
# If it is 10, for example, 100 blocks will be returned (10 width, 10 height). The blocks will not
|
||||
# necessarily cover square areas. The area covered by each block will be proportional to the image
|
||||
# itself.
|
||||
# """
|
||||
# if not image.size[0]:
|
||||
# return []
|
||||
# width,height = image.size
|
||||
# block_width = max(width // block_count_per_side,1)
|
||||
# block_height = max(height // block_count_per_side,1)
|
||||
# result = []
|
||||
# for ih in range(block_count_per_side):
|
||||
# top = min(ih * block_height, height - block_height)
|
||||
# bottom = top + block_height
|
||||
# for iw in range(block_count_per_side):
|
||||
# left = min(iw * block_width, width - block_width)
|
||||
# right = left + block_width
|
||||
# box = (left,top,right,bottom)
|
||||
# crop = image.crop(box)
|
||||
# result.append(getblock(crop))
|
||||
# return result
|
||||
|
||||
# Converted to Cython
|
||||
# def diff(first, second):
|
||||
# """Returns the difference between the first block and the second.
|
||||
#
|
||||
# It returns an absolute sum of the 3 differences (RGB).
|
||||
# """
|
||||
# r1, g1, b1 = first
|
||||
# r2, g2, b2 = second
|
||||
# return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)
|
||||
|
||||
# Converted to Cython
|
||||
# def avgdiff(first, second, limit=768, min_iterations=1):
|
||||
# """Returns the average diff between first blocks and seconds.
|
||||
#
|
||||
# If the result surpasses limit, limit + 1 is returned, except if less than min_iterations
|
||||
# iterations have been made in the blocks.
|
||||
# """
|
||||
# if len(first) != len(second):
|
||||
# raise DifferentBlockCountError
|
||||
# if not first:
|
||||
# raise NoBlocksError
|
||||
# count = len(first)
|
||||
# sum = 0
|
||||
# zipped = izip(xrange(1, count + 1), first, second)
|
||||
# for i, first, second in zipped:
|
||||
# sum += diff(first, second)
|
||||
# if sum > limit * i and i >= min_iterations:
|
||||
# return limit + 1
|
||||
# result = sum // count
|
||||
# if (not result) and sum:
|
||||
# result = 1
|
||||
# return result
|
||||
|
||||
# This is not used anymore
|
||||
# def maxdiff(first,second,limit=768):
|
||||
# """Returns the max diff between first blocks and seconds.
|
||||
#
|
||||
# If the result surpasses limit, the first max being over limit is returned.
|
||||
# """
|
||||
# if len(first) != len(second):
|
||||
# raise DifferentBlockCountError
|
||||
# if not first:
|
||||
# raise NoBlocksError
|
||||
# result = 0
|
||||
# zipped = zip(first,second)
|
||||
# for first,second in zipped:
|
||||
# result = max(result,diff(first,second))
|
||||
# if result > limit:
|
||||
# return result
|
||||
# return result
|
||||
313
py/picture/block_test.py
Normal file
313
py/picture/block_test.py
Normal file
@@ -0,0 +1,313 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: tests.picture.block
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/09/01
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
# The commented-out tests are tests for functions that have been converted to pure C for speed
|
||||
import unittest
|
||||
|
||||
from .block import *
|
||||
|
||||
def my_avgdiff(first, second, limit=768, min_iter=3):
    """Forward to ``avgdiff`` with default ``limit`` and ``min_iter`` values.

    Exists so that the tests do not have to spell out every argument on
    every call.
    """
    outcome = avgdiff(first, second, limit, min_iter)
    return outcome
|
||||
|
||||
# Reference colours used by the tests, as (red, green, blue) tuples.
BLACK = (0, 0, 0)
RED = (255, 0, 0)
GREEN = (0, 255, 0)
BLUE = (0, 0, 255)
|
||||
|
||||
class FakeImage(object):
    """Minimal in-memory stand-in for an image, used by the tests.

    ``size`` is indexed as ``(width, height)`` and ``data`` is a flat
    sequence of pixels laid out row after row (``crop`` addresses pixel
    ``(row, col)`` as ``data[row * size[0] + col]``).
    """

    def __init__(self, size, data):
        self.data = data
        self.size = size

    def getdata(self):
        """Return the flat pixel sequence this image was built with."""
        return self.data

    def crop(self, box):
        """Return a new ``FakeImage`` holding the pixels inside ``box``.

        ``box`` is indexed as ``(left, top, right, bottom)``; the right and
        bottom edges are exclusive.
        """
        left, top, right, bottom = box[0], box[1], box[2], box[3]
        cropped = [
            self.data[row * self.size[0] + col]
            for row in range(top, bottom)
            for col in range(left, right)
        ]
        return FakeImage((right - left, bottom - top), cropped)
|
||||
|
||||
def empty():
    """Return a zero-sized image with no pixel data."""
    no_pixels = []
    return FakeImage((0, 0), no_pixels)
|
||||
|
||||
def single_pixel():
    """Return a 1x1 image whose only pixel is pure red."""
    red_pixel = (0xff, 0, 0)
    return FakeImage((1, 1), [red_pixel])
|
||||
|
||||
def four_pixels():
    """Return a 2x2 image made of four different colours."""
    top_row = [RED, (0, 0x80, 0xff)]
    bottom_row = [(0x80, 0, 0), (0, 0x40, 0x80)]
    return FakeImage((2, 2), top_row + bottom_row)
|
||||
|
||||
class TCgetblock(unittest.TestCase):
    """Mean-colour checks on ``getblocks2`` when a single block is requested."""

    def test_single_pixel(self):
        # A lone red pixel must average to red.
        [block] = getblocks2(single_pixel(), 1)
        self.assertEqual(RED, block)

    def test_no_pixel(self):
        # An image without pixels yields no block at all.
        blocks = getblocks2(empty(), 1)
        self.assertEqual([], blocks)

    def test_four_pixels(self):
        # The block is the per-channel floor average of the four pixels.
        [block] = getblocks2(four_pixels(), 1)
        expected = (
            (0xff + 0x80) // 4,
            (0x80 + 0x40) // 4,
            (0xff + 0x80) // 4,
        )
        self.assertEqual(expected, block)
|
||||
|
||||
|
||||
# class TCdiff(unittest.TestCase):
|
||||
# def test_diff(self):
|
||||
# b1 = (10, 20, 30)
|
||||
# b2 = (1, 2, 3)
|
||||
# self.assertEqual(9 + 18 + 27,diff(b1,b2))
|
||||
#
|
||||
# def test_diff_negative(self):
|
||||
# b1 = (10, 20, 30)
|
||||
# b2 = (1, 2, 3)
|
||||
# self.assertEqual(9 + 18 + 27,diff(b2,b1))
|
||||
#
|
||||
# def test_diff_mixed_positive_and_negative(self):
|
||||
# b1 = (1, 5, 10)
|
||||
# b2 = (10, 1, 15)
|
||||
# self.assertEqual(9 + 4 + 5,diff(b1,b2))
|
||||
#
|
||||
|
||||
# class TCgetblocks(unittest.TestCase):
|
||||
# def test_empty_image(self):
|
||||
# im = empty()
|
||||
# blocks = getblocks(im,1)
|
||||
# self.assertEqual(0,len(blocks))
|
||||
#
|
||||
# def test_one_block_image(self):
|
||||
# im = four_pixels()
|
||||
# blocks = getblocks2(im, 1)
|
||||
# self.assertEqual(1,len(blocks))
|
||||
# block = blocks[0]
|
||||
# meanred = (0xff + 0x80) // 4
|
||||
# meangreen = (0x80 + 0x40) // 4
|
||||
# meanblue = (0xff + 0x80) // 4
|
||||
# self.assertEqual((meanred,meangreen,meanblue),block)
|
||||
#
|
||||
# def test_not_enough_height_to_fit_a_block(self):
|
||||
# im = FakeImage((2,1), [BLACK, BLACK])
|
||||
# blocks = getblocks(im,2)
|
||||
# self.assertEqual(0,len(blocks))
|
||||
#
|
||||
# def xtest_dont_include_leftovers(self):
|
||||
# # this test is disabled because getblocks is not used and getblock in cdeffed
|
||||
# pixels = [
|
||||
# RED,(0,0x80,0xff),BLACK,
|
||||
# (0x80,0,0),(0,0x40,0x80),BLACK,
|
||||
# BLACK,BLACK,BLACK
|
||||
# ]
|
||||
# im = FakeImage((3,3), pixels)
|
||||
# blocks = getblocks(im,2)
|
||||
# block = blocks[0]
|
||||
# #Because the block is smaller than the image, only blocksize must be considered.
|
||||
# meanred = (0xff + 0x80) // 4
|
||||
# meangreen = (0x80 + 0x40) // 4
|
||||
# meanblue = (0xff + 0x80) // 4
|
||||
# self.assertEqual((meanred,meangreen,meanblue),block)
|
||||
#
|
||||
# def xtest_two_blocks(self):
|
||||
# # this test is disabled because getblocks is not used and getblock in cdeffed
|
||||
# pixels = [BLACK for i in xrange(4 * 2)]
|
||||
# pixels[0] = RED
|
||||
# pixels[1] = (0,0x80,0xff)
|
||||
# pixels[4] = (0x80,0,0)
|
||||
# pixels[5] = (0,0x40,0x80)
|
||||
# im = FakeImage((4, 2), pixels)
|
||||
# blocks = getblocks(im,2)
|
||||
# self.assertEqual(2,len(blocks))
|
||||
# block = blocks[0]
|
||||
# #Because the block is smaller than the image, only blocksize must be considered.
|
||||
# meanred = (0xff + 0x80) // 4
|
||||
# meangreen = (0x80 + 0x40) // 4
|
||||
# meanblue = (0xff + 0x80) // 4
|
||||
# self.assertEqual((meanred,meangreen,meanblue),block)
|
||||
# self.assertEqual(BLACK,blocks[1])
|
||||
#
|
||||
# def test_four_blocks(self):
|
||||
# pixels = [BLACK for i in xrange(4 * 4)]
|
||||
# pixels[0] = RED
|
||||
# pixels[1] = (0,0x80,0xff)
|
||||
# pixels[4] = (0x80,0,0)
|
||||
# pixels[5] = (0,0x40,0x80)
|
||||
# im = FakeImage((4, 4), pixels)
|
||||
# blocks = getblocks2(im, 2)
|
||||
# self.assertEqual(4,len(blocks))
|
||||
# block = blocks[0]
|
||||
# #Because the block is smaller than the image, only blocksize must be considered.
|
||||
# meanred = (0xff + 0x80) // 4
|
||||
# meangreen = (0x80 + 0x40) // 4
|
||||
# meanblue = (0xff + 0x80) // 4
|
||||
# self.assertEqual((meanred,meangreen,meanblue),block)
|
||||
# self.assertEqual(BLACK,blocks[1])
|
||||
# self.assertEqual(BLACK,blocks[2])
|
||||
# self.assertEqual(BLACK,blocks[3])
|
||||
#
|
||||
|
||||
class TCgetblocks2(unittest.TestCase):
    """Checks the number of blocks ``getblocks2`` returns and their colours."""

    def test_empty_image(self):
        self.assertEqual(0, len(getblocks2(empty(), 1)))

    def test_one_block_image(self):
        blocks = getblocks2(four_pixels(), 1)
        self.assertEqual(1, len(blocks))
        expected = (
            (0xff + 0x80) // 4,
            (0x80 + 0x40) // 4,
            (0xff + 0x80) // 4,
        )
        self.assertEqual(expected, blocks[0])

    def test_four_blocks_all_black(self):
        image = FakeImage((2, 2), [BLACK, BLACK, BLACK, BLACK])
        blocks = getblocks2(image, 2)
        self.assertEqual(4, len(blocks))
        for block in blocks:
            self.assertEqual(BLACK, block)

    def test_two_pixels_image_horizontal(self):
        # A 2x1 image still gives 2x2 blocks; both block rows show RED, BLUE.
        image = FakeImage((2, 1), [RED, BLUE])
        blocks = getblocks2(image, 2)
        self.assertEqual(4, len(blocks))
        for index, expected in enumerate([RED, BLUE, RED, BLUE]):
            self.assertEqual(expected, blocks[index])

    def test_two_pixels_image_vertical(self):
        # A 1x2 image still gives 2x2 blocks; each pixel fills one block row.
        image = FakeImage((1, 2), [RED, BLUE])
        blocks = getblocks2(image, 2)
        self.assertEqual(4, len(blocks))
        for index, expected in enumerate([RED, RED, BLUE, BLUE]):
            self.assertEqual(expected, blocks[index])
|
||||
|
||||
|
||||
class TCavgdiff(unittest.TestCase):
    """Tests for ``avgdiff``, called through the ``my_avgdiff`` wrapper."""

    def test_empty(self):
        self.assertRaises(NoBlocksError, my_avgdiff, [], [])

    def test_two_blocks(self):
        b1 = (5, 10, 15)
        b2 = (255, 250, 245)
        b3 = (0, 0, 0)
        b4 = (255, 0, 255)
        blocks1 = [b1, b2]
        blocks2 = [b3, b4]
        # Per-block diff is the sum of the absolute channel differences.
        expected1 = 5 + 10 + 15
        expected2 = 0 + 250 + 10
        expected = (expected1 + expected2) // 2
        self.assertEqual(expected, my_avgdiff(blocks1, blocks2))

    def test_blocks_not_the_same_size(self):
        b = (0, 0, 0)
        self.assertRaises(DifferentBlockCountError, my_avgdiff, [b, b], [b])

    def test_first_arg_is_empty_but_not_second(self):
        # Don't return 0 (as when the 2 lists are empty), raise!
        b = (0, 0, 0)
        self.assertRaises(DifferentBlockCountError, my_avgdiff, [], [b])

    def test_limit(self):
        ref = (0, 0, 0)
        b1 = (10, 10, 10)  # running avg 30
        b2 = (20, 20, 20)  # running avg 45
        b3 = (30, 30, 30)  # running avg 60
        blocks1 = [ref, ref, ref]
        blocks2 = [b1, b2, b3]
        # Going over the limit of 44 is reported as limit + 1.
        self.assertEqual(45, my_avgdiff(blocks1, blocks2, 44))

    def test_min_iterations(self):
        ref = (0, 0, 0)
        b1 = (10, 10, 10)  # running avg 30
        b2 = (20, 20, 20)  # running avg 45
        b3 = (10, 10, 10)  # running avg 40
        blocks1 = [ref, ref, ref]
        blocks2 = [b1, b2, b3]
        # The limit is exceeded after two blocks, but with min_iterations=3
        # the third block is still counted and brings the average back down.
        self.assertEqual(40, my_avgdiff(blocks1, blocks2, 45 - 1, 3))

    # Bah, I don't know why this test fails, but I don't think it matters very much
    # NOTE(review): my_avgdiff defaults min_iter to 3, so with only two blocks
    # the limit check presumably never triggers and the floor average
    # 21 // 2 == 10 is returned instead of 11 -- confirm against _block.
    # def test_just_over_the_limit(self):
    #     #A score just over the limit might return exactly the limit due to truncating. We should
    #     #ceil() the result in this case.
    #     ref = (0,0,0)
    #     b1 = (10,0,0)
    #     b2 = (11,0,0)
    #     blocks1 = [ref,ref]
    #     blocks2 = [b1,b2]
    #     self.assertEqual(11,my_avgdiff(blocks1,blocks2,10))
    #
    def test_return_at_least_1_at_the_slightest_difference(self):
        ref = (0, 0, 0)
        b1 = (1, 0, 0)
        blocks1 = [ref] * 250
        blocks2 = [ref] * 250
        blocks2[0] = b1
        # 1 // 250 would floor to 0; any difference must be reported as >= 1.
        self.assertEqual(1, my_avgdiff(blocks1, blocks2))

    def test_return_0_if_there_is_no_difference(self):
        ref = (0, 0, 0)
        blocks1 = [ref, ref]
        blocks2 = [ref, ref]
        self.assertEqual(0, my_avgdiff(blocks1, blocks2))
|
||||
|
||||
|
||||
# class TCmaxdiff(unittest.TestCase):
|
||||
# def test_empty(self):
|
||||
# self.assertRaises(NoBlocksError,maxdiff,[],[])
|
||||
#
|
||||
# def test_two_blocks(self):
|
||||
# b1 = (5,10,15)
|
||||
# b2 = (255,250,245)
|
||||
# b3 = (0,0,0)
|
||||
# b4 = (255,0,255)
|
||||
# blocks1 = [b1,b2]
|
||||
# blocks2 = [b3,b4]
|
||||
# expected1 = 5 + 10 + 15
|
||||
# expected2 = 0 + 250 + 10
|
||||
# expected = max(expected1,expected2)
|
||||
# self.assertEqual(expected,maxdiff(blocks1,blocks2))
|
||||
#
|
||||
# def test_blocks_not_the_same_size(self):
|
||||
# b = (0,0,0)
|
||||
# self.assertRaises(DifferentBlockCountError,maxdiff,[b,b],[b])
|
||||
#
|
||||
# def test_first_arg_is_empty_but_not_second(self):
|
||||
# #Don't return 0 (as when the 2 lists are empty), raise!
|
||||
# b = (0,0,0)
|
||||
# self.assertRaises(DifferentBlockCountError,maxdiff,[],[b])
|
||||
#
|
||||
# def test_limit(self):
|
||||
# b1 = (5,10,15)
|
||||
# b2 = (255,250,245)
|
||||
# b3 = (0,0,0)
|
||||
# b4 = (255,0,255)
|
||||
# blocks1 = [b1,b2]
|
||||
# blocks2 = [b3,b4]
|
||||
# expected1 = 5 + 10 + 15
|
||||
# expected2 = 0 + 250 + 10
|
||||
# self.assertEqual(expected1,maxdiff(blocks1,blocks2,expected1 - 1))
|
||||
#
|
||||
|
||||
# Allow running this test module directly from the command line.
if __name__ == '__main__':
    unittest.main()
|
||||
134
py/picture/cache.py
Normal file
134
py/picture/cache.py
Normal file
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: hs.picture.cache
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/09/14
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 16:33:32 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4392 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
import sqlite3 as sqlite
|
||||
|
||||
import hsutil.sqlite
|
||||
|
||||
from _cache import string_to_colors
|
||||
|
||||
def colors_to_string(colors):
    """Encode an iterable of (r, g, b) tuples as a single hex string.

    Every channel is written as lowercase hex, zero-padded to two digits:

    [(0,100,255)] --> 0064ff
    [(1,2,3),(4,5,6)] --> 010203040506
    """
    chunks = []
    for red, green, blue in colors:
        chunks.append('%02x%02x%02x' % (red, green, blue))
    return ''.join(chunks)
|
||||
|
||||
# This function is an important bottleneck of dupeGuru PE. It has been converted to Cython.
|
||||
# def string_to_colors(s):
|
||||
# """Transform the string 's' in a list of 3 sized tuples.
|
||||
# """
|
||||
# result = []
|
||||
# for i in xrange(0, len(s), 6):
|
||||
# number = int(s[i:i+6], 16)
|
||||
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
|
||||
# return result
|
||||
|
||||
class Cache(object):
    """A SQLite-backed cache mapping picture paths to their colour blocks.

    Blocks are stored as hex strings (encoded with ``colors_to_string``) in a
    single ``pictures(path, blocks)`` table and decoded with
    ``string_to_colors`` when read back.
    """

    def __init__(self, db=':memory:', threaded=True):
        """Open the cache stored in ``db``, creating its tables if needed.

        If ``db`` cannot be read as a SQLite database, a warning is logged,
        the file is deleted and a fresh cache is created in its place.

        db: path of the SQLite database (``':memory:'`` by default).
        threaded: when true the connection is a ``hsutil.sqlite.ThreadedConn``,
            otherwise a plain ``sqlite3`` connection in autocommit mode.
        """
        self.dbname = db
        self.con = self._connect(db, threaded)
        try:
            # Cheap probe: returns no row, fails if the table is missing.
            self.con.execute("select * from pictures where 1=2")
        except sqlite.OperationalError:  # new db
            self._create_tables()
        except sqlite.DatabaseError as e:  # corrupted db
            logging.warning('Could not create picture cache because of an error: %s', str(e))
            self.con.close()
            os.remove(db)
            self.con = self._connect(db, threaded)
            self._create_tables()

    def _connect(self, db, threaded):
        """Return a new connection to ``db`` of the requested kind."""
        if threaded:
            # NOTE(review): the meaning of the second argument is defined by
            # hsutil.sqlite.ThreadedConn (presumably autocommit) -- confirm.
            return hsutil.sqlite.ThreadedConn(db, True)
        return sqlite.connect(db, isolation_level=None)

    def _create_tables(self):
        """Create the ``pictures`` table and its index on ``path``."""
        self.con.execute("create table pictures(path TEXT, blocks TEXT)")
        self.con.execute("create index idx_path on pictures (path)")

    def __contains__(self, key):
        """Return whether a picture with path ``key`` is cached."""
        sql = "select count(*) from pictures where path = ?"
        result = self.con.execute(sql, [key]).fetchall()
        return result[0][0] > 0

    def __delitem__(self, key):
        """Remove the entry for path ``key``; raise ``KeyError`` if absent."""
        if key not in self:
            raise KeyError(key)
        sql = "delete from pictures where path = ?"
        self.con.execute(sql, [key])

    # Optimized
    def __getitem__(self, key):
        """Return the decoded blocks for ``key``.

        An ``int`` key is looked up by rowid, any other key by path.
        Raises ``KeyError`` when no row matches.
        """
        if isinstance(key, int):
            sql = "select blocks from pictures where rowid = ?"
        else:
            sql = "select blocks from pictures where path = ?"
        result = self.con.execute(sql, [key]).fetchone()
        if result:
            return string_to_colors(result[0])
        raise KeyError(key)

    def __iter__(self):
        """Iterate over the cached paths."""
        sql = "select path from pictures"
        result = self.con.execute(sql)
        return (row[0] for row in result)

    def __len__(self):
        """Return the number of cached pictures."""
        sql = "select count(*) from pictures"
        result = self.con.execute(sql).fetchall()
        return result[0][0]

    def __setitem__(self, key, value):
        """Store the blocks ``value`` under path ``key``, replacing any old entry.

        Database errors are logged and swallowed: failing to cache an entry
        is not treated as fatal.
        """
        value = colors_to_string(value)
        if key in self:
            sql = "update pictures set blocks = ? where path = ?"
        else:
            sql = "insert into pictures(blocks,path) values(?,?)"
        try:
            self.con.execute(sql, [value, key])
        except sqlite.OperationalError:
            logging.warning('Picture cache could not set %r for key %r', value, key)
        except sqlite.DatabaseError as e:
            logging.warning('DatabaseError while setting %r for key %r: %s', value, key, str(e))

    def clear(self):
        """Remove every entry from the cache."""
        sql = "delete from pictures"
        self.con.execute(sql)

    def filter(self, func):
        """Delete every entry whose path makes ``func(path)`` false."""
        # Collect first so that rows are not deleted while being iterated.
        to_delete = [key for key in self if not func(key)]
        for key in to_delete:
            del self[key]

    def get_id(self, path):
        """Return the rowid of ``path``; raise ``ValueError`` if it is not cached."""
        sql = "select rowid from pictures where path = ?"
        result = self.con.execute(sql, [path]).fetchone()
        if result:
            return result[0]
        raise ValueError(path)

    def get_multiple(self, rowids):
        """Return an iterator of ``(rowid, blocks)`` for the given rowids.

        The ids are written into the statement itself rather than bound as
        parameters (presumably because the list can be long). Each one is
        therefore forced through ``int()`` so that nothing but a number can
        reach the SQL text; a non-numeric id raises ``ValueError``.
        """
        id_list = ','.join(str(int(rowid)) for rowid in rowids)
        sql = "select rowid, blocks from pictures where rowid in (%s)" % id_list
        cur = self.con.execute(sql)
        return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
|
||||
|
||||
159
py/picture/cache_test.py
Normal file
159
py/picture/cache_test.py
Normal file
@@ -0,0 +1,159 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: tests.picture.cache
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2006/09/14
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 15:22:39 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4385 $
|
||||
Copyright 2004-2006 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import unittest
|
||||
from StringIO import StringIO
|
||||
import os.path as op
|
||||
import os
|
||||
import threading
|
||||
|
||||
from hsutil.testcase import TestCase
|
||||
from .cache import *
|
||||
|
||||
class TCcolors_to_string(unittest.TestCase):
    """Checks the hex encoding produced by ``colors_to_string``."""

    def test_no_color(self):
        encoded = colors_to_string([])
        self.assertEqual('', encoded)

    def test_single_color(self):
        cases = [
            ('000000', (0, 0, 0)),
            ('010101', (1, 1, 1)),
            ('0a141e', (10, 20, 30)),
        ]
        for expected, color in cases:
            self.assertEqual(expected, colors_to_string([color]))

    def test_two_colors(self):
        encoded = colors_to_string([(0, 1, 2), (3, 4, 5)])
        self.assertEqual('000102030405', encoded)
|
||||
|
||||
|
||||
class TCstring_to_colors(unittest.TestCase):
    """Checks the decoding of hex strings done by ``string_to_colors``."""

    def test_empty(self):
        decoded = string_to_colors('')
        self.assertEqual([], decoded)

    def test_single_color(self):
        cases = [
            ((0, 0, 0), '000000'),
            ((2, 3, 4), '020304'),
            ((10, 20, 30), '0a141e'),
        ]
        for color, encoded in cases:
            self.assertEqual([color], string_to_colors(encoded))

    def test_two_colors(self):
        decoded = string_to_colors('0a141e28323c')
        self.assertEqual([(10, 20, 30), (40, 50, 60)], decoded)

    def test_incomplete_color(self):
        # don't return anything if it's not a complete color
        decoded = string_to_colors('102')
        self.assertEqual([], decoded)
|
||||
|
||||
|
||||
class TCCache(TestCase):
    """Behavioural tests for ``Cache``: storage, deletion, persistence, recovery."""

    def test_empty(self):
        c = Cache()
        self.assertEqual(0, len(c))
        self.assertRaises(KeyError, c.__getitem__, 'foo')

    def test_set_then_retrieve_blocks(self):
        c = Cache()
        b = [(0, 0, 0), (1, 2, 3)]
        c['foo'] = b
        self.assertEqual(b, c['foo'])

    def test_delitem(self):
        c = Cache()
        c['foo'] = ''
        del c['foo']
        self.assertTrue('foo' not in c)
        # Deleting a key that is no longer there must raise.
        self.assertRaises(KeyError, c.__delitem__, 'foo')

    def test_persistance(self):
        DBNAME = op.join(self.tmpdir(), 'hstest.db')
        c = Cache(DBNAME)
        c['foo'] = [(1, 2, 3)]
        del c
        # A second Cache on the same file must see what the first one stored.
        c = Cache(DBNAME)
        self.assertEqual([(1, 2, 3)], c['foo'])
        del c
        os.remove(DBNAME)

    def test_filter(self):
        c = Cache()
        c['foo'] = ''
        c['bar'] = ''
        c['baz'] = ''
        c.filter(lambda p: p != 'bar')  # only 'bar' is removed
        self.assertEqual(2, len(c))
        self.assertTrue('foo' in c)
        self.assertTrue('baz' in c)
        self.assertTrue('bar' not in c)

    def test_clear(self):
        c = Cache()
        c['foo'] = ''
        c['bar'] = ''
        c['baz'] = ''
        c.clear()
        self.assertEqual(0, len(c))
        self.assertTrue('foo' not in c)
        self.assertTrue('baz' not in c)
        self.assertTrue('bar' not in c)

    def test_corrupted_db(self):
        dbname = op.join(self.tmpdir(), 'foo.db')
        # ``with`` closes the file even if the write fails.
        with open(dbname, 'w') as fp:
            fp.write('invalid sqlite content')
        c = Cache(dbname)  # should not raise a DatabaseError
        c['foo'] = [(1, 2, 3)]
        del c
        c = Cache(dbname)
        self.assertEqual(c['foo'], [(1, 2, 3)])

    def test_by_id(self):
        # it's possible to use the cache by referring to the files by their row_id
        c = Cache()
        b = [(0, 0, 0), (1, 2, 3)]
        c['foo'] = b
        foo_id = c.get_id('foo')
        self.assertEqual(c[foo_id], b)
|
||||
|
||||
|
||||
class TCCacheSQLEscape(unittest.TestCase):
    """Keys containing a single quote must not break the cache's SQL."""

    def test_contains(self):
        c = Cache()
        self.assertTrue("foo'bar" not in c)

    def test_getitem(self):
        c = Cache()
        self.assertRaises(KeyError, c.__getitem__, "foo'bar")

    def test_setitem(self):
        c = Cache()
        # Passes as long as the assignment does not raise.
        c["foo'bar"] = []

    def test_delitem(self):
        c = Cache()
        c["foo'bar"] = []
        try:
            del c["foo'bar"]
        except KeyError:
            # The key was just stored, so it has to be found again.
            self.fail()
|
||||
|
||||
|
||||
class TCCacheThreaded(unittest.TestCase):
    """The default (threaded) cache must be usable from another thread."""

    def test_access_cache(self):
        # Errors are collected here and checked in the main thread: an
        # assertion raised inside the worker would not reach the test runner.
        errors = []

        def thread_run():
            try:
                c['foo'] = [(1, 2, 3)]
            except sqlite.ProgrammingError as e:
                errors.append(e)

        c = Cache()
        t = threading.Thread(target=thread_run)
        t.start()
        t.join()
        self.assertEqual([], errors)
        self.assertEqual([(1, 2, 3)], c['foo'])
|
||||
|
||||
|
||||
# Allow running this test module directly from the command line.
if __name__ == '__main__':
    unittest.main()
|
||||
136
py/picture/matchbase.py
Normal file
136
py/picture/matchbase.py
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Unit Name: hs.picture._match
|
||||
Created By: Virgil Dupras
|
||||
Created On: 2007/02/25
|
||||
Last modified by:$Author: virgil $
|
||||
Last modified on:$Date: 2009-05-28 16:02:48 +0200 (Thu, 28 May 2009) $
|
||||
$Revision: 4388 $
|
||||
Copyright 2007 Hardcoded Software (http://www.hardcoded.net)
|
||||
"""
|
||||
import logging
|
||||
import multiprocessing
|
||||
from Queue import Empty
|
||||
from collections import defaultdict
|
||||
|
||||
from hsutil import job
|
||||
from hs.utils.misc import dedupe
|
||||
|
||||
from dupeguru.engine import Match
|
||||
from block import avgdiff, DifferentBlockCountError, NoBlocksError
|
||||
from cache import Cache
|
||||
|
||||
MIN_ITERATIONS = 3
|
||||
|
||||
def get_match(first, second, percentage):
    """Build a ``Match`` for the two files, never with a negative percentage."""
    clamped = 0 if percentage < 0 else percentage
    return Match(first, second, clamped)
|
||||
|
||||
class MatchFactory(object):
    """Base class for picture matchers.

    ``getmatches`` first makes sure every picture has its blocks in
    ``cached_blocks`` and then hands the usable pictures to
    ``_do_getmatches``, which subclasses must implement.
    """
    # Block cache, used with ``in`` and item assignment keyed by unicode
    # path. It is None by default and has to be assigned before matching.
    cached_blocks = None
    # Passed to ``picture.get_blocks`` when a picture is not cached yet.
    block_count_per_side = 15
    # Minimum match percentage (read by the subclass in this file).
    threshold = 75
    # When false, the subclass in this file only compares pictures that
    # have the same dimensions.
    match_scaled = False

    def _do_getmatches(self, files, j):
        """Return the matches among the prepared ``files`` (subclass hook)."""
        raise NotImplementedError()

    def getmatches(self, files, j=job.nulljob):
        """Prepare ``files`` and return what ``_do_getmatches`` finds in them."""
        # The MemoryError handlers in there use logging without first caring about whether or not
        # there is enough memory left to carry on the operation because it is assumed that the
        # MemoryError happens when trying to read an image file, which is freed from memory by the
        # time that MemoryError is raised.
        # NOTE(review): [2, 8] is presumably the progress weighting of the
        # two phases -- confirm against hsutil.job.
        j = j.start_subjob([2, 8])
        logging.info('Preparing %d files' % len(files))
        usable = self.prepare_files(files, j)
        logging.info('Finished preparing %d files' % len(usable))
        return self._do_getmatches(usable, j)

    def prepare_files(self, files, j=job.nulljob):
        """Cache the blocks of every picture and return the usable pictures.

        A picture whose blocks cannot be read (``IOError``, or ``MemoryError``
        on a file of 10 MB or more) is logged and left out. A ``MemoryError``
        on a smaller file ends the preparation early: the pictures prepared
        so far are returned.
        """
        ready = []  # only files for which there was no error getting blocks
        try:
            for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
                # NOTE(review): evaluated for its side effect only, presumably
                # to force the dimensions to be read now -- confirm.
                picture.dimensions
                picture.unicode_path = unicode(picture.path)
                path = picture.unicode_path
                try:
                    if path not in self.cached_blocks:
                        self.cached_blocks[path] = picture.get_blocks(self.block_count_per_side)
                    ready.append(picture)
                except IOError as e:
                    logging.warning(unicode(e))
                except MemoryError:
                    logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
                    if picture.size < 10 * 1024 * 1024:  # We're really running out of memory
                        raise
        except MemoryError:
            logging.warning('Ran out of memory while preparing files')
        return ready
|
||||
|
||||
|
||||
def async_compare(ref_id, other_ids, dbname, threshold):
    """Compare one cached picture against several others.

    Opens its own non-threaded ``Cache`` on ``dbname``, so it can run in a
    worker process.

    ref_id: cache rowid of the reference picture.
    other_ids: cache rowids of the pictures to compare it with.
    dbname: path of the cache database.
    threshold: minimum match percentage for a pair to be reported.

    Returns a list of ``(ref_id, other_id, percentage)`` tuples, one for each
    pair scoring at least ``threshold``.
    """
    cache = Cache(dbname, threaded=False)
    try:
        limit = 100 - threshold
        ref_blocks = cache[ref_id]
        results = []
        for other_id, other_blocks in cache.get_multiple(other_ids):
            try:
                diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
                percentage = 100 - diff
            except (DifferentBlockCountError, NoBlocksError):
                # Block lists that cannot be compared count as "no match".
                percentage = 0
            if percentage >= threshold:
                results.append((ref_id, other_id, percentage))
        return results
    finally:
        # Close the connection even when a lookup or a comparison raises.
        cache.con.close()
|
||||
|
||||
class AsyncMatchFactory(MatchFactory):
    """Match factory that runs the block comparisons in a process pool."""

    def _do_getmatches(self, pictures, j):
        """Return the list of ``Match`` objects found among ``pictures``.

        Pictures that have no entry in the block cache are removed from
        ``pictures`` (in place) and ignored. Unless ``match_scaled`` is set,
        a picture is only compared with pictures of the same dimensions.
        """
        j = j.start_subjob([1, 8, 1], 'Preparing for matching')
        cache = self.cached_blocks
        id2picture = {}
        dimensions2pictures = defaultdict(set)
        for picture in pictures[:]:
            try:
                picture.cache_id = cache.get_id(picture.unicode_path)
            except ValueError:
                # Not in the cache: drop it and skip the bookkeeping below,
                # otherwise it would be compared later without a cache_id.
                pictures.remove(picture)
                continue
            id2picture[picture.cache_id] = picture
            if not self.match_scaled:
                dimensions2pictures[picture.dimensions].add(picture)

        pool = multiprocessing.Pool()
        try:
            async_results = []
            pictures_copy = set(pictures)
            for ref in j.iter_with_progress(pictures):
                others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
                # Removing ref from the shared set means later references
                # will not be compared with it again.
                others.remove(ref)
                if others:
                    cache_ids = [f.cache_id for f in others]
                    args = (ref.cache_id, cache_ids, cache.dbname, self.threshold)
                    async_results.append(pool.apply_async(async_compare, args))

            matches = []
            for async_result in j.iter_with_progress(async_results, 'Matched %d/%d pictures'):
                matches.extend(async_result.get())
        finally:
            # Every result has been collected by now (or an error is being
            # propagated), so the workers can be stopped right away.
            pool.terminate()
            pool.join()

        verified = []
        for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
            ref = id2picture[ref_id]
            other = id2picture[other_id]
            # Identical blocks only count as a 100% match if the md5 values
            # are equal as well.
            if percentage == 100 and ref.md5 != other.md5:
                percentage = 99
            if percentage >= self.threshold:
                verified.append(get_match(ref, other, percentage))
        return verified
|
||||
|
||||
|
||||
# Runs when this module is imported. freeze_support() is multiprocessing's
# hook for programs frozen into a Windows executable.
# NOTE(review): the multiprocessing docs place this call right after the
# ``if __name__ == '__main__'`` line of the main module -- confirm that
# calling it from here is intended.
multiprocessing.freeze_support()
|
||||
Reference in New Issue
Block a user