[#72 state:fixed] When files are deleted during the scan, don't include them in the grouping phase.

--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40225
This commit is contained in:
hsoft 2009-10-30 11:09:04 +00:00
parent 88127d8b8d
commit f070e90347
5 changed files with 448 additions and 406 deletions

View File

@ -208,7 +208,9 @@ def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob)
j = j.start_subjob([2, 8]) j = j.start_subjob([2, 8])
size2files = defaultdict(set) size2files = defaultdict(set)
for file in j.iter_with_progress(files, 'Read size of %d/%d files'): for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
size2files[getattr(file, sizeattr)].add(file) filesize = getattr(file, sizeattr)
if filesize:
size2files[filesize].add(file)
possible_matches = [files for files in size2files.values() if len(files) > 1] possible_matches = [files for files in size2files.values() if len(files) > 1]
del size2files del size2files
result = [] result = []

View File

@ -10,7 +10,7 @@
import logging import logging
from hsutil import job from hsutil import job, io
from hsutil.misc import dedupe from hsutil.misc import dedupe
from hsutil.str import get_file_ext, rem_file_ext from hsutil.str import get_file_ext, rem_file_ext
@ -80,9 +80,10 @@ class Scanner(object):
logging.info('Getting matches') logging.info('Getting matches')
matches = self._getmatches(files, j) matches = self._getmatches(files, j)
logging.info('Found %d matches' % len(matches)) logging.info('Found %d matches' % len(matches))
j.set_progress(100, 'Removing false matches')
if not self.mix_file_kind: if not self.mix_file_kind:
j.set_progress(100, 'Removing false matches')
matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)] matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
matches = [m for m in matches if io.exists(m.first.path) and io.exists(m.second.path)]
if self.ignore_list: if self.ignore_list:
j = j.start_subjob(2) j = j.start_subjob(2)
iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list') iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')

View File

@ -15,16 +15,21 @@ from hsutil import job
from hsutil.decorators import log_calls from hsutil.decorators import log_calls
from hsutil.testcase import TestCase from hsutil.testcase import TestCase
from .. import engine from .. import engine, fs
from ..engine import * from ..engine import *
class NamedObject(object): class NamedObject(object):
def __init__(self, name="foobar", with_words=False): def __init__(self, name="foobar", with_words=False, size=1):
self.name = name self.name = name
self.size = size
self.md5partial = name
self.md5 = name
if with_words: if with_words:
self.words = getwords(name) self.words = getwords(name)
no = NamedObject
def get_match_triangle(): def get_match_triangle():
o1 = NamedObject(with_words=True) o1 = NamedObject(with_words=True)
o2 = NamedObject(with_words=True) o2 = NamedObject(with_words=True)
@ -486,6 +491,12 @@ class GetMatches(TestCase):
self.assertEqual(42, len(r)) self.assertEqual(42, len(r))
class GetMatchesByContents(TestCase):
def test_dont_compare_empty_files(self):
o1, o2 = no(size=0), no(size=0)
assert not getmatches_by_contents([o1, o2])
class TCGroup(TestCase): class TCGroup(TestCase):
def test_empy(self): def test_empy(self):
g = Group() g = Group()

View File

@ -21,7 +21,6 @@ from .. import engine
from ..results import * from ..results import *
class NamedObject(engine_test.NamedObject): class NamedObject(engine_test.NamedObject):
size = 1
path = property(lambda x:Path('basepath') + x.name) path = property(lambda x:Path('basepath') + x.name)
is_ref = False is_ref = False

View File

@ -9,9 +9,11 @@
from nose.tools import eq_ from nose.tools import eq_
from hsutil import job from hsutil import job, io
from hsutil.path import Path from hsutil.path import Path
from hsutil.testcase import TestCase
from .. import fs
from ..engine import getwords, Match from ..engine import getwords, Match
from ..ignore import IgnoreList from ..ignore import IgnoreList
from ..scanner import * from ..scanner import *
@ -27,412 +29,439 @@ class NamedObject(object):
no = NamedObject no = NamedObject
#--- Scanner #--- Scanner
def test_empty(): class ScannerTestFakeFiles(TestCase):
s = Scanner() def setUp(self):
r = s.GetDupeGroups([]) # This is a hack to avoid invalidating all previous tests since the scanner started to test
eq_(r, []) # for file existence before doing the match grouping.
self.mock(io, 'exists', lambda _: True)
def test_default_settings():
s = Scanner()
eq_(s.min_match_percentage, 80)
eq_(s.scan_type, SCAN_TYPE_FILENAME)
eq_(s.mix_file_kind, True)
eq_(s.word_weighting, False)
eq_(s.match_similar_words, False)
assert isinstance(s.ignore_list, IgnoreList)
def test_simple_with_default_settings():
s = Scanner()
f = [no('foo bar'), no('foo bar'), no('foo bleh')]
r = s.GetDupeGroups(f)
eq_(len(r), 1)
g = r[0]
#'foo bleh' cannot be in the group because the default min match % is 80
eq_(len(g), 2)
assert g.ref in f[:2]
assert g.dupes[0] in f[:2]
def test_simple_with_lower_min_match():
s = Scanner()
s.min_match_percentage = 50
f = [no('foo bar'), no('foo bar'), no('foo bleh')]
r = s.GetDupeGroups(f)
eq_(len(r), 1)
g = r[0]
eq_(len(g), 3)
def test_trim_all_ref_groups():
# When all files of a group are ref, don't include that group in the results, but also don't
# count the files from that group as discarded.
s = Scanner()
f = [no('foo'), no('foo'), no('bar'), no('bar')]
f[2].is_ref = True
f[3].is_ref = True
r = s.GetDupeGroups(f)
eq_(len(r), 1)
eq_(s.discarded_file_count, 0)
def test_priorize():
s = Scanner()
f = [no('foo'), no('foo'), no('bar'), no('bar')]
f[1].size = 2
f[2].size = 3
f[3].is_ref = True
r = s.GetDupeGroups(f)
g1, g2 = r
assert f[1] in (g1.ref,g2.ref)
assert f[0] in (g1.dupes[0],g2.dupes[0])
assert f[3] in (g1.ref,g2.ref)
assert f[2] in (g1.dupes[0],g2.dupes[0])
def test_content_scan():
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'), no('bar'), no('bleh')]
f[0].md5 = f[0].md5partial = 'foobar'
f[1].md5 = f[1].md5partial = 'foobar'
f[2].md5 = f[2].md5partial = 'bleh'
r = s.GetDupeGroups(f)
eq_(len(r), 1)
eq_(len(r[0]), 2)
eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!
def test_content_scan_compare_sizes_first():
class MyFile(no):
@property
def md5(file):
raise AssertionError()
s = Scanner() def test_empty(self):
s.scan_type = SCAN_TYPE_CONTENT s = Scanner()
f = [MyFile('foo', 1), MyFile('bar', 2)] r = s.GetDupeGroups([])
eq_(len(s.GetDupeGroups(f)), 0) eq_(r, [])
def test_min_match_perc_doesnt_matter_for_content_scan():
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'), no('bar'), no('bleh')]
f[0].md5 = f[0].md5partial = 'foobar'
f[1].md5 = f[1].md5partial = 'foobar'
f[2].md5 = f[2].md5partial = 'bleh'
s.min_match_percentage = 101
r = s.GetDupeGroups(f)
eq_(len(r), 1)
eq_(len(r[0]), 2)
s.min_match_percentage = 0
r = s.GetDupeGroups(f)
eq_(len(r), 1)
eq_(len(r[0]), 2)
def test_content_scan_doesnt_put_md5_in_words_at_the_end():
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'),no('bar')]
f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
r = s.GetDupeGroups(f)
g = r[0]
def test_extension_is_not_counted_in_filename_scan():
s = Scanner()
s.min_match_percentage = 100
f = [no('foo.bar'), no('foo.bleh')]
r = s.GetDupeGroups(f)
eq_(len(r), 1)
eq_(len(r[0]), 2)
def test_job():
def do_progress(progress, desc=''):
log.append(progress)
return True
s = Scanner() def test_default_settings(self):
log = [] s = Scanner()
f = [no('foo bar'), no('foo bar'), no('foo bleh')] eq_(s.min_match_percentage, 80)
r = s.GetDupeGroups(f, job.Job(1, do_progress)) eq_(s.scan_type, SCAN_TYPE_FILENAME)
eq_(log[0], 0) eq_(s.mix_file_kind, True)
eq_(log[-1], 100) eq_(s.word_weighting, False)
eq_(s.match_similar_words, False)
def test_mix_file_kind(): assert isinstance(s.ignore_list, IgnoreList)
s = Scanner()
s.mix_file_kind = False def test_simple_with_default_settings(self):
f = [no('foo.1'), no('foo.2')] s = Scanner()
r = s.GetDupeGroups(f) f = [no('foo bar'), no('foo bar'), no('foo bleh')]
eq_(len(r), 0) r = s.GetDupeGroups(f)
eq_(len(r), 1)
def test_word_weighting(): g = r[0]
s = Scanner() #'foo bleh' cannot be in the group because the default min match % is 80
s.min_match_percentage = 75 eq_(len(g), 2)
s.word_weighting = True assert g.ref in f[:2]
f = [no('foo bar'), no('foo bar bleh')] assert g.dupes[0] in f[:2]
r = s.GetDupeGroups(f)
eq_(len(r), 1) def test_simple_with_lower_min_match(self):
g = r[0] s = Scanner()
m = g.get_match_of(g.dupes[0]) s.min_match_percentage = 50
eq_(m.percentage, 75) # 16 letters, 12 matching f = [no('foo bar'), no('foo bar'), no('foo bleh')]
r = s.GetDupeGroups(f)
def test_similar_words(): eq_(len(r), 1)
s = Scanner() g = r[0]
s.match_similar_words = True eq_(len(g), 3)
f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]
r = s.GetDupeGroups(f) def test_trim_all_ref_groups(self):
eq_(len(r), 2) # When all files of a group are ref, don't include that group in the results, but also don't
# count the files from that group as discarded.
def test_fields(): s = Scanner()
s = Scanner() f = [no('foo'), no('foo'), no('bar'), no('bar')]
s.scan_type = SCAN_TYPE_FIELDS f[2].is_ref = True
f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')] f[3].is_ref = True
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
eq_(len(r), 0) eq_(len(r), 1)
eq_(s.discarded_file_count, 0)
def test_fields_no_order():
s = Scanner() def test_priorize(self):
s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER s = Scanner()
f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')] f = [no('foo'), no('foo'), no('bar'), no('bar')]
r = s.GetDupeGroups(f) f[1].size = 2
eq_(len(r), 1) f[2].size = 3
f[3].is_ref = True
def test_tag_scan(): r = s.GetDupeGroups(f)
s = Scanner() g1, g2 = r
s.scan_type = SCAN_TYPE_TAG assert f[1] in (g1.ref,g2.ref)
o1 = no('foo') assert f[0] in (g1.dupes[0],g2.dupes[0])
o2 = no('bar') assert f[3] in (g1.ref,g2.ref)
o1.artist = 'The White Stripes' assert f[2] in (g1.dupes[0],g2.dupes[0])
o1.title = 'The Air Near My Fingers'
o2.artist = 'The White Stripes' def test_content_scan(self):
o2.title = 'The Air Near My Fingers' s = Scanner()
r = s.GetDupeGroups([o1,o2]) s.scan_type = SCAN_TYPE_CONTENT
eq_(len(r), 1) f = [no('foo'), no('bar'), no('bleh')]
f[0].md5 = f[0].md5partial = 'foobar'
def test_tag_with_album_scan(): f[1].md5 = f[1].md5partial = 'foobar'
s = Scanner() f[2].md5 = f[2].md5partial = 'bleh'
s.scan_type = SCAN_TYPE_TAG r = s.GetDupeGroups(f)
s.scanned_tags = set(['artist', 'album', 'title']) eq_(len(r), 1)
o1 = no('foo') eq_(len(r[0]), 2)
o2 = no('bar') eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!
o3 = no('bleh')
o1.artist = 'The White Stripes' def test_content_scan_compare_sizes_first(self):
o1.title = 'The Air Near My Fingers' class MyFile(no):
o1.album = 'Elephant' @property
o2.artist = 'The White Stripes' def md5(file):
o2.title = 'The Air Near My Fingers' raise AssertionError()
o2.album = 'Elephant'
o3.artist = 'The White Stripes' s = Scanner()
o3.title = 'The Air Near My Fingers' s.scan_type = SCAN_TYPE_CONTENT
o3.album = 'foobar' f = [MyFile('foo', 1), MyFile('bar', 2)]
r = s.GetDupeGroups([o1,o2,o3]) eq_(len(s.GetDupeGroups(f)), 0)
eq_(len(r), 1)
def test_min_match_perc_doesnt_matter_for_content_scan(self):
def test_that_dash_in_tags_dont_create_new_fields(): s = Scanner()
s = Scanner() s.scan_type = SCAN_TYPE_CONTENT
s.scan_type = SCAN_TYPE_TAG f = [no('foo'), no('bar'), no('bleh')]
s.scanned_tags = set(['artist', 'album', 'title']) f[0].md5 = f[0].md5partial = 'foobar'
s.min_match_percentage = 50 f[1].md5 = f[1].md5partial = 'foobar'
o1 = no('foo') f[2].md5 = f[2].md5partial = 'bleh'
o2 = no('bar') s.min_match_percentage = 101
o1.artist = 'The White Stripes - a' r = s.GetDupeGroups(f)
o1.title = 'The Air Near My Fingers - a' eq_(len(r), 1)
o1.album = 'Elephant - a' eq_(len(r[0]), 2)
o2.artist = 'The White Stripes - b' s.min_match_percentage = 0
o2.title = 'The Air Near My Fingers - b' r = s.GetDupeGroups(f)
o2.album = 'Elephant - b' eq_(len(r), 1)
r = s.GetDupeGroups([o1,o2]) eq_(len(r[0]), 2)
eq_(len(r), 1)
def test_content_scan_doesnt_put_md5_in_words_at_the_end(self):
def test_tag_scan_with_different_scanned(): s = Scanner()
s = Scanner() s.scan_type = SCAN_TYPE_CONTENT
s.scan_type = SCAN_TYPE_TAG f = [no('foo'),no('bar')]
s.scanned_tags = set(['track', 'year']) f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
o1 = no('foo') f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
o2 = no('bar') r = s.GetDupeGroups(f)
o1.artist = 'The White Stripes' g = r[0]
o1.title = 'some title'
o1.track = 'foo' def test_extension_is_not_counted_in_filename_scan(self):
o1.year = 'bar' s = Scanner()
o2.artist = 'The White Stripes' s.min_match_percentage = 100
o2.title = 'another title' f = [no('foo.bar'), no('foo.bleh')]
o2.track = 'foo' r = s.GetDupeGroups(f)
o2.year = 'bar' eq_(len(r), 1)
r = s.GetDupeGroups([o1, o2]) eq_(len(r[0]), 2)
eq_(len(r), 1)
def test_job(self):
def test_tag_scan_only_scans_existing_tags(): def do_progress(progress, desc=''):
s = Scanner() log.append(progress)
s.scan_type = SCAN_TYPE_TAG return True
s.scanned_tags = set(['artist', 'foo'])
o1 = no('foo') s = Scanner()
o2 = no('bar') log = []
o1.artist = 'The White Stripes' f = [no('foo bar'), no('foo bar'), no('foo bleh')]
o1.foo = 'foo' r = s.GetDupeGroups(f, job.Job(1, do_progress))
o2.artist = 'The White Stripes' eq_(log[0], 0)
o2.foo = 'bar' eq_(log[-1], 100)
r = s.GetDupeGroups([o1, o2])
eq_(len(r), 1) # Because 'foo' is not scanned, they match def test_mix_file_kind(self):
s = Scanner()
def test_tag_scan_converts_to_str(): s.mix_file_kind = False
s = Scanner() f = [no('foo.1'), no('foo.2')]
s.scan_type = SCAN_TYPE_TAG r = s.GetDupeGroups(f)
s.scanned_tags = set(['track']) eq_(len(r), 0)
o1 = no('foo')
o2 = no('bar') def test_word_weighting(self):
o1.track = 42 s = Scanner()
o2.track = 42 s.min_match_percentage = 75
try: s.word_weighting = True
f = [no('foo bar'), no('foo bar bleh')]
r = s.GetDupeGroups(f)
eq_(len(r), 1)
g = r[0]
m = g.get_match_of(g.dupes[0])
eq_(m.percentage, 75) # 16 letters, 12 matching
def test_similar_words(self):
s = Scanner()
s.match_similar_words = True
f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]
r = s.GetDupeGroups(f)
eq_(len(r), 2)
def test_fields(self):
s = Scanner()
s.scan_type = SCAN_TYPE_FIELDS
f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')]
r = s.GetDupeGroups(f)
eq_(len(r), 0)
def test_fields_no_order(self):
s = Scanner()
s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')]
r = s.GetDupeGroups(f)
eq_(len(r), 1)
def test_tag_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
o1 = no('foo')
o2 = no('bar')
o1.artist = 'The White Stripes'
o1.title = 'The Air Near My Fingers'
o2.artist = 'The White Stripes'
o2.title = 'The Air Near My Fingers'
r = s.GetDupeGroups([o1,o2])
eq_(len(r), 1)
def test_tag_with_album_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['artist', 'album', 'title'])
o1 = no('foo')
o2 = no('bar')
o3 = no('bleh')
o1.artist = 'The White Stripes'
o1.title = 'The Air Near My Fingers'
o1.album = 'Elephant'
o2.artist = 'The White Stripes'
o2.title = 'The Air Near My Fingers'
o2.album = 'Elephant'
o3.artist = 'The White Stripes'
o3.title = 'The Air Near My Fingers'
o3.album = 'foobar'
r = s.GetDupeGroups([o1,o2,o3])
eq_(len(r), 1)
def test_that_dash_in_tags_dont_create_new_fields(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['artist', 'album', 'title'])
s.min_match_percentage = 50
o1 = no('foo')
o2 = no('bar')
o1.artist = 'The White Stripes - a'
o1.title = 'The Air Near My Fingers - a'
o1.album = 'Elephant - a'
o2.artist = 'The White Stripes - b'
o2.title = 'The Air Near My Fingers - b'
o2.album = 'Elephant - b'
r = s.GetDupeGroups([o1,o2])
eq_(len(r), 1)
def test_tag_scan_with_different_scanned(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['track', 'year'])
o1 = no('foo')
o2 = no('bar')
o1.artist = 'The White Stripes'
o1.title = 'some title'
o1.track = 'foo'
o1.year = 'bar'
o2.artist = 'The White Stripes'
o2.title = 'another title'
o2.track = 'foo'
o2.year = 'bar'
r = s.GetDupeGroups([o1, o2]) r = s.GetDupeGroups([o1, o2])
except TypeError: eq_(len(r), 1)
raise AssertionError()
eq_(len(r), 1) def test_tag_scan_only_scans_existing_tags(self):
s = Scanner()
def test_tag_scan_non_ascii(): s.scan_type = SCAN_TYPE_TAG
s = Scanner() s.scanned_tags = set(['artist', 'foo'])
s.scan_type = SCAN_TYPE_TAG o1 = no('foo')
s.scanned_tags = set(['title']) o2 = no('bar')
o1 = no('foo') o1.artist = 'The White Stripes'
o2 = no('bar') o1.foo = 'foo'
o1.title = u'foobar\u00e9' o2.artist = 'The White Stripes'
o2.title = u'foobar\u00e9' o2.foo = 'bar'
try:
r = s.GetDupeGroups([o1, o2]) r = s.GetDupeGroups([o1, o2])
except UnicodeEncodeError: eq_(len(r), 1) # Because 'foo' is not scanned, they match
raise AssertionError()
eq_(len(r), 1)
def test_audio_content_scan():
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT_AUDIO
f = [no('foo'), no('bar'), no('bleh')]
f[0].md5 = 'foo'
f[1].md5 = 'bar'
f[2].md5 = 'bleh'
f[0].md5partial = 'foo'
f[1].md5partial = 'foo'
f[2].md5partial = 'bleh'
f[0].audiosize = 1
f[1].audiosize = 1
f[2].audiosize = 1
r = s.GetDupeGroups(f)
eq_(len(r), 1)
eq_(len(r[0]), 2)
def test_audio_content_scan_compare_sizes_first(): def test_tag_scan_converts_to_str(self):
class MyFile(no): s = Scanner()
@property s.scan_type = SCAN_TYPE_TAG
def md5partial(file): s.scanned_tags = set(['track'])
o1 = no('foo')
o2 = no('bar')
o1.track = 42
o2.track = 42
try:
r = s.GetDupeGroups([o1, o2])
except TypeError:
raise AssertionError() raise AssertionError()
eq_(len(r), 1)
s = Scanner() def test_tag_scan_non_ascii(self):
s.scan_type = SCAN_TYPE_CONTENT_AUDIO s = Scanner()
f = [MyFile('foo'), MyFile('bar')] s.scan_type = SCAN_TYPE_TAG
f[0].audiosize = 1 s.scanned_tags = set(['title'])
f[1].audiosize = 2 o1 = no('foo')
eq_(len(s.GetDupeGroups(f)), 0) o2 = no('bar')
o1.title = u'foobar\u00e9'
def test_ignore_list(): o2.title = u'foobar\u00e9'
s = Scanner() try:
f1 = no('foobar') r = s.GetDupeGroups([o1, o2])
f2 = no('foobar') except UnicodeEncodeError:
f3 = no('foobar') raise AssertionError()
f1.path = Path('dir1/foobar') eq_(len(r), 1)
f2.path = Path('dir2/foobar')
f3.path = Path('dir3/foobar') def test_audio_content_scan(self):
s.ignore_list.Ignore(str(f1.path),str(f2.path)) s = Scanner()
s.ignore_list.Ignore(str(f1.path),str(f3.path)) s.scan_type = SCAN_TYPE_CONTENT_AUDIO
r = s.GetDupeGroups([f1,f2,f3]) f = [no('foo'), no('bar'), no('bleh')]
eq_(len(r), 1) f[0].md5 = 'foo'
g = r[0] f[1].md5 = 'bar'
eq_(len(g.dupes), 1) f[2].md5 = 'bleh'
assert f1 not in g f[0].md5partial = 'foo'
assert f2 in g f[1].md5partial = 'foo'
assert f3 in g f[2].md5partial = 'bleh'
# Ignored matches are not counted as discarded f[0].audiosize = 1
eq_(s.discarded_file_count, 0) f[1].audiosize = 1
f[2].audiosize = 1
def test_ignore_list_checks_for_unicode(): r = s.GetDupeGroups(f)
#scanner was calling path_str for ignore list checks. Since the Path changes, it must eq_(len(r), 1)
#be unicode(path) eq_(len(r[0]), 2)
s = Scanner()
f1 = no('foobar') def test_audio_content_scan_compare_sizes_first(self):
f2 = no('foobar') class MyFile(no):
f3 = no('foobar') @property
f1.path = Path(u'foo1\u00e9') def md5partial(file):
f2.path = Path(u'foo2\u00e9') raise AssertionError()
f3.path = Path(u'foo3\u00e9')
s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path)) s = Scanner()
s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path)) s.scan_type = SCAN_TYPE_CONTENT_AUDIO
r = s.GetDupeGroups([f1,f2,f3]) f = [MyFile('foo'), MyFile('bar')]
eq_(len(r), 1) f[0].audiosize = 1
g = r[0] f[1].audiosize = 2
eq_(len(g.dupes), 1) eq_(len(s.GetDupeGroups(f)), 0)
assert f1 not in g
assert f2 in g def test_ignore_list(self):
assert f3 in g s = Scanner()
f1 = no('foobar')
def test_file_evaluates_to_false(): f2 = no('foobar')
# A very wrong way to use any() was added at some point, causing resulting group list f3 = no('foobar')
# to be empty. f1.path = Path('dir1/foobar')
class FalseNamedObject(NamedObject): f2.path = Path('dir2/foobar')
def __nonzero__(self): f3.path = Path('dir3/foobar')
return False s.ignore_list.Ignore(str(f1.path),str(f2.path))
s.ignore_list.Ignore(str(f1.path),str(f3.path))
r = s.GetDupeGroups([f1,f2,f3])
eq_(len(r), 1)
g = r[0]
eq_(len(g.dupes), 1)
assert f1 not in g
assert f2 in g
assert f3 in g
# Ignored matches are not counted as discarded
eq_(s.discarded_file_count, 0)
def test_ignore_list_checks_for_unicode(self):
#scanner was calling path_str for ignore list checks. Since the Path changes, it must
#be unicode(path)
s = Scanner()
f1 = no('foobar')
f2 = no('foobar')
f3 = no('foobar')
f1.path = Path(u'foo1\u00e9')
f2.path = Path(u'foo2\u00e9')
f3.path = Path(u'foo3\u00e9')
s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
r = s.GetDupeGroups([f1,f2,f3])
eq_(len(r), 1)
g = r[0]
eq_(len(g.dupes), 1)
assert f1 not in g
assert f2 in g
assert f3 in g
def test_file_evaluates_to_false(self):
# A very wrong way to use any() was added at some point, causing resulting group list
# to be empty.
class FalseNamedObject(NamedObject):
def __nonzero__(self):
return False
s = Scanner() s = Scanner()
f1 = FalseNamedObject('foobar') f1 = FalseNamedObject('foobar')
f2 = FalseNamedObject('foobar') f2 = FalseNamedObject('foobar')
r = s.GetDupeGroups([f1, f2]) r = s.GetDupeGroups([f1, f2])
eq_(len(r), 1) eq_(len(r), 1)
def test_size_threshold(self):
# Only file equal or higher than the size_threshold in size are scanned
s = Scanner()
f1 = no('foo', 1)
f2 = no('foo', 2)
f3 = no('foo', 3)
s.size_threshold = 2
groups = s.GetDupeGroups([f1,f2,f3])
eq_(len(groups), 1)
[group] = groups
eq_(len(group), 2)
assert f1 not in group
assert f2 in group
assert f3 in group
def test_tie_breaker_path_deepness(self):
# If there is a tie in prioritization, path deepness is used as a tie breaker
s = Scanner()
o1, o2 = no('foo'), no('foo')
o1.path = Path('foo')
o2.path = Path('foo/bar')
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2
def test_tie_breaker_copy(self):
# if copy is in the words used (even if it has a deeper path), it becomes a dupe
s = Scanner()
o1, o2 = no('foo bar Copy'), no('foo bar')
o1.path = Path('deeper/path')
o2.path = Path('foo')
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2
def test_tie_breaker_same_name_plus_digit(self):
# if ref has the same words as dupe, but has some just one extra word which is a digit, it
# becomes a dupe
s = Scanner()
o1, o2 = no('foo bar 42'), no('foo bar')
o1.path = Path('deeper/path')
o2.path = Path('foo')
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2
def test_partial_group_match(self):
# Count the number of discarded matches (when a file doesn't match all other dupes of the
# group) in Scanner.discarded_file_count
s = Scanner()
o1, o2, o3 = no('a b'), no('a'), no('b')
s.min_match_percentage = 50
[group] = s.GetDupeGroups([o1, o2, o3])
eq_(len(group), 2)
assert o1 in group
assert o2 in group
assert o3 not in group
eq_(s.discarded_file_count, 1)
def test_size_threshold(): class ScannerTest(TestCase):
# Only file equal or higher than the size_threshold in size are scanned def test_dont_group_files_that_dont_exist(self):
s = Scanner() # when creating groups, check that files exist first. It's possible that these files have
f1 = no('foo', 1) # been moved during the scan by the user.
f2 = no('foo', 2) # In this test, we have to delete one of the files between the get_matches() part and the
f3 = no('foo', 3) # get_groups() part.
s.size_threshold = 2 s = Scanner()
groups = s.GetDupeGroups([f1,f2,f3]) s.scan_type = SCAN_TYPE_CONTENT
eq_(len(groups), 1) p = self.tmppath()
[group] = groups io.open(p + 'file1', 'w').write('foo')
eq_(len(group), 2) io.open(p + 'file2', 'w').write('foo')
assert f1 not in group file1, file2 = fs.get_files(p)
assert f2 in group def getmatches(*args, **kw):
assert f3 in group io.remove(file2.path)
return [Match(file1, file2, 100)]
def test_tie_breaker_path_deepness(): s._getmatches = getmatches
# If there is a tie in prioritization, path deepness is used as a tie breaker
s = Scanner() assert not s.GetDupeGroups([file1, file2])
o1, o2 = no('foo'), no('foo')
o1.path = Path('foo')
o2.path = Path('foo/bar')
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2
def test_tie_breaker_copy():
# if copy is in the words used (even if it has a deeper path), it becomes a dupe
s = Scanner()
o1, o2 = no('foo bar Copy'), no('foo bar')
o1.path = Path('deeper/path')
o2.path = Path('foo')
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2
def test_tie_breaker_same_name_plus_digit():
# if ref has the same words as dupe, but has some just one extra word which is a digit, it
# becomes a dupe
s = Scanner()
o1, o2 = no('foo bar 42'), no('foo bar')
o1.path = Path('deeper/path')
o2.path = Path('foo')
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2
def test_partial_group_match():
# Count the number of discarded matches (when a file doesn't match all other dupes of the
# group) in Scanner.discarded_file_count
s = Scanner()
o1, o2, o3 = no('a b'), no('a'), no('b')
s.min_match_percentage = 50
[group] = s.GetDupeGroups([o1, o2, o3])
eq_(len(group), 2)
assert o1 in group
assert o2 in group
assert o3 not in group
eq_(s.discarded_file_count, 1)