[#72 state:fixed] When files are deleted during the scan, don't include them in the grouping phase.

--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40225
This commit is contained in:
hsoft 2009-10-30 11:09:04 +00:00
parent 88127d8b8d
commit f070e90347
5 changed files with 448 additions and 406 deletions

View File

@ -208,7 +208,9 @@ def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob)
j = j.start_subjob([2, 8])
size2files = defaultdict(set)
for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
size2files[getattr(file, sizeattr)].add(file)
filesize = getattr(file, sizeattr)
if filesize:
size2files[filesize].add(file)
possible_matches = [files for files in size2files.values() if len(files) > 1]
del size2files
result = []

View File

@ -10,7 +10,7 @@
import logging
from hsutil import job
from hsutil import job, io
from hsutil.misc import dedupe
from hsutil.str import get_file_ext, rem_file_ext
@ -80,9 +80,10 @@ class Scanner(object):
logging.info('Getting matches')
matches = self._getmatches(files, j)
logging.info('Found %d matches' % len(matches))
if not self.mix_file_kind:
j.set_progress(100, 'Removing false matches')
if not self.mix_file_kind:
matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
matches = [m for m in matches if io.exists(m.first.path) and io.exists(m.second.path)]
if self.ignore_list:
j = j.start_subjob(2)
iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')

View File

@ -15,16 +15,21 @@ from hsutil import job
from hsutil.decorators import log_calls
from hsutil.testcase import TestCase
from .. import engine
from .. import engine, fs
from ..engine import *
class NamedObject(object):
def __init__(self, name="foobar", with_words=False):
def __init__(self, name="foobar", with_words=False, size=1):
self.name = name
self.size = size
self.md5partial = name
self.md5 = name
if with_words:
self.words = getwords(name)
no = NamedObject
def get_match_triangle():
o1 = NamedObject(with_words=True)
o2 = NamedObject(with_words=True)
@ -486,6 +491,12 @@ class GetMatches(TestCase):
self.assertEqual(42, len(r))
class GetMatchesByContents(TestCase):
def test_dont_compare_empty_files(self):
o1, o2 = no(size=0), no(size=0)
assert not getmatches_by_contents([o1, o2])
class TCGroup(TestCase):
def test_empy(self):
g = Group()

View File

@ -21,7 +21,6 @@ from .. import engine
from ..results import *
class NamedObject(engine_test.NamedObject):
size = 1
path = property(lambda x:Path('basepath') + x.name)
is_ref = False

View File

@ -9,9 +9,11 @@
from nose.tools import eq_
from hsutil import job
from hsutil import job, io
from hsutil.path import Path
from hsutil.testcase import TestCase
from .. import fs
from ..engine import getwords, Match
from ..ignore import IgnoreList
from ..scanner import *
@ -27,12 +29,18 @@ class NamedObject(object):
no = NamedObject
#--- Scanner
def test_empty():
class ScannerTestFakeFiles(TestCase):
def setUp(self):
# This is a hack to avoid invalidating all previous tests since the scanner started to test
# for file existence before doing the match grouping.
self.mock(io, 'exists', lambda _: True)
def test_empty(self):
s = Scanner()
r = s.GetDupeGroups([])
eq_(r, [])
def test_default_settings():
def test_default_settings(self):
s = Scanner()
eq_(s.min_match_percentage, 80)
eq_(s.scan_type, SCAN_TYPE_FILENAME)
@ -41,7 +49,7 @@ def test_default_settings():
eq_(s.match_similar_words, False)
assert isinstance(s.ignore_list, IgnoreList)
def test_simple_with_default_settings():
def test_simple_with_default_settings(self):
s = Scanner()
f = [no('foo bar'), no('foo bar'), no('foo bleh')]
r = s.GetDupeGroups(f)
@ -52,7 +60,7 @@ def test_simple_with_default_settings():
assert g.ref in f[:2]
assert g.dupes[0] in f[:2]
def test_simple_with_lower_min_match():
def test_simple_with_lower_min_match(self):
s = Scanner()
s.min_match_percentage = 50
f = [no('foo bar'), no('foo bar'), no('foo bleh')]
@ -61,7 +69,7 @@ def test_simple_with_lower_min_match():
g = r[0]
eq_(len(g), 3)
def test_trim_all_ref_groups():
def test_trim_all_ref_groups(self):
# When all files of a group are ref, don't include that group in the results, but also don't
# count the files from that group as discarded.
s = Scanner()
@ -72,7 +80,7 @@ def test_trim_all_ref_groups():
eq_(len(r), 1)
eq_(s.discarded_file_count, 0)
def test_priorize():
def test_priorize(self):
s = Scanner()
f = [no('foo'), no('foo'), no('bar'), no('bar')]
f[1].size = 2
@ -85,7 +93,7 @@ def test_priorize():
assert f[3] in (g1.ref,g2.ref)
assert f[2] in (g1.dupes[0],g2.dupes[0])
def test_content_scan():
def test_content_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'), no('bar'), no('bleh')]
@ -97,7 +105,7 @@ def test_content_scan():
eq_(len(r[0]), 2)
eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!
def test_content_scan_compare_sizes_first():
def test_content_scan_compare_sizes_first(self):
class MyFile(no):
@property
def md5(file):
@ -108,7 +116,7 @@ def test_content_scan_compare_sizes_first():
f = [MyFile('foo', 1), MyFile('bar', 2)]
eq_(len(s.GetDupeGroups(f)), 0)
def test_min_match_perc_doesnt_matter_for_content_scan():
def test_min_match_perc_doesnt_matter_for_content_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'), no('bar'), no('bleh')]
@ -124,7 +132,7 @@ def test_min_match_perc_doesnt_matter_for_content_scan():
eq_(len(r), 1)
eq_(len(r[0]), 2)
def test_content_scan_doesnt_put_md5_in_words_at_the_end():
def test_content_scan_doesnt_put_md5_in_words_at_the_end(self):
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'),no('bar')]
@ -133,7 +141,7 @@ def test_content_scan_doesnt_put_md5_in_words_at_the_end():
r = s.GetDupeGroups(f)
g = r[0]
def test_extension_is_not_counted_in_filename_scan():
def test_extension_is_not_counted_in_filename_scan(self):
s = Scanner()
s.min_match_percentage = 100
f = [no('foo.bar'), no('foo.bleh')]
@ -141,7 +149,7 @@ def test_extension_is_not_counted_in_filename_scan():
eq_(len(r), 1)
eq_(len(r[0]), 2)
def test_job():
def test_job(self):
def do_progress(progress, desc=''):
log.append(progress)
return True
@ -153,14 +161,14 @@ def test_job():
eq_(log[0], 0)
eq_(log[-1], 100)
def test_mix_file_kind():
def test_mix_file_kind(self):
s = Scanner()
s.mix_file_kind = False
f = [no('foo.1'), no('foo.2')]
r = s.GetDupeGroups(f)
eq_(len(r), 0)
def test_word_weighting():
def test_word_weighting(self):
s = Scanner()
s.min_match_percentage = 75
s.word_weighting = True
@ -171,28 +179,28 @@ def test_word_weighting():
m = g.get_match_of(g.dupes[0])
eq_(m.percentage, 75) # 16 letters, 12 matching
def test_similar_words():
def test_similar_words(self):
s = Scanner()
s.match_similar_words = True
f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]
r = s.GetDupeGroups(f)
eq_(len(r), 2)
def test_fields():
def test_fields(self):
s = Scanner()
s.scan_type = SCAN_TYPE_FIELDS
f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')]
r = s.GetDupeGroups(f)
eq_(len(r), 0)
def test_fields_no_order():
def test_fields_no_order(self):
s = Scanner()
s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')]
r = s.GetDupeGroups(f)
eq_(len(r), 1)
def test_tag_scan():
def test_tag_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
o1 = no('foo')
@ -204,7 +212,7 @@ def test_tag_scan():
r = s.GetDupeGroups([o1,o2])
eq_(len(r), 1)
def test_tag_with_album_scan():
def test_tag_with_album_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['artist', 'album', 'title'])
@ -223,7 +231,7 @@ def test_tag_with_album_scan():
r = s.GetDupeGroups([o1,o2,o3])
eq_(len(r), 1)
def test_that_dash_in_tags_dont_create_new_fields():
def test_that_dash_in_tags_dont_create_new_fields(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['artist', 'album', 'title'])
@ -239,7 +247,7 @@ def test_that_dash_in_tags_dont_create_new_fields():
r = s.GetDupeGroups([o1,o2])
eq_(len(r), 1)
def test_tag_scan_with_different_scanned():
def test_tag_scan_with_different_scanned(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['track', 'year'])
@ -256,7 +264,7 @@ def test_tag_scan_with_different_scanned():
r = s.GetDupeGroups([o1, o2])
eq_(len(r), 1)
def test_tag_scan_only_scans_existing_tags():
def test_tag_scan_only_scans_existing_tags(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['artist', 'foo'])
@ -269,7 +277,7 @@ def test_tag_scan_only_scans_existing_tags():
r = s.GetDupeGroups([o1, o2])
eq_(len(r), 1) # Because 'foo' is not scanned, they match
def test_tag_scan_converts_to_str():
def test_tag_scan_converts_to_str(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['track'])
@ -283,7 +291,7 @@ def test_tag_scan_converts_to_str():
raise AssertionError()
eq_(len(r), 1)
def test_tag_scan_non_ascii():
def test_tag_scan_non_ascii(self):
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['title'])
@ -297,7 +305,7 @@ def test_tag_scan_non_ascii():
raise AssertionError()
eq_(len(r), 1)
def test_audio_content_scan():
def test_audio_content_scan(self):
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT_AUDIO
f = [no('foo'), no('bar'), no('bleh')]
@ -314,7 +322,7 @@ def test_audio_content_scan():
eq_(len(r), 1)
eq_(len(r[0]), 2)
def test_audio_content_scan_compare_sizes_first():
def test_audio_content_scan_compare_sizes_first(self):
class MyFile(no):
@property
def md5partial(file):
@ -327,7 +335,7 @@ def test_audio_content_scan_compare_sizes_first():
f[1].audiosize = 2
eq_(len(s.GetDupeGroups(f)), 0)
def test_ignore_list():
def test_ignore_list(self):
s = Scanner()
f1 = no('foobar')
f2 = no('foobar')
@ -347,7 +355,7 @@ def test_ignore_list():
# Ignored matches are not counted as discarded
eq_(s.discarded_file_count, 0)
def test_ignore_list_checks_for_unicode():
def test_ignore_list_checks_for_unicode(self):
#scanner was calling path_str for ignore list checks. Since the Path changes, it must
#be unicode(path)
s = Scanner()
@ -367,7 +375,7 @@ def test_ignore_list_checks_for_unicode():
assert f2 in g
assert f3 in g
def test_file_evaluates_to_false():
def test_file_evaluates_to_false(self):
# A very wrong way to use any() was added at some point, causing the resulting group list
# to be empty.
class FalseNamedObject(NamedObject):
@ -381,7 +389,7 @@ def test_file_evaluates_to_false():
r = s.GetDupeGroups([f1, f2])
eq_(len(r), 1)
def test_size_threshold():
def test_size_threshold(self):
# Only file equal or higher than the size_threshold in size are scanned
s = Scanner()
f1 = no('foo', 1)
@ -396,7 +404,7 @@ def test_size_threshold():
assert f2 in group
assert f3 in group
def test_tie_breaker_path_deepness():
def test_tie_breaker_path_deepness(self):
# If there is a tie in prioritization, path deepness is used as a tie breaker
s = Scanner()
o1, o2 = no('foo'), no('foo')
@ -405,7 +413,7 @@ def test_tie_breaker_path_deepness():
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2
def test_tie_breaker_copy():
def test_tie_breaker_copy(self):
# if copy is in the words used (even if it has a deeper path), it becomes a dupe
s = Scanner()
o1, o2 = no('foo bar Copy'), no('foo bar')
@ -414,7 +422,7 @@ def test_tie_breaker_copy():
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2
def test_tie_breaker_same_name_plus_digit():
def test_tie_breaker_same_name_plus_digit(self):
# if ref has the same words as dupe, but has just one extra word which is a digit, it
# becomes a dupe
s = Scanner()
@ -424,7 +432,7 @@ def test_tie_breaker_same_name_plus_digit():
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2
def test_partial_group_match():
def test_partial_group_match(self):
# Count the number of discarded matches (when a file doesn't match all other dupes of the
# group) in Scanner.discarded_file_count
s = Scanner()
@ -436,3 +444,24 @@ def test_partial_group_match():
assert o2 in group
assert o3 not in group
eq_(s.discarded_file_count, 1)
class ScannerTest(TestCase):
def test_dont_group_files_that_dont_exist(self):
# when creating groups, check that files exist first. It's possible that these files have
# been moved during the scan by the user.
# In this test, we have to delete one of the files between the get_matches() part and the
# get_groups() part.
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
p = self.tmppath()
io.open(p + 'file1', 'w').write('foo')
io.open(p + 'file2', 'w').write('foo')
file1, file2 = fs.get_files(p)
def getmatches(*args, **kw):
io.remove(file2.path)
return [Match(file1, file2, 100)]
s._getmatches = getmatches
assert not s.GetDupeGroups([file1, file2])