[#72 state:fixed] When files are deleted during the scan, don't include them in the grouping phase.

--HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40225
2026-02-03 20:01:38 +00:00 · 2009-10-30 11:09:04 +00:00
parent 88127d8b8d
commit f070e90347
5 changed files with 448 additions and 406 deletions
--- a/base/py/engine.py
+++ b/base/py/engine.py
@@ -208,7 +208,9 @@ def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob)
    j = j.start_subjob([2, 8])
    size2files = defaultdict(set)
    for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
-        size2files[getattr(file, sizeattr)].add(file)
+        filesize = getattr(file, sizeattr)
+        if filesize:
+            size2files[filesize].add(file)
    possible_matches = [files for files in size2files.values() if len(files) > 1]
    del size2files
    result = []
--- a/base/py/scanner.py
+++ b/base/py/scanner.py
@@ -10,7 +10,7 @@
 import logging


-from hsutil import job
+from hsutil import job, io
 from hsutil.misc import dedupe
 from hsutil.str import get_file_ext, rem_file_ext

@@ -80,9 +80,10 @@ class Scanner(object):
        logging.info('Getting matches')
        matches = self._getmatches(files, j)
        logging.info('Found %d matches' % len(matches))
+        j.set_progress(100, 'Removing false matches')
        if not self.mix_file_kind:
-            j.set_progress(100, 'Removing false matches')
            matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
+        matches = [m for m in matches if io.exists(m.first.path) and io.exists(m.second.path)]
        if self.ignore_list:
            j = j.start_subjob(2)
            iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
--- a/base/py/tests/engine_test.py
+++ b/base/py/tests/engine_test.py
@@ -15,16 +15,21 @@ from hsutil import job
 from hsutil.decorators import log_calls
 from hsutil.testcase import TestCase

-from .. import engine
+from .. import engine, fs
 from ..engine import *

 class NamedObject(object):
-    def __init__(self, name="foobar", with_words=False):
+    def __init__(self, name="foobar", with_words=False, size=1):
        self.name = name
+        self.size = size
+        self.md5partial = name
+        self.md5 = name
        if with_words:
            self.words = getwords(name)
    

+no = NamedObject
+
 def get_match_triangle():
    o1 = NamedObject(with_words=True)
    o2 = NamedObject(with_words=True)
@@ -486,6 +491,12 @@ class GetMatches(TestCase):
        self.assertEqual(42, len(r))
    

+class GetMatchesByContents(TestCase):
+    def test_dont_compare_empty_files(self):
+        o1, o2 = no(size=0), no(size=0)
+        assert not getmatches_by_contents([o1, o2])
+    
+
 class TCGroup(TestCase):
    def test_empy(self):
        g = Group()
--- a/base/py/tests/results_test.py
+++ b/base/py/tests/results_test.py
@@ -21,7 +21,6 @@ from .. import engine
 from ..results import *

 class NamedObject(engine_test.NamedObject):
-    size = 1
    path = property(lambda x:Path('basepath') + x.name)
    is_ref = False
    
--- a/base/py/tests/scanner_test.py
+++ b/base/py/tests/scanner_test.py
@@ -9,9 +9,11 @@

 from nose.tools import eq_

-from hsutil import job
+from hsutil import job, io
 from hsutil.path import Path
+from hsutil.testcase import TestCase

+from .. import fs
 from ..engine import getwords, Match
 from ..ignore import IgnoreList
 from ..scanner import *
@@ -27,412 +29,439 @@ class NamedObject(object):
 no = NamedObject

 #--- Scanner
-def test_empty():
-    s = Scanner()
-    r = s.GetDupeGroups([])
-    eq_(r, [])
-
-def test_default_settings():
-    s = Scanner()
-    eq_(s.min_match_percentage, 80)
-    eq_(s.scan_type, SCAN_TYPE_FILENAME)
-    eq_(s.mix_file_kind, True)
-    eq_(s.word_weighting, False)
-    eq_(s.match_similar_words, False)
-    assert isinstance(s.ignore_list, IgnoreList)
-
-def test_simple_with_default_settings():
-    s = Scanner()
-    f = [no('foo bar'), no('foo bar'), no('foo bleh')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    g = r[0]
-    #'foo bleh' cannot be in the group because the default min match % is 80
-    eq_(len(g), 2)
-    assert g.ref in f[:2]
-    assert g.dupes[0] in f[:2]
-
-def test_simple_with_lower_min_match():
-    s = Scanner()
-    s.min_match_percentage = 50
-    f = [no('foo bar'), no('foo bar'), no('foo bleh')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    g = r[0]
-    eq_(len(g), 3)
-
-def test_trim_all_ref_groups():
-    # When all files of a group are ref, don't include that group in the results, but also don't
-    # count the files from that group as discarded.
-    s = Scanner()
-    f = [no('foo'), no('foo'), no('bar'), no('bar')]
-    f[2].is_ref = True
-    f[3].is_ref = True
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(s.discarded_file_count, 0)
-
-def test_priorize():
-    s = Scanner()
-    f = [no('foo'), no('foo'), no('bar'), no('bar')]
-    f[1].size = 2
-    f[2].size = 3
-    f[3].is_ref = True
-    r = s.GetDupeGroups(f)
-    g1, g2 = r
-    assert f[1] in (g1.ref,g2.ref)
-    assert f[0] in (g1.dupes[0],g2.dupes[0])
-    assert f[3] in (g1.ref,g2.ref)
-    assert f[2] in (g1.dupes[0],g2.dupes[0])
-
-def test_content_scan():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT
-    f = [no('foo'), no('bar'), no('bleh')]
-    f[0].md5 = f[0].md5partial = 'foobar'
-    f[1].md5 = f[1].md5partial = 'foobar'
-    f[2].md5 = f[2].md5partial = 'bleh'
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(len(r[0]), 2)
-    eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!
-
-def test_content_scan_compare_sizes_first():
-    class MyFile(no):
-        @property
-        def md5(file):
-            raise AssertionError()
+class ScannerTestFakeFiles(TestCase):
+    def setUp(self):
+        # This is a hack to avoid invalidating all previous tests since the scanner started to test
+        # for file existence before doing the match grouping.
+        self.mock(io, 'exists', lambda _: True)
    
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT
-    f = [MyFile('foo', 1), MyFile('bar', 2)]
-    eq_(len(s.GetDupeGroups(f)), 0)
-
-def test_min_match_perc_doesnt_matter_for_content_scan():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT
-    f = [no('foo'), no('bar'), no('bleh')]
-    f[0].md5 = f[0].md5partial = 'foobar'
-    f[1].md5 = f[1].md5partial = 'foobar'
-    f[2].md5 = f[2].md5partial = 'bleh'
-    s.min_match_percentage = 101
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(len(r[0]), 2)
-    s.min_match_percentage = 0
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(len(r[0]), 2)
-
-def test_content_scan_doesnt_put_md5_in_words_at_the_end():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT
-    f = [no('foo'),no('bar')]
-    f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
-    f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
-    r = s.GetDupeGroups(f)
-    g = r[0]
-
-def test_extension_is_not_counted_in_filename_scan():
-    s = Scanner()
-    s.min_match_percentage = 100
-    f = [no('foo.bar'), no('foo.bleh')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(len(r[0]), 2)
-
-def test_job():
-    def do_progress(progress, desc=''):
-        log.append(progress)
-        return True
+    def test_empty(self):
+        s = Scanner()
+        r = s.GetDupeGroups([])
+        eq_(r, [])
    
-    s = Scanner()
-    log = []
-    f = [no('foo bar'), no('foo bar'), no('foo bleh')]
-    r = s.GetDupeGroups(f, job.Job(1, do_progress))
-    eq_(log[0], 0)
-    eq_(log[-1], 100)
-
-def test_mix_file_kind():
-    s = Scanner()
-    s.mix_file_kind = False
-    f = [no('foo.1'), no('foo.2')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 0)
-
-def test_word_weighting():
-    s = Scanner()
-    s.min_match_percentage = 75
-    s.word_weighting = True
-    f = [no('foo bar'), no('foo bar bleh')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    g = r[0]
-    m = g.get_match_of(g.dupes[0])
-    eq_(m.percentage, 75) # 16 letters, 12 matching
-
-def test_similar_words():
-    s = Scanner()
-    s.match_similar_words = True
-    f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 2)
-
-def test_fields():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_FIELDS
-    f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 0)
-
-def test_fields_no_order():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
-    f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-
-def test_tag_scan():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.artist = 'The White Stripes'
-    o1.title = 'The Air Near My Fingers'
-    o2.artist = 'The White Stripes'
-    o2.title = 'The Air Near My Fingers'
-    r = s.GetDupeGroups([o1,o2])
-    eq_(len(r), 1)
-
-def test_tag_with_album_scan():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['artist', 'album', 'title'])
-    o1 = no('foo')
-    o2 = no('bar')
-    o3 = no('bleh')
-    o1.artist = 'The White Stripes'
-    o1.title = 'The Air Near My Fingers'
-    o1.album = 'Elephant'
-    o2.artist = 'The White Stripes'
-    o2.title = 'The Air Near My Fingers'
-    o2.album = 'Elephant'
-    o3.artist = 'The White Stripes'
-    o3.title = 'The Air Near My Fingers'
-    o3.album = 'foobar'
-    r = s.GetDupeGroups([o1,o2,o3])
-    eq_(len(r), 1)
-
-def test_that_dash_in_tags_dont_create_new_fields():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['artist', 'album', 'title'])
-    s.min_match_percentage = 50
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.artist = 'The White Stripes - a'
-    o1.title = 'The Air Near My Fingers - a'
-    o1.album = 'Elephant - a'
-    o2.artist = 'The White Stripes - b'
-    o2.title = 'The Air Near My Fingers - b'
-    o2.album = 'Elephant - b'
-    r = s.GetDupeGroups([o1,o2])
-    eq_(len(r), 1)
-
-def test_tag_scan_with_different_scanned():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['track', 'year'])
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.artist = 'The White Stripes'
-    o1.title = 'some title'
-    o1.track = 'foo'
-    o1.year = 'bar'
-    o2.artist = 'The White Stripes'
-    o2.title = 'another title'
-    o2.track = 'foo'
-    o2.year = 'bar'
-    r = s.GetDupeGroups([o1, o2])
-    eq_(len(r), 1)
-
-def test_tag_scan_only_scans_existing_tags():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['artist', 'foo'])
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.artist = 'The White Stripes'
-    o1.foo = 'foo'
-    o2.artist = 'The White Stripes'
-    o2.foo = 'bar'
-    r = s.GetDupeGroups([o1, o2])
-    eq_(len(r), 1) # Because 'foo' is not scanned, they match
-
-def test_tag_scan_converts_to_str():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['track'])
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.track = 42
-    o2.track = 42
-    try:
+    def test_default_settings(self):
+        s = Scanner()
+        eq_(s.min_match_percentage, 80)
+        eq_(s.scan_type, SCAN_TYPE_FILENAME)
+        eq_(s.mix_file_kind, True)
+        eq_(s.word_weighting, False)
+        eq_(s.match_similar_words, False)
+        assert isinstance(s.ignore_list, IgnoreList)
+    
+    def test_simple_with_default_settings(self):
+        s = Scanner()
+        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        g = r[0]
+        #'foo bleh' cannot be in the group because the default min match % is 80
+        eq_(len(g), 2)
+        assert g.ref in f[:2]
+        assert g.dupes[0] in f[:2]
+    
+    def test_simple_with_lower_min_match(self):
+        s = Scanner()
+        s.min_match_percentage = 50
+        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        g = r[0]
+        eq_(len(g), 3)
+    
+    def test_trim_all_ref_groups(self):
+        # When all files of a group are ref, don't include that group in the results, but also don't
+        # count the files from that group as discarded.
+        s = Scanner()
+        f = [no('foo'), no('foo'), no('bar'), no('bar')]
+        f[2].is_ref = True
+        f[3].is_ref = True
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(s.discarded_file_count, 0)
+    
+    def test_priorize(self):
+        s = Scanner()
+        f = [no('foo'), no('foo'), no('bar'), no('bar')]
+        f[1].size = 2
+        f[2].size = 3
+        f[3].is_ref = True
+        r = s.GetDupeGroups(f)
+        g1, g2 = r
+        assert f[1] in (g1.ref,g2.ref)
+        assert f[0] in (g1.dupes[0],g2.dupes[0])
+        assert f[3] in (g1.ref,g2.ref)
+        assert f[2] in (g1.dupes[0],g2.dupes[0])
+    
+    def test_content_scan(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT
+        f = [no('foo'), no('bar'), no('bleh')]
+        f[0].md5 = f[0].md5partial = 'foobar'
+        f[1].md5 = f[1].md5partial = 'foobar'
+        f[2].md5 = f[2].md5partial = 'bleh'
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(len(r[0]), 2)
+        eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!
+    
+    def test_content_scan_compare_sizes_first(self):
+        class MyFile(no):
+            @property
+            def md5(file):
+                raise AssertionError()
+    
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT
+        f = [MyFile('foo', 1), MyFile('bar', 2)]
+        eq_(len(s.GetDupeGroups(f)), 0)
+    
+    def test_min_match_perc_doesnt_matter_for_content_scan(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT
+        f = [no('foo'), no('bar'), no('bleh')]
+        f[0].md5 = f[0].md5partial = 'foobar'
+        f[1].md5 = f[1].md5partial = 'foobar'
+        f[2].md5 = f[2].md5partial = 'bleh'
+        s.min_match_percentage = 101
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(len(r[0]), 2)
+        s.min_match_percentage = 0
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(len(r[0]), 2)
+    
+    def test_content_scan_doesnt_put_md5_in_words_at_the_end(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT
+        f = [no('foo'),no('bar')]
+        f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
+        f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
+        r = s.GetDupeGroups(f)
+        g = r[0]
+    
+    def test_extension_is_not_counted_in_filename_scan(self):
+        s = Scanner()
+        s.min_match_percentage = 100
+        f = [no('foo.bar'), no('foo.bleh')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(len(r[0]), 2)
+    
+    def test_job(self):
+        def do_progress(progress, desc=''):
+            log.append(progress)
+            return True
+    
+        s = Scanner()
+        log = []
+        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
+        r = s.GetDupeGroups(f, job.Job(1, do_progress))
+        eq_(log[0], 0)
+        eq_(log[-1], 100)
+    
+    def test_mix_file_kind(self):
+        s = Scanner()
+        s.mix_file_kind = False
+        f = [no('foo.1'), no('foo.2')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 0)
+    
+    def test_word_weighting(self):
+        s = Scanner()
+        s.min_match_percentage = 75
+        s.word_weighting = True
+        f = [no('foo bar'), no('foo bar bleh')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        g = r[0]
+        m = g.get_match_of(g.dupes[0])
+        eq_(m.percentage, 75) # 16 letters, 12 matching
+    
+    def test_similar_words(self):
+        s = Scanner()
+        s.match_similar_words = True
+        f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 2)
+    
+    def test_fields(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_FIELDS
+        f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 0)
+    
+    def test_fields_no_order(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
+        f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+    
+    def test_tag_scan(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.artist = 'The White Stripes'
+        o1.title = 'The Air Near My Fingers'
+        o2.artist = 'The White Stripes'
+        o2.title = 'The Air Near My Fingers'
+        r = s.GetDupeGroups([o1,o2])
+        eq_(len(r), 1)
+    
+    def test_tag_with_album_scan(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['artist', 'album', 'title'])
+        o1 = no('foo')
+        o2 = no('bar')
+        o3 = no('bleh')
+        o1.artist = 'The White Stripes'
+        o1.title = 'The Air Near My Fingers'
+        o1.album = 'Elephant'
+        o2.artist = 'The White Stripes'
+        o2.title = 'The Air Near My Fingers'
+        o2.album = 'Elephant'
+        o3.artist = 'The White Stripes'
+        o3.title = 'The Air Near My Fingers'
+        o3.album = 'foobar'
+        r = s.GetDupeGroups([o1,o2,o3])
+        eq_(len(r), 1)
+    
+    def test_that_dash_in_tags_dont_create_new_fields(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['artist', 'album', 'title'])
+        s.min_match_percentage = 50
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.artist = 'The White Stripes - a'
+        o1.title = 'The Air Near My Fingers - a'
+        o1.album = 'Elephant - a'
+        o2.artist = 'The White Stripes - b'
+        o2.title = 'The Air Near My Fingers - b'
+        o2.album = 'Elephant - b'
+        r = s.GetDupeGroups([o1,o2])
+        eq_(len(r), 1)
+    
+    def test_tag_scan_with_different_scanned(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['track', 'year'])
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.artist = 'The White Stripes'
+        o1.title = 'some title'
+        o1.track = 'foo'
+        o1.year = 'bar'
+        o2.artist = 'The White Stripes'
+        o2.title = 'another title'
+        o2.track = 'foo'
+        o2.year = 'bar'
        r = s.GetDupeGroups([o1, o2])
-    except TypeError:
-        raise AssertionError()
-    eq_(len(r), 1)
-
-def test_tag_scan_non_ascii():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['title'])
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.title = u'foobar\u00e9'
-    o2.title = u'foobar\u00e9'
-    try:
+        eq_(len(r), 1)
+    
+    def test_tag_scan_only_scans_existing_tags(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['artist', 'foo'])
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.artist = 'The White Stripes'
+        o1.foo = 'foo'
+        o2.artist = 'The White Stripes'
+        o2.foo = 'bar'
        r = s.GetDupeGroups([o1, o2])
-    except UnicodeEncodeError:
-        raise AssertionError()
-    eq_(len(r), 1)
-
-def test_audio_content_scan():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT_AUDIO
-    f = [no('foo'), no('bar'), no('bleh')]
-    f[0].md5 = 'foo'
-    f[1].md5 = 'bar'
-    f[2].md5 = 'bleh'
-    f[0].md5partial = 'foo'
-    f[1].md5partial = 'foo'
-    f[2].md5partial = 'bleh'
-    f[0].audiosize = 1
-    f[1].audiosize = 1
-    f[2].audiosize = 1
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(len(r[0]), 2)
+        eq_(len(r), 1) # Because 'foo' is not scanned, they match
    
-def test_audio_content_scan_compare_sizes_first():
-    class MyFile(no):
-        @property
-        def md5partial(file):
+    def test_tag_scan_converts_to_str(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['track'])
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.track = 42
+        o2.track = 42
+        try:
+            r = s.GetDupeGroups([o1, o2])
+        except TypeError:
            raise AssertionError()
+        eq_(len(r), 1)
    
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT_AUDIO
-    f = [MyFile('foo'), MyFile('bar')]
-    f[0].audiosize = 1
-    f[1].audiosize = 2
-    eq_(len(s.GetDupeGroups(f)), 0)
-
-def test_ignore_list():
-    s = Scanner()
-    f1 = no('foobar')
-    f2 = no('foobar')
-    f3 = no('foobar')
-    f1.path = Path('dir1/foobar')
-    f2.path = Path('dir2/foobar')
-    f3.path = Path('dir3/foobar')
-    s.ignore_list.Ignore(str(f1.path),str(f2.path))
-    s.ignore_list.Ignore(str(f1.path),str(f3.path))
-    r = s.GetDupeGroups([f1,f2,f3])
-    eq_(len(r), 1)
-    g = r[0]
-    eq_(len(g.dupes), 1)
-    assert f1 not in g
-    assert f2 in g
-    assert f3 in g
-    # Ignored matches are not counted as discarded
-    eq_(s.discarded_file_count, 0)
-
-def test_ignore_list_checks_for_unicode():
-    #scanner was calling path_str for ignore list checks. Since the Path changes, it must
-    #be unicode(path)
-    s = Scanner()
-    f1 = no('foobar')
-    f2 = no('foobar')
-    f3 = no('foobar')
-    f1.path = Path(u'foo1\u00e9')
-    f2.path = Path(u'foo2\u00e9')
-    f3.path = Path(u'foo3\u00e9')
-    s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
-    s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
-    r = s.GetDupeGroups([f1,f2,f3])
-    eq_(len(r), 1)
-    g = r[0]
-    eq_(len(g.dupes), 1)
-    assert f1 not in g
-    assert f2 in g
-    assert f3 in g
-
-def test_file_evaluates_to_false():
-    # A very wrong way to use any() was added at some point, causing resulting group list
-    # to be empty.
-    class FalseNamedObject(NamedObject):
-        def __nonzero__(self):
-            return False
+    def test_tag_scan_non_ascii(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['title'])
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.title = u'foobar\u00e9'
+        o2.title = u'foobar\u00e9'
+        try:
+            r = s.GetDupeGroups([o1, o2])
+        except UnicodeEncodeError:
+            raise AssertionError()
+        eq_(len(r), 1)
+    
+    def test_audio_content_scan(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT_AUDIO
+        f = [no('foo'), no('bar'), no('bleh')]
+        f[0].md5 = 'foo'
+        f[1].md5 = 'bar'
+        f[2].md5 = 'bleh'
+        f[0].md5partial = 'foo'
+        f[1].md5partial = 'foo'
+        f[2].md5partial = 'bleh'
+        f[0].audiosize = 1
+        f[1].audiosize = 1
+        f[2].audiosize = 1
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(len(r[0]), 2)
+    
+    def test_audio_content_scan_compare_sizes_first(self):
+        class MyFile(no):
+            @property
+            def md5partial(file):
+                raise AssertionError()
+    
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT_AUDIO
+        f = [MyFile('foo'), MyFile('bar')]
+        f[0].audiosize = 1
+        f[1].audiosize = 2
+        eq_(len(s.GetDupeGroups(f)), 0)
+    
+    def test_ignore_list(self):
+        s = Scanner()
+        f1 = no('foobar')
+        f2 = no('foobar')
+        f3 = no('foobar')
+        f1.path = Path('dir1/foobar')
+        f2.path = Path('dir2/foobar')
+        f3.path = Path('dir3/foobar')
+        s.ignore_list.Ignore(str(f1.path),str(f2.path))
+        s.ignore_list.Ignore(str(f1.path),str(f3.path))
+        r = s.GetDupeGroups([f1,f2,f3])
+        eq_(len(r), 1)
+        g = r[0]
+        eq_(len(g.dupes), 1)
+        assert f1 not in g
+        assert f2 in g
+        assert f3 in g
+        # Ignored matches are not counted as discarded
+        eq_(s.discarded_file_count, 0)
+    
+    def test_ignore_list_checks_for_unicode(self):
+        #scanner was calling path_str for ignore list checks. Since the Path changes, it must
+        #be unicode(path)
+        s = Scanner()
+        f1 = no('foobar')
+        f2 = no('foobar')
+        f3 = no('foobar')
+        f1.path = Path(u'foo1\u00e9')
+        f2.path = Path(u'foo2\u00e9')
+        f3.path = Path(u'foo3\u00e9')
+        s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
+        s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
+        r = s.GetDupeGroups([f1,f2,f3])
+        eq_(len(r), 1)
+        g = r[0]
+        eq_(len(g.dupes), 1)
+        assert f1 not in g
+        assert f2 in g
+        assert f3 in g
+    
+    def test_file_evaluates_to_false(self):
+        # A very wrong way to use any() was added at some point, causing resulting group list
+        # to be empty.
+        class FalseNamedObject(NamedObject):
+            def __nonzero__(self):
+                return False
        
    
-    s = Scanner()
-    f1 = FalseNamedObject('foobar')
-    f2 = FalseNamedObject('foobar')
-    r = s.GetDupeGroups([f1, f2])
-    eq_(len(r), 1)
+        s = Scanner()
+        f1 = FalseNamedObject('foobar')
+        f2 = FalseNamedObject('foobar')
+        r = s.GetDupeGroups([f1, f2])
+        eq_(len(r), 1)
+    
+    def test_size_threshold(self):
+        # Only files whose size is equal to or greater than size_threshold are scanned
+        s = Scanner()
+        f1 = no('foo', 1)
+        f2 = no('foo', 2)
+        f3 = no('foo', 3)
+        s.size_threshold = 2
+        groups = s.GetDupeGroups([f1,f2,f3])
+        eq_(len(groups), 1)
+        [group] = groups
+        eq_(len(group), 2)
+        assert f1 not in group
+        assert f2 in group
+        assert f3 in group
+    
+    def test_tie_breaker_path_deepness(self):
+        # If there is a tie in prioritization, path deepness is used as a tie breaker
+        s = Scanner()
+        o1, o2 = no('foo'), no('foo')
+        o1.path = Path('foo')
+        o2.path = Path('foo/bar')
+        [group] = s.GetDupeGroups([o1, o2])
+        assert group.ref is o2
+    
+    def test_tie_breaker_copy(self):
+        # if copy is in the words used (even if it has a deeper path), it becomes a dupe
+        s = Scanner()
+        o1, o2 = no('foo bar Copy'), no('foo bar')
+        o1.path = Path('deeper/path')
+        o2.path = Path('foo')
+        [group] = s.GetDupeGroups([o1, o2])
+        assert group.ref is o2
+    
+    def test_tie_breaker_same_name_plus_digit(self):
+        # if ref has the same words as dupe, plus just one extra word which is a digit, it
+        # becomes a dupe
+        s = Scanner()
+        o1, o2 = no('foo bar 42'), no('foo bar')
+        o1.path = Path('deeper/path')
+        o2.path = Path('foo')
+        [group] = s.GetDupeGroups([o1, o2])
+        assert group.ref is o2
+    
+    def test_partial_group_match(self):
+        # Count the number of discarded matches (when a file doesn't match all other dupes of the
+        # group) in Scanner.discarded_file_count
+        s = Scanner()
+        o1, o2, o3 = no('a b'), no('a'), no('b')
+        s.min_match_percentage = 50
+        [group] = s.GetDupeGroups([o1, o2, o3])
+        eq_(len(group), 2)
+        assert o1 in group
+        assert o2 in group
+        assert o3 not in group
+        eq_(s.discarded_file_count, 1)
+    

-def test_size_threshold():
-    # Only file equal or higher than the size_threshold in size are scanned
-    s = Scanner()
-    f1 = no('foo', 1)
-    f2 = no('foo', 2)
-    f3 = no('foo', 3)
-    s.size_threshold = 2
-    groups = s.GetDupeGroups([f1,f2,f3])
-    eq_(len(groups), 1)
-    [group] = groups
-    eq_(len(group), 2)
-    assert f1 not in group
-    assert f2 in group
-    assert f3 in group
-
-def test_tie_breaker_path_deepness():
-    # If there is a tie in prioritization, path deepness is used as a tie breaker
-    s = Scanner()
-    o1, o2 = no('foo'), no('foo')
-    o1.path = Path('foo')
-    o2.path = Path('foo/bar')
-    [group] = s.GetDupeGroups([o1, o2])
-    assert group.ref is o2
-
-def test_tie_breaker_copy():
-    # if copy is in the words used (even if it has a deeper path), it becomes a dupe
-    s = Scanner()
-    o1, o2 = no('foo bar Copy'), no('foo bar')
-    o1.path = Path('deeper/path')
-    o2.path = Path('foo')
-    [group] = s.GetDupeGroups([o1, o2])
-    assert group.ref is o2
-
-def test_tie_breaker_same_name_plus_digit():
-    # if ref has the same words as dupe, but has some just one extra word which is a digit, it
-    # becomes a dupe
-    s = Scanner()
-    o1, o2 = no('foo bar 42'), no('foo bar')
-    o1.path = Path('deeper/path')
-    o2.path = Path('foo')
-    [group] = s.GetDupeGroups([o1, o2])
-    assert group.ref is o2
-
-def test_partial_group_match():
-    # Count the number od discarded matches (when a file doesn't match all other dupes of the 
-    # group) in Scanner.discarded_file_count
-    s = Scanner()
-    o1, o2, o3 = no('a b'), no('a'), no('b')
-    s.min_match_percentage = 50
-    [group] = s.GetDupeGroups([o1, o2, o3])
-    eq_(len(group), 2)
-    assert o1 in group
-    assert o2 in group
-    assert o3 not in group
-    eq_(s.discarded_file_count, 1)
+class ScannerTest(TestCase):
+    def test_dont_group_files_that_dont_exist(self):
+        # when creating groups, check that files exist first. It's possible that these files have
+        # been moved during the scan by the user.
+        # In this test, we have to delete one of the files between the get_matches() part and the
+        # get_groups() part.
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT
+        p = self.tmppath()
+        io.open(p + 'file1', 'w').write('foo')
+        io.open(p + 'file2', 'w').write('foo')
+        file1, file2 = fs.get_files(p)
+        def getmatches(*args, **kw):
+            io.remove(file2.path)
+            return [Match(file1, file2, 100)]
+        s._getmatches = getmatches
+        
+        assert not s.GetDupeGroups([file1, file2])
+