# dupeguru/core/tests/scanner_test.py

# Created By: Virgil Dupras
# Created On: 2006/03/03
# Copyright 2010 Hardcoded Software (http://www.hardcoded.net)
# 
# This software is licensed under the "BSD" License as described in the "LICENSE" file, 
# which should be included with this package. The terms are also available at 
# http://www.hardcoded.net/licenses/bsd_license

from jobprogress import job
from hscommon import io
from hscommon.path import Path
from hscommon.testutil import eq_

from .. import fs
from ..engine import getwords, Match
from ..ignore import IgnoreList
from ..scanner import *

class NamedObject:
    """Lightweight fake file object fed to the scanner in these tests.

    Keeps ``name`` and ``size`` as given, starts with an empty ``Path`` as ``path`` and stores
    the result of ``getwords(name)`` in ``words``.
    """

    def __init__(self, name="foobar", size=1):
        self.name, self.size = name, size
        self.path = Path('')
        self.words = getwords(name)

    def __repr__(self):
        # Show the name so failing assertions identify which fake file is involved.
        description = '<NamedObject %r>' % self.name
        return description
    

# Short alias for NamedObject; the tests below build their fake files with ``no(...)``.
no = NamedObject

def pytest_funcarg__fake_fileexists(request):
    """Funcarg that makes ``io.exists`` answer True for any path.

    This is a hack: the scanner started to test for file existence before doing the match
    grouping, and the fake objects used by the older tests have no real file behind them.
    """
    def always_exists(_):
        return True

    request.getfuncargvalue('monkeypatch').setattr(io, 'exists', always_exists)

def test_empty(fake_fileexists):
    """Scanning an empty file list returns an empty list of groups."""
    scanner = Scanner()
    eq_(scanner.GetDupeGroups([]), [])

def test_default_settings(fake_fileexists):
    """A freshly created Scanner exposes the expected default option values."""
    scanner = Scanner()
    expected_defaults = (
        ('min_match_percentage', 80),
        ('scan_type', ScanType.Filename),
        ('mix_file_kind', True),
        ('word_weighting', False),
        ('match_similar_words', False),
    )
    for option_name, expected_value in expected_defaults:
        eq_(getattr(scanner, option_name), expected_value)
    assert isinstance(scanner.ignore_list, IgnoreList)

def test_simple_with_default_settings(fake_fileexists):
    """With default settings, only the two identically named files end up grouped."""
    scanner = Scanner()
    files = [no(name) for name in ('foo bar', 'foo bar', 'foo bleh')]
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 1)
    group = groups[0]
    # 'foo bleh' cannot be in the group because the default min match % is 80
    eq_(len(group), 2)
    identical_pair = files[:2]
    assert group.ref in identical_pair
    assert group.dupes[0] in identical_pair

def test_simple_with_lower_min_match(fake_fileexists):
    """With the minimum match percentage at 50, all three files land in one group."""
    scanner = Scanner()
    scanner.min_match_percentage = 50
    files = [no(name) for name in ('foo bar', 'foo bar', 'foo bleh')]
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 1)
    eq_(len(groups[0]), 3)

def test_trim_all_ref_groups(fake_fileexists):
    """A group made only of ref files is dropped, and its files are not counted as discarded."""
    scanner = Scanner()
    files = [no(name) for name in ('foo', 'foo', 'bar', 'bar')]
    # Flag both 'bar' files as references.
    for ref_file in files[2:]:
        ref_file.is_ref = True
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 1)
    eq_(scanner.discarded_file_count, 0)

def test_priorize(fake_fileexists):
    """Check which file of each group is picked as ref.

    The bigger 'foo' is expected as ref of its group; the 'bar' flagged ``is_ref`` is expected
    as ref of the other one, even though the other 'bar' is bigger.
    """
    scanner = Scanner()
    files = [no('foo'), no('foo'), no('bar'), no('bar')]
    small_foo, big_foo, big_bar, ref_bar = files
    big_foo.size = 2
    big_bar.size = 3
    ref_bar.is_ref = True
    first, second = scanner.GetDupeGroups(files)
    assert big_foo in (first.ref, second.ref)
    assert small_foo in (first.dupes[0], second.dupes[0])
    assert ref_bar in (first.ref, second.ref)
    assert big_bar in (first.dupes[0], second.dupes[0])

def test_content_scan(fake_fileexists):
    """A contents scan groups the two files that share the same md5 values."""
    scanner = Scanner()
    scanner.scan_type = ScanType.Contents
    files = []
    for name, digest in (('foo', 'foobar'), ('bar', 'foobar'), ('bleh', 'bleh')):
        fake = no(name)
        fake.md5 = fake.md5partial = digest
        files.append(fake)
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 1)
    eq_(len(groups[0]), 2)
    # don't count the different md5 as discarded!
    eq_(scanner.discarded_file_count, 0)

def test_content_scan_compare_sizes_first(fake_fileexists):
    """Two files of different sizes yield no group, and their ``md5`` is never read."""
    class Md5MustNotBeRead(no):
        @property
        def md5(self):
            # Any access to md5 makes the test fail.
            raise AssertionError()

    scanner = Scanner()
    scanner.scan_type = ScanType.Contents
    files = [Md5MustNotBeRead('foo', 1), Md5MustNotBeRead('bar', 2)]
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 0)

def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
    """A contents scan gives the same single pair whether min_match_percentage is 101 or 0."""
    scanner = Scanner()
    scanner.scan_type = ScanType.Contents
    files = [no('foo'), no('bar'), no('bleh')]
    for fake, digest in zip(files, ('foobar', 'foobar', 'bleh')):
        fake.md5 = fake.md5partial = digest
    for percentage in (101, 0):
        scanner.min_match_percentage = percentage
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 1)
        eq_(len(groups[0]), 2)

def test_content_scan_doesnt_put_md5_in_words_at_the_end(fake_fileexists):
    """Run a contents scan on two files whose md5 values are made of control characters.

    There is no explicit assertion: the test passes as long as the scan completes and
    returns at least one group.
    """
    scanner = Scanner()
    scanner.scan_type = ScanType.Contents
    digest = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
    files = [no('foo'), no('bar')]
    for fake in files:
        fake.md5 = fake.md5partial = digest
    groups = scanner.GetDupeGroups(files)
    # Indexing fails with IndexError if the scan produced no group.
    group = groups[0]

def test_extension_is_not_counted_in_filename_scan(fake_fileexists):
    """'foo.bar' and 'foo.bleh' still match at a 100% threshold in a filename scan."""
    scanner = Scanner()
    scanner.min_match_percentage = 100
    files = [no(name) for name in ('foo.bar', 'foo.bleh')]
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 1)
    eq_(len(groups[0]), 2)

def test_job(fake_fileexists):
    """Progress reported through the job callback starts at 0 and finishes at 100."""
    reported = []

    def record_progress(progress, desc=''):
        # Collect every progress value; the callback always answers True.
        reported.append(progress)
        return True

    scanner = Scanner()
    files = [no(name) for name in ('foo bar', 'foo bar', 'foo bleh')]
    scanner.GetDupeGroups(files, job.Job(1, record_progress))
    eq_(reported[0], 0)
    eq_(reported[-1], 100)

def test_mix_file_kind(fake_fileexists):
    """With mix_file_kind turned off, 'foo.1' and 'foo.2' are not grouped."""
    scanner = Scanner()
    scanner.mix_file_kind = False
    files = [no(name) for name in ('foo.1', 'foo.2')]
    eq_(len(scanner.GetDupeGroups(files)), 0)

def test_word_weighting(fake_fileexists):
    """With word weighting on, 'foo bar' vs 'foo bar bleh' is reported as a 75% match."""
    scanner = Scanner()
    scanner.min_match_percentage = 75
    scanner.word_weighting = True
    files = [no('foo bar'), no('foo bar bleh')]
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 1)
    group = groups[0]
    match = group.get_match_of(group.dupes[0])
    # 16 letters, 12 matching
    eq_(match.percentage, 75)

def test_similar_words(fake_fileexists):
    """With match_similar_words on, the four near-identical names form two groups."""
    scanner = Scanner()
    scanner.match_similar_words = True
    names = ('The White Stripes', 'The Whites Stripe', 'Limp Bizkit', 'Limp Bizkitt')
    groups = scanner.GetDupeGroups([no(name) for name in names])
    eq_(len(groups), 2)

def test_fields(fake_fileexists):
    """In a Fields scan, names that differ in their second dash-separated part don't match."""
    scanner = Scanner()
    scanner.scan_type = ScanType.Fields
    names = ('The White Stripes - Little Ghost', 'The White Stripes - Little Acorn')
    groups = scanner.GetDupeGroups([no(name) for name in names])
    eq_(len(groups), 0)

def test_fields_no_order(fake_fileexists):
    """In a FieldsNoOrder scan, the same two parts in swapped order still match."""
    scanner = Scanner()
    scanner.scan_type = ScanType.FieldsNoOrder
    names = ('The White Stripes - Little Ghost', 'Little Ghost - The White Stripes')
    groups = scanner.GetDupeGroups([no(name) for name in names])
    eq_(len(groups), 1)

def test_tag_scan(fake_fileexists):
    """A Tag scan groups two differently named files with identical artist and title."""
    scanner = Scanner()
    scanner.scan_type = ScanType.Tag
    first, second = no('foo'), no('bar')
    for tagged in (first, second):
        tagged.artist = 'The White Stripes'
        tagged.title = 'The Air Near My Fingers'
    groups = scanner.GetDupeGroups([first, second])
    eq_(len(groups), 1)

def test_tag_with_album_scan(fake_fileexists):
    """Three files share artist and title but one has another album; one group is expected."""
    scanner = Scanner()
    scanner.scan_type = ScanType.Tag
    scanner.scanned_tags = {'artist', 'album', 'title'}
    files = [no('foo'), no('bar'), no('bleh')]
    # Only the album of the third file differs.
    for tagged, album in zip(files, ('Elephant', 'Elephant', 'foobar')):
        tagged.artist = 'The White Stripes'
        tagged.title = 'The Air Near My Fingers'
        tagged.album = album
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 1)

def test_that_dash_in_tags_dont_create_new_fields(fake_fileexists):
    """Tags that differ only after a ' - ' suffix still match at a 50% threshold."""
    scanner = Scanner()
    scanner.scan_type = ScanType.Tag
    scanner.scanned_tags = {'artist', 'album', 'title'}
    scanner.min_match_percentage = 50
    first, second = no('foo'), no('bar')
    first.artist, first.title, first.album = (
        'The White Stripes - a', 'The Air Near My Fingers - a', 'Elephant - a')
    second.artist, second.title, second.album = (
        'The White Stripes - b', 'The Air Near My Fingers - b', 'Elephant - b')
    groups = scanner.GetDupeGroups([first, second])
    eq_(len(groups), 1)

def test_tag_scan_with_different_scanned(fake_fileexists):
    """With only 'track' and 'year' scanned, files with different titles still match."""
    scanner = Scanner()
    scanner.scan_type = ScanType.Tag
    scanner.scanned_tags = {'track', 'year'}
    first, second = no('foo'), no('bar')
    # Titles differ, but title is not among the scanned tags.
    for tagged, title in ((first, 'some title'), (second, 'another title')):
        tagged.artist = 'The White Stripes'
        tagged.title = title
        tagged.track = 'foo'
        tagged.year = 'bar'
    groups = scanner.GetDupeGroups([first, second])
    eq_(len(groups), 1)

def test_tag_scan_only_scans_existing_tags(fake_fileexists):
    """Two files with the same artist but different 'foo' attributes are expected to match."""
    scanner = Scanner()
    scanner.scan_type = ScanType.Tag
    scanner.scanned_tags = {'artist', 'foo'}
    first, second = no('foo'), no('bar')
    for tagged, foo_value in ((first, 'foo'), (second, 'bar')):
        tagged.artist = 'The White Stripes'
        tagged.foo = foo_value
    groups = scanner.GetDupeGroups([first, second])
    # Because 'foo' is not scanned, they match
    eq_(len(groups), 1)

def test_tag_scan_converts_to_str(fake_fileexists):
    """Integer tag values must not make the scan raise TypeError; the files still match."""
    scanner = Scanner()
    scanner.scan_type = ScanType.Tag
    scanner.scanned_tags = {'track'}
    files = [no('foo'), no('bar')]
    for tagged in files:
        tagged.track = 42
    try:
        groups = scanner.GetDupeGroups(files)
    except TypeError:
        # Report the TypeError as a plain test failure.
        raise AssertionError()
    eq_(len(groups), 1)

def test_tag_scan_non_ascii(fake_fileexists):
    """Non-ASCII tag values must not make the scan raise UnicodeEncodeError."""
    scanner = Scanner()
    scanner.scan_type = ScanType.Tag
    scanner.scanned_tags = {'title'}
    files = [no('foo'), no('bar')]
    for tagged in files:
        tagged.title = 'foobar\u00e9'
    try:
        groups = scanner.GetDupeGroups(files)
    except UnicodeEncodeError:
        # Report the encoding error as a plain test failure.
        raise AssertionError()
    eq_(len(groups), 1)

def test_audio_content_scan(fake_fileexists):
    """In a ContentsAudio scan, the two files sharing ``md5partial`` are grouped.

    All three files have the same ``audiosize`` and three different ``md5`` values.
    """
    scanner = Scanner()
    scanner.scan_type = ScanType.ContentsAudio
    # (name, md5, md5partial)
    specs = (
        ('foo', 'foo', 'foo'),
        ('bar', 'bar', 'foo'),
        ('bleh', 'bleh', 'bleh'),
    )
    files = []
    for name, full_digest, partial_digest in specs:
        fake = no(name)
        fake.md5 = full_digest
        fake.md5partial = partial_digest
        fake.audiosize = 1
        files.append(fake)
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 1)
    eq_(len(groups[0]), 2)

def test_audio_content_scan_compare_sizes_first(fake_fileexists):
    """Files with different ``audiosize`` yield no group, and ``md5partial`` is never read."""
    class PartialMd5MustNotBeRead(no):
        @property
        def md5partial(self):
            # Any access to md5partial makes the test fail.
            raise AssertionError()

    scanner = Scanner()
    scanner.scan_type = ScanType.ContentsAudio
    files = [PartialMd5MustNotBeRead('foo'), PartialMd5MustNotBeRead('bar')]
    for fake, audiosize in zip(files, (1, 2)):
        fake.audiosize = audiosize
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 0)

def test_ignore_list(fake_fileexists):
    """Pairs on the ignore list are left out of groups and not counted as discarded.

    The first file is ignored against both others, so only the other two are grouped.
    """
    scanner = Scanner()
    first, second, third = no('foobar'), no('foobar'), no('foobar')
    first.path = Path('dir1/foobar')
    second.path = Path('dir2/foobar')
    third.path = Path('dir3/foobar')
    for other in (second, third):
        scanner.ignore_list.Ignore(str(first.path), str(other.path))
    groups = scanner.GetDupeGroups([first, second, third])
    eq_(len(groups), 1)
    group = groups[0]
    eq_(len(group.dupes), 1)
    assert first not in group
    assert second in group
    assert third in group
    # Ignored matches are not counted as discarded
    eq_(scanner.discarded_file_count, 0)

def test_ignore_list_checks_for_unicode(fake_fileexists):
    """Same scenario as the plain ignore-list test, with non-ASCII characters in the paths.

    The scanner used to call path_str for ignore list checks; since the Path changes, the
    text form of the path (``str(path)``) has to be used instead.
    """
    scanner = Scanner()
    first, second, third = no('foobar'), no('foobar'), no('foobar')
    first.path = Path('foo1\u00e9')
    second.path = Path('foo2\u00e9')
    third.path = Path('foo3\u00e9')
    for other in (second, third):
        scanner.ignore_list.Ignore(str(first.path), str(other.path))
    groups = scanner.GetDupeGroups([first, second, third])
    eq_(len(groups), 1)
    group = groups[0]
    eq_(len(group.dupes), 1)
    assert first not in group
    assert second in group
    assert third in group

def test_file_evaluates_to_false(fake_fileexists):
    """File objects whose truth value is False must still be grouped.

    A very wrong way to use any() was added at some point, causing the resulting group list
    to be empty.
    """
    class FalseNamedObject(NamedObject):
        def __bool__(self):
            return False

    scanner = Scanner()
    files = [FalseNamedObject('foobar'), FalseNamedObject('foobar')]
    groups = scanner.GetDupeGroups(files)
    eq_(len(groups), 1)

def test_size_threshold(fake_fileexists):
    """Only files whose size is equal to or higher than size_threshold are scanned."""
    scanner = Scanner()
    too_small, at_threshold, above_threshold = no('foo', 1), no('foo', 2), no('foo', 3)
    scanner.size_threshold = 2
    groups = scanner.GetDupeGroups([too_small, at_threshold, above_threshold])
    eq_(len(groups), 1)
    [group] = groups
    eq_(len(group), 2)
    assert too_small not in group
    assert at_threshold in group
    assert above_threshold in group

def test_tie_breaker_path_deepness(fake_fileexists):
    """On a prioritization tie, path deepness breaks it: the deeper file is expected as ref."""
    scanner = Scanner()
    shallow, deep = no('foo'), no('foo')
    shallow.path = Path('foo')
    deep.path = Path('foo/bar')
    [group] = scanner.GetDupeGroups([shallow, deep])
    assert group.ref is deep

def test_tie_breaker_copy(fake_fileexists):
    """A file with 'Copy' among its words becomes a dupe, even if it has the deeper path."""
    scanner = Scanner()
    copy_file, plain_file = no('foo bar Copy'), no('foo bar')
    copy_file.path = Path('deeper/path')
    plain_file.path = Path('foo')
    [group] = scanner.GetDupeGroups([copy_file, plain_file])
    assert group.ref is plain_file

def test_tie_breaker_same_name_plus_digit(fake_fileexists):
    """A name equal to another one plus a single extra digit word makes the file a dupe."""
    scanner = Scanner()
    numbered = [
        no('foo bar 42'),
        no('foo bar [42]'),
        no('foo bar (42)'),
        no('foo bar {42}'),
    ]
    plain = no('foo bar')
    # all numbered names have deeper paths, so they'll end up ref if the digits aren't
    # correctly used as tie breakers
    for fake in numbered:
        fake.path = Path('deeper/path')
    plain.path = Path('foo')
    [group] = scanner.GetDupeGroups(numbered + [plain])
    assert group.ref is plain

def test_partial_group_match(fake_fileexists):
    """Scanner.discarded_file_count counts the discarded matches.

    A match is discarded when a file doesn't match all other dupes of the group: here 'b'
    is left out of the group formed by 'a b' and 'a'.
    """
    scanner = Scanner()
    both_words, only_a, only_b = no('a b'), no('a'), no('b')
    scanner.min_match_percentage = 50
    [group] = scanner.GetDupeGroups([both_words, only_a, only_b])
    eq_(len(group), 2)
    assert both_words in group
    assert only_a in group
    assert only_b not in group
    eq_(scanner.discarded_file_count, 1)

def test_dont_group_files_that_dont_exist(tmpdir):
    """Files that vanished between matching and grouping must not end up in a group.

    When creating groups, the scanner checks that files exist first: it's possible that
    they have been moved by the user during the scan. To reproduce that, ``_getmatches`` is
    replaced by a stub that removes one of the two files before returning a match between
    them. This test uses real files, so it does not request the ``fake_fileexists`` funcarg.

    tmpdir: pytest-provided temporary directory in which the two files are created.
    """
    s = Scanner()
    s.scan_type = ScanType.Contents
    p = Path(str(tmpdir))
    for filename in ('file1', 'file2'):
        # Close each file explicitly instead of relying on garbage collection, so the
        # content is flushed and no handle is still open when file2 is removed below.
        # NOTE(review): assumes io.open returns a regular file object usable as a context
        # manager -- confirm against hscommon.io.
        with io.open(p + filename, 'w') as fp:
            fp.write('foo')
    file1, file2 = fs.get_files(p)

    def getmatches(*args, **kw):
        # Simulate the user deleting file2 after matching, right before grouping.
        io.remove(file2.path)
        return [Match(file1, file2, 100)]

    s._getmatches = getmatches

    assert not s.GetDupeGroups([file1, file2])
Moved the tests to a "tests" subfolder, and "un-unittest"'ed some of them. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4025 2009-06-07 14:26:46 +00:00			`# Created By: Virgil Dupras`
			`# Created On: 2006/03/03`
Changed copyright year to 2010 2010-01-01 20:11:34 +00:00			`# Copyright 2010 Hardcoded Software (http://www.hardcoded.net)`
Relicensed to HS License. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40100 2009-08-05 08:59:46 +00:00			`#`
Re-licensed to BSD 2010-09-30 10:17:41 +00:00			`# This software is licensed under the "BSD" License as described in the "LICENSE" file,`
Relicensed to HS License. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40100 2009-08-05 08:59:46 +00:00			`# which should be included with this package. The terms are also available at`
Re-licensed to BSD 2010-09-30 10:17:41 +00:00			`# http://www.hardcoded.net/licenses/bsd_license`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
Adapted to the job-related code moving to the 'jobprogress' package. 2010-11-20 11:42:15 +00:00			`from jobprogress import job`
Changed references to what has already been moved from hsutil to hscommon (io, path, testutil). 2011-01-11 10:59:53 +00:00			`from hscommon import io`
			`from hscommon.path import Path`
Stop using hsutil.testcase. 2011-01-05 10:11:21 +00:00			`from hscommon.testutil import eq_`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
[#72 state:fixed] When files are deleted during the scan, don't include them in the grouping phase. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40225 2009-10-30 11:09:04 +00:00			`from .. import fs`
Moved the tests to a "tests" subfolder, and "un-unittest"'ed some of them. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%4025 2009-06-07 14:26:46 +00:00			`from ..engine import getwords, Match`
			`from ..ignore import IgnoreList`
			`from ..scanner import *`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
Stop using hsutil.testcase. 2011-01-05 10:11:21 +00:00			`class NamedObject:`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`def __init__(self, name="foobar", size=1):`
			`self.name = name`
			`self.size = size`
			`self.path = Path('')`
			`self.words = getwords(name)`

Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`def __repr__(self):`
			`return '<NamedObject %r>' % self.name`

Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00
			`no = NamedObject`

Stop using hsutil.testcase. 2011-01-05 10:11:21 +00:00			`def pytest_funcarg__fake_fileexists(request):`
			`# This is a hack to avoid invalidating all previous tests since the scanner started to test`
			`# for file existence before doing the match grouping.`
			`monkeypatch = request.getfuncargvalue('monkeypatch')`
			`monkeypatch.setattr(io, 'exists', lambda _: True)`

			`def test_empty(fake_fileexists):`
			`s = Scanner()`
			`r = s.GetDupeGroups([])`
			`eq_(r, [])`

			`def test_default_settings(fake_fileexists):`
			`s = Scanner()`
			`eq_(s.min_match_percentage, 80)`
			`eq_(s.scan_type, ScanType.Filename)`
			`eq_(s.mix_file_kind, True)`
			`eq_(s.word_weighting, False)`
			`eq_(s.match_similar_words, False)`
			`assert isinstance(s.ignore_list, IgnoreList)`

			`def test_simple_with_default_settings(fake_fileexists):`
			`s = Scanner()`
			`f = [no('foo bar'), no('foo bar'), no('foo bleh')]`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 1)`
			`g = r[0]`
			`#'foo bleh' cannot be in the group because the default min match % is 80`
			`eq_(len(g), 2)`
			`assert g.ref in f[:2]`
			`assert g.dupes[0] in f[:2]`

			`def test_simple_with_lower_min_match(fake_fileexists):`
			`s = Scanner()`
			`s.min_match_percentage = 50`
			`f = [no('foo bar'), no('foo bar'), no('foo bleh')]`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 1)`
			`g = r[0]`
			`eq_(len(g), 3)`

			`def test_trim_all_ref_groups(fake_fileexists):`
			`# When all files of a group are ref, don't include that group in the results, but also don't`
			`# count the files from that group as discarded.`
			`s = Scanner()`
			`f = [no('foo'), no('foo'), no('bar'), no('bar')]`
			`f[2].is_ref = True`
			`f[3].is_ref = True`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 1)`
			`eq_(s.discarded_file_count, 0)`

			`def test_priorize(fake_fileexists):`
			`s = Scanner()`
			`f = [no('foo'), no('foo'), no('bar'), no('bar')]`
			`f[1].size = 2`
			`f[2].size = 3`
			`f[3].is_ref = True`
			`r = s.GetDupeGroups(f)`
			`g1, g2 = r`
			`assert f[1] in (g1.ref,g2.ref)`
			`assert f[0] in (g1.dupes[0],g2.dupes[0])`
			`assert f[3] in (g1.ref,g2.ref)`
			`assert f[2] in (g1.dupes[0],g2.dupes[0])`

			`def test_content_scan(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Contents`
			`f = [no('foo'), no('bar'), no('bleh')]`
			`f[0].md5 = f[0].md5partial = 'foobar'`
			`f[1].md5 = f[1].md5partial = 'foobar'`
			`f[2].md5 = f[2].md5partial = 'bleh'`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 1)`
			`eq_(len(r[0]), 2)`
			`eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!`

			`def test_content_scan_compare_sizes_first(fake_fileexists):`
			`class MyFile(no):`
			`@property`
			`def md5(file):`
			`raise AssertionError()`

			`s = Scanner()`
			`s.scan_type = ScanType.Contents`
			`f = [MyFile('foo', 1), MyFile('bar', 2)]`
			`eq_(len(s.GetDupeGroups(f)), 0)`

			`def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Contents`
			`f = [no('foo'), no('bar'), no('bleh')]`
			`f[0].md5 = f[0].md5partial = 'foobar'`
			`f[1].md5 = f[1].md5partial = 'foobar'`
			`f[2].md5 = f[2].md5partial = 'bleh'`
			`s.min_match_percentage = 101`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 1)`
			`eq_(len(r[0]), 2)`
			`s.min_match_percentage = 0`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 1)`
			`eq_(len(r[0]), 2)`

			`def test_content_scan_doesnt_put_md5_in_words_at_the_end(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Contents`
			`f = [no('foo'),no('bar')]`
			`f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'`
			`f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'`
			`r = s.GetDupeGroups(f)`
			`g = r[0]`

			`def test_extension_is_not_counted_in_filename_scan(fake_fileexists):`
			`s = Scanner()`
			`s.min_match_percentage = 100`
			`f = [no('foo.bar'), no('foo.bleh')]`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 1)`
			`eq_(len(r[0]), 2)`

			`def test_job(fake_fileexists):`
			`def do_progress(progress, desc=''):`
			`log.append(progress)`
			`return True`

			`s = Scanner()`
			`log = []`
			`f = [no('foo bar'), no('foo bar'), no('foo bleh')]`
			`r = s.GetDupeGroups(f, job.Job(1, do_progress))`
			`eq_(log[0], 0)`
			`eq_(log[-1], 100)`

			`def test_mix_file_kind(fake_fileexists):`
			`s = Scanner()`
			`s.mix_file_kind = False`
			`f = [no('foo.1'), no('foo.2')]`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 0)`

			`def test_word_weighting(fake_fileexists):`
			`s = Scanner()`
			`s.min_match_percentage = 75`
			`s.word_weighting = True`
			`f = [no('foo bar'), no('foo bar bleh')]`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 1)`
			`g = r[0]`
			`m = g.get_match_of(g.dupes[0])`
			`eq_(m.percentage, 75) # 16 letters, 12 matching`

			`def test_similar_words(fake_fileexists):`
			`s = Scanner()`
			`s.match_similar_words = True`
			`f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 2)`

			`def test_fields(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Fields`
			`f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')]`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 0)`

			`def test_fields_no_order(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.FieldsNoOrder`
			`f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')]`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 1)`

			`def test_tag_scan(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Tag`
			`o1 = no('foo')`
			`o2 = no('bar')`
			`o1.artist = 'The White Stripes'`
			`o1.title = 'The Air Near My Fingers'`
			`o2.artist = 'The White Stripes'`
			`o2.title = 'The Air Near My Fingers'`
			`r = s.GetDupeGroups([o1,o2])`
			`eq_(len(r), 1)`

			`def test_tag_with_album_scan(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Tag`
			`s.scanned_tags = set(['artist', 'album', 'title'])`
			`o1 = no('foo')`
			`o2 = no('bar')`
			`o3 = no('bleh')`
			`o1.artist = 'The White Stripes'`
			`o1.title = 'The Air Near My Fingers'`
			`o1.album = 'Elephant'`
			`o2.artist = 'The White Stripes'`
			`o2.title = 'The Air Near My Fingers'`
			`o2.album = 'Elephant'`
			`o3.artist = 'The White Stripes'`
			`o3.title = 'The Air Near My Fingers'`
			`o3.album = 'foobar'`
			`r = s.GetDupeGroups([o1,o2,o3])`
			`eq_(len(r), 1)`

			`def test_that_dash_in_tags_dont_create_new_fields(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Tag`
			`s.scanned_tags = set(['artist', 'album', 'title'])`
			`s.min_match_percentage = 50`
			`o1 = no('foo')`
			`o2 = no('bar')`
			`o1.artist = 'The White Stripes - a'`
			`o1.title = 'The Air Near My Fingers - a'`
			`o1.album = 'Elephant - a'`
			`o2.artist = 'The White Stripes - b'`
			`o2.title = 'The Air Near My Fingers - b'`
			`o2.album = 'Elephant - b'`
			`r = s.GetDupeGroups([o1,o2])`
			`eq_(len(r), 1)`

			`def test_tag_scan_with_different_scanned(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Tag`
			`s.scanned_tags = set(['track', 'year'])`
			`o1 = no('foo')`
			`o2 = no('bar')`
			`o1.artist = 'The White Stripes'`
			`o1.title = 'some title'`
			`o1.track = 'foo'`
			`o1.year = 'bar'`
			`o2.artist = 'The White Stripes'`
			`o2.title = 'another title'`
			`o2.track = 'foo'`
			`o2.year = 'bar'`
			`r = s.GetDupeGroups([o1, o2])`
			`eq_(len(r), 1)`

			`def test_tag_scan_only_scans_existing_tags(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Tag`
			`s.scanned_tags = set(['artist', 'foo'])`
			`o1 = no('foo')`
			`o2 = no('bar')`
			`o1.artist = 'The White Stripes'`
			`o1.foo = 'foo'`
			`o2.artist = 'The White Stripes'`
			`o2.foo = 'bar'`
			`r = s.GetDupeGroups([o1, o2])`
			`eq_(len(r), 1) # Because 'foo' is not scanned, they match`

			`def test_tag_scan_converts_to_str(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Tag`
			`s.scanned_tags = set(['track'])`
			`o1 = no('foo')`
			`o2 = no('bar')`
			`o1.track = 42`
			`o2.track = 42`
			`try:`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`r = s.GetDupeGroups([o1, o2])`
Stop using hsutil.testcase. 2011-01-05 10:11:21 +00:00			`except TypeError:`
			`raise AssertionError()`
			`eq_(len(r), 1)`

			`def test_tag_scan_non_ascii(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.Tag`
			`s.scanned_tags = set(['title'])`
			`o1 = no('foo')`
			`o2 = no('bar')`
			`o1.title = 'foobar\u00e9'`
			`o2.title = 'foobar\u00e9'`
			`try:`
Initial commit. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%402 2009-06-01 09:55:11 +00:00			`r = s.GetDupeGroups([o1, o2])`
Stop using hsutil.testcase. 2011-01-05 10:11:21 +00:00			`except UnicodeEncodeError:`
			`raise AssertionError()`
			`eq_(len(r), 1)`

			`def test_audio_content_scan(fake_fileexists):`
			`s = Scanner()`
			`s.scan_type = ScanType.ContentsAudio`
			`f = [no('foo'), no('bar'), no('bleh')]`
			`f[0].md5 = 'foo'`
			`f[1].md5 = 'bar'`
			`f[2].md5 = 'bleh'`
			`f[0].md5partial = 'foo'`
			`f[1].md5partial = 'foo'`
			`f[2].md5partial = 'bleh'`
			`f[0].audiosize = 1`
			`f[1].audiosize = 1`
			`f[2].audiosize = 1`
			`r = s.GetDupeGroups(f)`
			`eq_(len(r), 1)`
			`eq_(len(r[0]), 2)`

			`def test_audio_content_scan_compare_sizes_first(fake_fileexists):`
			`class MyFile(no):`
			`@property`
			`def md5partial(file):`
[#72 state:fixed] When files are deleted during the scan, don't include them in the grouping phase. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40225 2009-10-30 11:09:04 +00:00			`raise AssertionError()`
Stop using hsutil.testcase. 2011-01-05 10:11:21 +00:00
			`s = Scanner()`
			`s.scan_type = ScanType.ContentsAudio`
			`f = [MyFile('foo'), MyFile('bar')]`
			`f[0].audiosize = 1`
			`f[1].audiosize = 2`
			`eq_(len(s.GetDupeGroups(f)), 0)`

			`def test_ignore_list(fake_fileexists):`
			`s = Scanner()`
			`f1 = no('foobar')`
			`f2 = no('foobar')`
			`f3 = no('foobar')`
			`f1.path = Path('dir1/foobar')`
			`f2.path = Path('dir2/foobar')`
			`f3.path = Path('dir3/foobar')`
			`s.ignore_list.Ignore(str(f1.path),str(f2.path))`
			`s.ignore_list.Ignore(str(f1.path),str(f3.path))`
			`r = s.GetDupeGroups([f1,f2,f3])`
			`eq_(len(r), 1)`
			`g = r[0]`
			`eq_(len(g.dupes), 1)`
			`assert f1 not in g`
			`assert f2 in g`
			`assert f3 in g`
			`# Ignored matches are not counted as discarded`
			`eq_(s.discarded_file_count, 0)`

			`def test_ignore_list_checks_for_unicode(fake_fileexists):`
			`#scanner was calling path_str for ignore list checks. Since the Path changes, it must`
			`#be unicode(path)`
			`s = Scanner()`
			`f1 = no('foobar')`
			`f2 = no('foobar')`
			`f3 = no('foobar')`
			`f1.path = Path('foo1\u00e9')`
			`f2.path = Path('foo2\u00e9')`
			`f3.path = Path('foo3\u00e9')`
			`s.ignore_list.Ignore(str(f1.path),str(f2.path))`
			`s.ignore_list.Ignore(str(f1.path),str(f3.path))`
			`r = s.GetDupeGroups([f1,f2,f3])`
			`eq_(len(r), 1)`
			`g = r[0]`
			`eq_(len(g.dupes), 1)`
			`assert f1 not in g`
			`assert f2 in g`
			`assert f3 in g`

			`def test_file_evaluates_to_false(fake_fileexists):`
			`# A very wrong way to use any() was added at some point, causing resulting group list`
			`# to be empty.`
			`class FalseNamedObject(NamedObject):`
			`def __bool__(self):`
			`return False`
[#72 state:fixed] When files are deleted during the scan, don't include them in the grouping phase. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40225 2009-10-30 11:09:04 +00:00
Refactoring: modernized scaner_test and got rid of the obsolete SCAN_TYPE_TAG_WITH_ALBUM scan type const. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40117 2009-09-05 15:28:10 +00:00
Stop using hsutil.testcase. 2011-01-05 10:11:21 +00:00			`s = Scanner()`
			`f1 = FalseNamedObject('foobar')`
			`f2 = FalseNamedObject('foobar')`
			`r = s.GetDupeGroups([f1, f2])`
			`eq_(len(r), 1)`

			`def test_size_threshold(fake_fileexists):`
			`# Only file equal or higher than the size_threshold in size are scanned`
			`s = Scanner()`
			`f1 = no('foo', 1)`
			`f2 = no('foo', 2)`
			`f3 = no('foo', 3)`
			`s.size_threshold = 2`
			`groups = s.GetDupeGroups([f1,f2,f3])`
			`eq_(len(groups), 1)`
			`[group] = groups`
			`eq_(len(group), 2)`
			`assert f1 not in group`
			`assert f2 in group`
			`assert f3 in group`

			`def test_tie_breaker_path_deepness(fake_fileexists):`
			`# If there is a tie in prioritization, path deepness is used as a tie breaker`
			`s = Scanner()`
			`o1, o2 = no('foo'), no('foo')`
			`o1.path = Path('foo')`
			`o2.path = Path('foo/bar')`
			`[group] = s.GetDupeGroups([o1, o2])`
			`assert group.ref is o2`

			`def test_tie_breaker_copy(fake_fileexists):`
			`# if copy is in the words used (even if it has a deeper path), it becomes a dupe`
			`s = Scanner()`
			`o1, o2 = no('foo bar Copy'), no('foo bar')`
			`o1.path = Path('deeper/path')`
			`o2.path = Path('foo')`
			`[group] = s.GetDupeGroups([o1, o2])`
			`assert group.ref is o2`

			`def test_tie_breaker_same_name_plus_digit(fake_fileexists):`
			`# if ref has the same words as dupe, but has some just one extra word which is a digit, it`
			`# becomes a dupe`
			`s = Scanner()`
			`o1 = no('foo bar 42')`
			`o2 = no('foo bar [42]')`
			`o3 = no('foo bar (42)')`
			`o4 = no('foo bar {42}')`
			`o5 = no('foo bar')`
			`# all numbered names have deeper paths, so they'll end up ref if the digits aren't correctly`
			`# used as tie breakers`
			`o1.path = Path('deeper/path')`
			`o2.path = Path('deeper/path')`
			`o3.path = Path('deeper/path')`
			`o4.path = Path('deeper/path')`
			`o5.path = Path('foo')`
			`[group] = s.GetDupeGroups([o1, o2, o3, o4, o5])`
			`assert group.ref is o5`

			`def test_partial_group_match(fake_fileexists):`
			`# Count the number od discarded matches (when a file doesn't match all other dupes of the`
			`# group) in Scanner.discarded_file_count`
			`s = Scanner()`
			`o1, o2, o3 = no('a b'), no('a'), no('b')`
			`s.min_match_percentage = 50`
			`[group] = s.GetDupeGroups([o1, o2, o3])`
			`eq_(len(group), 2)`
			`assert o1 in group`
			`assert o2 in group`
			`assert o3 not in group`
			`eq_(s.discarded_file_count, 1)`

			`def test_dont_group_files_that_dont_exist(tmpdir):`
			`# when creating groups, check that files exist first. It's possible that these files have`
			`# been moved during the scan by the user.`
			`# In this test, we have to delete one of the files between the get_matches() part and the`
			`# get_groups() part.`
			`s = Scanner()`
			`s.scan_type = ScanType.Contents`
			`p = Path(str(tmpdir))`
			`io.open(p + 'file1', 'w').write('foo')`
			`io.open(p + 'file2', 'w').write('foo')`
			`file1, file2 = fs.get_files(p)`
			`def getmatches(args, *kw):`
			`io.remove(file2.path)`
			`return [Match(file1, file2, 100)]`
			`s._getmatches = getmatches`

			`assert not s.GetDupeGroups([file1, file2])`