# dupeguru/core/tests/scanner_test.py

# Created By: Virgil Dupras
# Created On: 2006/03/03
# Copyright 2010 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

from nose.tools import eq_

from hsutil import job, io
from hsutil.path import Path
from hsutil.testcase import TestCase

from .. import fs
from ..engine import getwords, Match
from ..ignore import IgnoreList
from ..scanner import *

class NamedObject(object):
    """Lightweight stand-in for a file, used as scanner input by the tests in this module.

    It only carries ``name``, ``size``, ``path`` (an empty ``Path``) and ``words`` (the result
    of ``getwords(name)``). Tests attach any extra attribute they need directly on instances.
    """
    def __init__(self, name="foobar", size=1):
        self.name, self.size = name, size
        self.path = Path('')
        # Computed once, at construction time, from the name given above.
        self.words = getwords(name)


# Short alias so that fixture lists in the tests stay compact.
no = NamedObject

#--- Scanner
class ScannerTestFakeFiles(TestCase):
    """Scanner tests fed with ``NamedObject`` fakes instead of files living on disk."""
    def setUp(self):
        # The scanner checks that files exist before grouping matches. The fakes used in this
        # class have no real file behind them, so (as a hack) make every existence check succeed.
        self.mock(io, 'exists', lambda _: True)

    def test_empty(self):
        # Scanning an empty list results in an empty list.
        scanner = Scanner()
        groups = scanner.GetDupeGroups([])
        eq_(groups, [])

    def test_default_settings(self):
        # Values a freshly created Scanner is expected to expose.
        scanner = Scanner()
        eq_(scanner.min_match_percentage, 80)
        eq_(scanner.scan_type, SCAN_TYPE_FILENAME)
        eq_(scanner.mix_file_kind, True)
        eq_(scanner.word_weighting, False)
        eq_(scanner.match_similar_words, False)
        assert isinstance(scanner.ignore_list, IgnoreList)

    def test_simple_with_default_settings(self):
        scanner = Scanner()
        files = [no('foo bar'), no('foo bar'), no('foo bleh')]
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 1)
        group = groups[0]
        # 'foo bleh' cannot be in the group because the default min match % is 80
        eq_(len(group), 2)
        identical_names = files[:2]
        assert group.ref in identical_names
        assert group.dupes[0] in identical_names

    def test_simple_with_lower_min_match(self):
        # With the threshold lowered to 50, 'foo bleh' joins the two 'foo bar' files.
        scanner = Scanner()
        scanner.min_match_percentage = 50
        files = [no('foo bar'), no('foo bar'), no('foo bleh')]
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 1)
        group = groups[0]
        eq_(len(group), 3)

    def test_trim_all_ref_groups(self):
        # When all files of a group are ref, don't include that group in the results, but also don't
        # count the files from that group as discarded.
        scanner = Scanner()
        files = [no('foo'), no('foo'), no('bar'), no('bar')]
        for ref_file in files[2:]:
            ref_file.is_ref = True
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 1)
        eq_(scanner.discarded_file_count, 0)

    def test_priorize(self):
        # Among the 'foo' files, the bigger one is expected as the ref. Among the 'bar' files, the
        # one flagged is_ref is expected as the ref even though the other one is bigger.
        scanner = Scanner()
        files = [no('foo'), no('foo'), no('bar'), no('bar')]
        files[1].size = 2
        files[2].size = 3
        files[3].is_ref = True
        groups = scanner.GetDupeGroups(files)
        first, second = groups
        refs = (first.ref, second.ref)
        assert files[1] in refs
        first_dupes = (first.dupes[0], second.dupes[0])
        assert files[0] in first_dupes
        assert files[3] in refs
        assert files[2] in first_dupes

    def test_content_scan(self):
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_CONTENT
        files = [no('foo'), no('bar'), no('bleh')]
        for fake, digest in zip(files, ['foobar', 'foobar', 'bleh']):
            fake.md5 = fake.md5partial = digest
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 1)
        eq_(len(groups[0]), 2)
        eq_(scanner.discarded_file_count, 0) # don't count the different md5 as discarded!

    def test_content_scan_compare_sizes_first(self):
        # The two files have different sizes, so the scan must conclude without ever reading
        # md5 (reading it blows up).
        class MyFile(no):
            @property
            def md5(self):
                raise AssertionError()

        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_CONTENT
        files = [MyFile('foo', 1), MyFile('bar', 2)]
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 0)

    def test_min_match_perc_doesnt_matter_for_content_scan(self):
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_CONTENT
        files = [no('foo'), no('bar'), no('bleh')]
        for fake, digest in zip(files, ['foobar', 'foobar', 'bleh']):
            fake.md5 = fake.md5partial = digest
        # Same outcome is expected at both extremes of the setting.
        for percentage in (101, 0):
            scanner.min_match_percentage = percentage
            groups = scanner.GetDupeGroups(files)
            eq_(len(groups), 1)
            eq_(len(groups[0]), 2)

    def test_content_scan_doesnt_put_md5_in_words_at_the_end(self):
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_CONTENT
        files = [no('foo'), no('bar')]
        digest = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
        for fake in files:
            fake.md5 = fake.md5partial = digest
        groups = scanner.GetDupeGroups(files)
        # No explicit assertion: the scan must not crash and indexing requires at least one group.
        group = groups[0]

    def test_extension_is_not_counted_in_filename_scan(self):
        # Even at 100%, names that only differ by their extension end up in the same group.
        scanner = Scanner()
        scanner.min_match_percentage = 100
        files = [no('foo.bar'), no('foo.bleh')]
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 1)
        eq_(len(groups[0]), 2)

    def test_job(self):
        # Progress reported through the job is expected to start at 0 and to end at 100.
        progress_log = []

        def do_progress(progress, desc=''):
            progress_log.append(progress)
            return True

        scanner = Scanner()
        files = [no('foo bar'), no('foo bar'), no('foo bleh')]
        scanner.GetDupeGroups(files, job.Job(1, do_progress))
        eq_(progress_log[0], 0)
        eq_(progress_log[-1], 100)

    def test_mix_file_kind(self):
        # With mix_file_kind off, same names with different extensions are not grouped.
        scanner = Scanner()
        scanner.mix_file_kind = False
        files = [no('foo.1'), no('foo.2')]
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 0)

    def test_word_weighting(self):
        scanner = Scanner()
        scanner.min_match_percentage = 75
        scanner.word_weighting = True
        files = [no('foo bar'), no('foo bar bleh')]
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 1)
        group = groups[0]
        match = group.get_match_of(group.dupes[0])
        eq_(match.percentage, 75) # 16 letters, 12 matching

    def test_similar_words(self):
        scanner = Scanner()
        scanner.match_similar_words = True
        files = [
            no('The White Stripes'),
            no('The Whites Stripe'),
            no('Limp Bizkit'),
            no('Limp Bizkitt'),
        ]
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 2)

    def test_fields(self):
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_FIELDS
        files = [
            no('The White Stripes - Little Ghost'),
            no('The White Stripes - Little Acorn'),
        ]
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 0)

    def test_fields_no_order(self):
        # Same two fields in swapped order: expected to be grouped by this scan type.
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
        files = [
            no('The White Stripes - Little Ghost'),
            no('Little Ghost - The White Stripes'),
        ]
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 1)

    def test_tag_scan(self):
        # Names differ, but artist and title are identical.
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_TAG
        songs = [no('foo'), no('bar')]
        for song in songs:
            song.artist = 'The White Stripes'
            song.title = 'The Air Near My Fingers'
        groups = scanner.GetDupeGroups(songs)
        eq_(len(groups), 1)

    def test_tag_with_album_scan(self):
        # The third song only differs by its album; a single group is expected.
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_TAG
        scanner.scanned_tags = set(['artist', 'album', 'title'])
        songs = [no('foo'), no('bar'), no('bleh')]
        for song, album in zip(songs, ['Elephant', 'Elephant', 'foobar']):
            song.artist = 'The White Stripes'
            song.title = 'The Air Near My Fingers'
            song.album = album
        groups = scanner.GetDupeGroups(songs)
        eq_(len(groups), 1)

    def test_that_dash_in_tags_dont_create_new_fields(self):
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_TAG
        scanner.scanned_tags = set(['artist', 'album', 'title'])
        scanner.min_match_percentage = 50
        first, second = no('foo'), no('bar')
        first.artist = 'The White Stripes - a'
        first.title = 'The Air Near My Fingers - a'
        first.album = 'Elephant - a'
        second.artist = 'The White Stripes - b'
        second.title = 'The Air Near My Fingers - b'
        second.album = 'Elephant - b'
        groups = scanner.GetDupeGroups([first, second])
        eq_(len(groups), 1)

    def test_tag_scan_with_different_scanned(self):
        # Only 'track' and 'year' are scanned, and both are identical; the titles differ.
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_TAG
        scanner.scanned_tags = set(['track', 'year'])
        first, second = no('foo'), no('bar')
        first.title = 'some title'
        second.title = 'another title'
        for song in (first, second):
            song.artist = 'The White Stripes'
            song.track = 'foo'
            song.year = 'bar'
        groups = scanner.GetDupeGroups([first, second])
        eq_(len(groups), 1)

    def test_tag_scan_only_scans_existing_tags(self):
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_TAG
        scanner.scanned_tags = set(['artist', 'foo'])
        first, second = no('foo'), no('bar')
        first.artist = second.artist = 'The White Stripes'
        first.foo = 'foo'
        second.foo = 'bar'
        groups = scanner.GetDupeGroups([first, second])
        eq_(len(groups), 1) # Because 'foo' is not scanned, they match

    def test_tag_scan_converts_to_str(self):
        # A non-string tag value must not make the scan raise a TypeError.
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_TAG
        scanner.scanned_tags = set(['track'])
        first, second = no('foo'), no('bar')
        first.track = second.track = 42
        try:
            groups = scanner.GetDupeGroups([first, second])
        except TypeError:
            raise AssertionError()
        eq_(len(groups), 1)

    def test_tag_scan_non_ascii(self):
        # A non-ascii tag value must not make the scan raise a UnicodeEncodeError.
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_TAG
        scanner.scanned_tags = set(['title'])
        first, second = no('foo'), no('bar')
        first.title = u'foobar\u00e9'
        second.title = u'foobar\u00e9'
        try:
            groups = scanner.GetDupeGroups([first, second])
        except UnicodeEncodeError:
            raise AssertionError()
        eq_(len(groups), 1)

    def test_audio_content_scan(self):
        # All md5 differ; the first two files share their md5partial and every audiosize is 1.
        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_CONTENT_AUDIO
        files = [no('foo'), no('bar'), no('bleh')]
        digests = [('foo', 'foo'), ('bar', 'foo'), ('bleh', 'bleh')]
        for fake, (md5, md5partial) in zip(files, digests):
            fake.md5 = md5
            fake.md5partial = md5partial
            fake.audiosize = 1
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 1)
        eq_(len(groups[0]), 2)

    def test_audio_content_scan_compare_sizes_first(self):
        # The audio sizes differ, so the scan must conclude without ever reading md5partial
        # (reading it blows up).
        class MyFile(no):
            @property
            def md5partial(self):
                raise AssertionError()

        scanner = Scanner()
        scanner.scan_type = SCAN_TYPE_CONTENT_AUDIO
        files = [MyFile('foo'), MyFile('bar')]
        files[0].audiosize = 1
        files[1].audiosize = 2
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 0)

    def test_ignore_list(self):
        scanner = Scanner()
        f1, f2, f3 = no('foobar'), no('foobar'), no('foobar')
        f1.path = Path('dir1/foobar')
        f2.path = Path('dir2/foobar')
        f3.path = Path('dir3/foobar')
        # f1 is ignored against both other files, which leaves only the f2/f3 pair.
        for other in (f2, f3):
            scanner.ignore_list.Ignore(str(f1.path), str(other.path))
        groups = scanner.GetDupeGroups([f1, f2, f3])
        eq_(len(groups), 1)
        group = groups[0]
        eq_(len(group.dupes), 1)
        assert f1 not in group
        assert f2 in group
        assert f3 in group
        # Ignored matches are not counted as discarded
        eq_(scanner.discarded_file_count, 0)

    def test_ignore_list_checks_for_unicode(self):
        #scanner was calling path_str for ignore list checks. Since the Path changes, it must
        #be unicode(path)
        scanner = Scanner()
        f1, f2, f3 = no('foobar'), no('foobar'), no('foobar')
        f1.path = Path(u'foo1\u00e9')
        f2.path = Path(u'foo2\u00e9')
        f3.path = Path(u'foo3\u00e9')
        for other in (f2, f3):
            scanner.ignore_list.Ignore(unicode(f1.path), unicode(other.path))
        groups = scanner.GetDupeGroups([f1, f2, f3])
        eq_(len(groups), 1)
        group = groups[0]
        eq_(len(group.dupes), 1)
        assert f1 not in group
        assert f2 in group
        assert f3 in group

    def test_file_evaluates_to_false(self):
        # A very wrong way to use any() was added at some point, causing resulting group list
        # to be empty.
        class FalseNamedObject(NamedObject):
            def __nonzero__(self):
                return False

        scanner = Scanner()
        files = [FalseNamedObject('foobar'), FalseNamedObject('foobar')]
        groups = scanner.GetDupeGroups(files)
        eq_(len(groups), 1)

    def test_size_threshold(self):
        # Only file equal or higher than the size_threshold in size are scanned
        scanner = Scanner()
        scanner.size_threshold = 2
        small, medium, large = no('foo', 1), no('foo', 2), no('foo', 3)
        groups = scanner.GetDupeGroups([small, medium, large])
        eq_(len(groups), 1)
        [group] = groups
        eq_(len(group), 2)
        assert small not in group
        assert medium in group
        assert large in group

    def test_tie_breaker_path_deepness(self):
        # If there is a tie in prioritization, path deepness is used as a tie breaker
        scanner = Scanner()
        shallow, deep = no('foo'), no('foo')
        shallow.path = Path('foo')
        deep.path = Path('foo/bar')
        [group] = scanner.GetDupeGroups([shallow, deep])
        assert group.ref is deep

    def test_tie_breaker_copy(self):
        # if copy is in the words used (even if it has a deeper path), it becomes a dupe
        scanner = Scanner()
        copied, original = no('foo bar Copy'), no('foo bar')
        copied.path = Path('deeper/path')
        original.path = Path('foo')
        [group] = scanner.GetDupeGroups([copied, original])
        assert group.ref is original

    def test_tie_breaker_same_name_plus_digit(self):
        # if ref has the same words as dupe, but has some just one extra word which is a digit, it
        # becomes a dupe
        scanner = Scanner()
        numbered, original = no('foo bar 42'), no('foo bar')
        numbered.path = Path('deeper/path')
        original.path = Path('foo')
        [group] = scanner.GetDupeGroups([numbered, original])
        assert group.ref is original

    def test_partial_group_match(self):
        # Count the number of discarded matches (when a file doesn't match all other dupes of the
        # group) in Scanner.discarded_file_count
        scanner = Scanner()
        scanner.min_match_percentage = 50
        both, only_a, only_b = no('a b'), no('a'), no('b')
        [group] = scanner.GetDupeGroups([both, only_a, only_b])
        eq_(len(group), 2)
        assert both in group
        assert only_a in group
        assert only_b not in group
        eq_(scanner.discarded_file_count, 1)


class ScannerTest(TestCase):
    """Scanner tests that need real files on disk (no mocking of ``io.exists``)."""
    def test_dont_group_files_that_dont_exist(self):
        # when creating groups, check that files exist first. It's possible that these files have
        # been moved during the scan by the user.
        # In this test, we have to delete one of the files between the get_matches() part and the
        # get_groups() part.
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        p = self.tmppath()
        for name in ('file1', 'file2'):
            # Close each file explicitly instead of relying on garbage collection: the content
            # must be flushed to disk, and a handle left open can prevent the removal done in
            # getmatches() below on some platforms.
            fp = io.open(p + name, 'w')
            try:
                fp.write('foo')
            finally:
                fp.close()
        file1, file2 = fs.get_files(p)
        def getmatches(*args, **kw):
            # Stand-in for the matching phase: delete one file as a side effect, then report
            # the pair as a 100% match so that only the existence check can reject it.
            io.remove(file2.path)
            return [Match(file1, file2, 100)]
        s._getmatches = getmatches

        assert not s.GetDupeGroups([file1, file2])