dupeguru/core/tests/scanner_test.py

# Created By: Virgil Dupras
# Created On: 2006/03/03
# Copyright 2010 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

from nose.tools import eq_

from hscommon import job
from hsutil import io
from hsutil.path import Path
from hsutil.testcase import TestCase

from .. import fs
from ..engine import getwords, Match
from ..ignore import IgnoreList
from ..scanner import *

class NamedObject(object):
    def __init__(self, name="foobar", size=1):
        self.name = name
        self.size = size
        self.path = Path('')
        self.words = getwords(name)


no = NamedObject

#--- Scanner
class ScannerTestFakeFiles(TestCase):
    def setUp(self):
        # This is a hack to avoid invalidating all previous tests since the scanner started to test
        # for file existence before doing the match grouping.
        self.mock(io, 'exists', lambda _: True)

    def test_empty(self):
        s = Scanner()
        r = s.GetDupeGroups([])
        eq_(r, [])

    def test_default_settings(self):
        s = Scanner()
        eq_(s.min_match_percentage, 80)
        eq_(s.scan_type, SCAN_TYPE_FILENAME)
        eq_(s.mix_file_kind, True)
        eq_(s.word_weighting, False)
        eq_(s.match_similar_words, False)
        assert isinstance(s.ignore_list, IgnoreList)

    def test_simple_with_default_settings(self):
        s = Scanner()
        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        g = r[0]
        #'foo bleh' cannot be in the group because the default min match % is 80
        eq_(len(g), 2)
        assert g.ref in f[:2]
        assert g.dupes[0] in f[:2]

    def test_simple_with_lower_min_match(self):
        s = Scanner()
        s.min_match_percentage = 50
        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        g = r[0]
        eq_(len(g), 3)

    def test_trim_all_ref_groups(self):
        # When all files of a group are ref, don't include that group in the results, but also don't
        # count the files from that group as discarded.
        s = Scanner()
        f = [no('foo'), no('foo'), no('bar'), no('bar')]
        f[2].is_ref = True
        f[3].is_ref = True
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(s.discarded_file_count, 0)

    def test_priorize(self):
        s = Scanner()
        f = [no('foo'), no('foo'), no('bar'), no('bar')]
        f[1].size = 2
        f[2].size = 3
        f[3].is_ref = True
        r = s.GetDupeGroups(f)
        g1, g2 = r
        assert f[1] in (g1.ref,g2.ref)
        assert f[0] in (g1.dupes[0],g2.dupes[0])
        assert f[3] in (g1.ref,g2.ref)
        assert f[2] in (g1.dupes[0],g2.dupes[0])

    def test_content_scan(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        f = [no('foo'), no('bar'), no('bleh')]
        f[0].md5 = f[0].md5partial = 'foobar'
        f[1].md5 = f[1].md5partial = 'foobar'
        f[2].md5 = f[2].md5partial = 'bleh'
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(len(r[0]), 2)
        eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!

    def test_content_scan_compare_sizes_first(self):
        class MyFile(no):
            @property
            def md5(file):
                raise AssertionError()

        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        f = [MyFile('foo', 1), MyFile('bar', 2)]
        eq_(len(s.GetDupeGroups(f)), 0)

    def test_min_match_perc_doesnt_matter_for_content_scan(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        f = [no('foo'), no('bar'), no('bleh')]
        f[0].md5 = f[0].md5partial = 'foobar'
        f[1].md5 = f[1].md5partial = 'foobar'
        f[2].md5 = f[2].md5partial = 'bleh'
        s.min_match_percentage = 101
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(len(r[0]), 2)
        s.min_match_percentage = 0
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(len(r[0]), 2)

    def test_content_scan_doesnt_put_md5_in_words_at_the_end(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        f = [no('foo'),no('bar')]
        f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
        f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
        r = s.GetDupeGroups(f)
        g = r[0]

    def test_extension_is_not_counted_in_filename_scan(self):
        s = Scanner()
        s.min_match_percentage = 100
        f = [no('foo.bar'), no('foo.bleh')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(len(r[0]), 2)

    def test_job(self):
        def do_progress(progress, desc=''):
            log.append(progress)
            return True

        s = Scanner()
        log = []
        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
        r = s.GetDupeGroups(f, job.Job(1, do_progress))
        eq_(log[0], 0)
        eq_(log[-1], 100)

    def test_mix_file_kind(self):
        s = Scanner()
        s.mix_file_kind = False
        f = [no('foo.1'), no('foo.2')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 0)

    def test_word_weighting(self):
        s = Scanner()
        s.min_match_percentage = 75
        s.word_weighting = True
        f = [no('foo bar'), no('foo bar bleh')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        g = r[0]
        m = g.get_match_of(g.dupes[0])
        eq_(m.percentage, 75) # 16 letters, 12 matching

    def test_similar_words(self):
        s = Scanner()
        s.match_similar_words = True
        f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 2)

    def test_fields(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_FIELDS
        f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 0)

    def test_fields_no_order(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
        f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)

    def test_tag_scan(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        o1 = no('foo')
        o2 = no('bar')
        o1.artist = 'The White Stripes'
        o1.title = 'The Air Near My Fingers'
        o2.artist = 'The White Stripes'
        o2.title = 'The Air Near My Fingers'
        r = s.GetDupeGroups([o1,o2])
        eq_(len(r), 1)

    def test_tag_with_album_scan(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['artist', 'album', 'title'])
        o1 = no('foo')
        o2 = no('bar')
        o3 = no('bleh')
        o1.artist = 'The White Stripes'
        o1.title = 'The Air Near My Fingers'
        o1.album = 'Elephant'
        o2.artist = 'The White Stripes'
        o2.title = 'The Air Near My Fingers'
        o2.album = 'Elephant'
        o3.artist = 'The White Stripes'
        o3.title = 'The Air Near My Fingers'
        o3.album = 'foobar'
        r = s.GetDupeGroups([o1,o2,o3])
        eq_(len(r), 1)

    def test_that_dash_in_tags_dont_create_new_fields(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['artist', 'album', 'title'])
        s.min_match_percentage = 50
        o1 = no('foo')
        o2 = no('bar')
        o1.artist = 'The White Stripes - a'
        o1.title = 'The Air Near My Fingers - a'
        o1.album = 'Elephant - a'
        o2.artist = 'The White Stripes - b'
        o2.title = 'The Air Near My Fingers - b'
        o2.album = 'Elephant - b'
        r = s.GetDupeGroups([o1,o2])
        eq_(len(r), 1)

    def test_tag_scan_with_different_scanned(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['track', 'year'])
        o1 = no('foo')
        o2 = no('bar')
        o1.artist = 'The White Stripes'
        o1.title = 'some title'
        o1.track = 'foo'
        o1.year = 'bar'
        o2.artist = 'The White Stripes'
        o2.title = 'another title'
        o2.track = 'foo'
        o2.year = 'bar'
        r = s.GetDupeGroups([o1, o2])
        eq_(len(r), 1)

    def test_tag_scan_only_scans_existing_tags(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['artist', 'foo'])
        o1 = no('foo')
        o2 = no('bar')
        o1.artist = 'The White Stripes'
        o1.foo = 'foo'
        o2.artist = 'The White Stripes'
        o2.foo = 'bar'
        r = s.GetDupeGroups([o1, o2])
        eq_(len(r), 1) # Because 'foo' is not scanned, they match

    def test_tag_scan_converts_to_str(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['track'])
        o1 = no('foo')
        o2 = no('bar')
        o1.track = 42
        o2.track = 42
        try:
            r = s.GetDupeGroups([o1, o2])
        except TypeError:
            raise AssertionError()
        eq_(len(r), 1)

    def test_tag_scan_non_ascii(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['title'])
        o1 = no('foo')
        o2 = no('bar')
        o1.title = u'foobar\u00e9'
        o2.title = u'foobar\u00e9'
        try:
            r = s.GetDupeGroups([o1, o2])
        except UnicodeEncodeError:
            raise AssertionError()
        eq_(len(r), 1)

    def test_audio_content_scan(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT_AUDIO
        f = [no('foo'), no('bar'), no('bleh')]
        f[0].md5 = 'foo'
        f[1].md5 = 'bar'
        f[2].md5 = 'bleh'
        f[0].md5partial = 'foo'
        f[1].md5partial = 'foo'
        f[2].md5partial = 'bleh'
        f[0].audiosize = 1
        f[1].audiosize = 1
        f[2].audiosize = 1
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(len(r[0]), 2)

    def test_audio_content_scan_compare_sizes_first(self):
        class MyFile(no):
            @property
            def md5partial(file):
                raise AssertionError()

        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT_AUDIO
        f = [MyFile('foo'), MyFile('bar')]
        f[0].audiosize = 1
        f[1].audiosize = 2
        eq_(len(s.GetDupeGroups(f)), 0)

    def test_ignore_list(self):
        s = Scanner()
        f1 = no('foobar')
        f2 = no('foobar')
        f3 = no('foobar')
        f1.path = Path('dir1/foobar')
        f2.path = Path('dir2/foobar')
        f3.path = Path('dir3/foobar')
        s.ignore_list.Ignore(str(f1.path),str(f2.path))
        s.ignore_list.Ignore(str(f1.path),str(f3.path))
        r = s.GetDupeGroups([f1,f2,f3])
        eq_(len(r), 1)
        g = r[0]
        eq_(len(g.dupes), 1)
        assert f1 not in g
        assert f2 in g
        assert f3 in g
        # Ignored matches are not counted as discarded
        eq_(s.discarded_file_count, 0)

    def test_ignore_list_checks_for_unicode(self):
        #scanner was calling path_str for ignore list checks. Since the Path changes, it must
        #be unicode(path)
        s = Scanner()
        f1 = no('foobar')
        f2 = no('foobar')
        f3 = no('foobar')
        f1.path = Path(u'foo1\u00e9')
        f2.path = Path(u'foo2\u00e9')
        f3.path = Path(u'foo3\u00e9')
        s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
        s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
        r = s.GetDupeGroups([f1,f2,f3])
        eq_(len(r), 1)
        g = r[0]
        eq_(len(g.dupes), 1)
        assert f1 not in g
        assert f2 in g
        assert f3 in g

    def test_file_evaluates_to_false(self):
        # A very wrong way to use any() was added at some point, causing resulting group list
        # to be empty.
        class FalseNamedObject(NamedObject):
            def __nonzero__(self):
                return False


        s = Scanner()
        f1 = FalseNamedObject('foobar')
        f2 = FalseNamedObject('foobar')
        r = s.GetDupeGroups([f1, f2])
        eq_(len(r), 1)

    def test_size_threshold(self):
        # Only file equal or higher than the size_threshold in size are scanned
        s = Scanner()
        f1 = no('foo', 1)
        f2 = no('foo', 2)
        f3 = no('foo', 3)
        s.size_threshold = 2
        groups = s.GetDupeGroups([f1,f2,f3])
        eq_(len(groups), 1)
        [group] = groups
        eq_(len(group), 2)
        assert f1 not in group
        assert f2 in group
        assert f3 in group

    def test_tie_breaker_path_deepness(self):
        # If there is a tie in prioritization, path deepness is used as a tie breaker
        s = Scanner()
        o1, o2 = no('foo'), no('foo')
        o1.path = Path('foo')
        o2.path = Path('foo/bar')
        [group] = s.GetDupeGroups([o1, o2])
        assert group.ref is o2

    def test_tie_breaker_copy(self):
        # if copy is in the words used (even if it has a deeper path), it becomes a dupe
        s = Scanner()
        o1, o2 = no('foo bar Copy'), no('foo bar')
        o1.path = Path('deeper/path')
        o2.path = Path('foo')
        [group] = s.GetDupeGroups([o1, o2])
        assert group.ref is o2

    def test_tie_breaker_same_name_plus_digit(self):
        # if ref has the same words as dupe, but has some just one extra word which is a digit, it
        # becomes a dupe
        s = Scanner()
        o1, o2 = no('foo bar 42'), no('foo bar')
        o1.path = Path('deeper/path')
        o2.path = Path('foo')
        [group] = s.GetDupeGroups([o1, o2])
        assert group.ref is o2

    def test_partial_group_match(self):
        # Count the number od discarded matches (when a file doesn't match all other dupes of the
        # group) in Scanner.discarded_file_count
        s = Scanner()
        o1, o2, o3 = no('a b'), no('a'), no('b')
        s.min_match_percentage = 50
        [group] = s.GetDupeGroups([o1, o2, o3])
        eq_(len(group), 2)
        assert o1 in group
        assert o2 in group
        assert o3 not in group
        eq_(s.discarded_file_count, 1)


class ScannerTest(TestCase):
    def test_dont_group_files_that_dont_exist(self):
        # when creating groups, check that files exist first. It's possible that these files have
        # been moved during the scan by the user.
        # In this test, we have to delete one of the files between the get_matches() part and the
        # get_groups() part.
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        p = self.tmppath()
        io.open(p + 'file1', 'w').write('foo')
        io.open(p + 'file2', 'w').write('foo')
        file1, file2 = fs.get_files(p)
        def getmatches(*args, **kw):
            io.remove(file2.path)
            return [Match(file1, file2, 100)]
        s._getmatches = getmatches

        assert not s.GetDupeGroups([file1, file2])