diff --git a/base/py/engine.py b/base/py/engine.py
index b34f2edd..173cbab3 100644
--- a/base/py/engine.py
+++ b/base/py/engine.py
@@ -208,7 +208,9 @@ def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob)
     j = j.start_subjob([2, 8])
     size2files = defaultdict(set)
     for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
-        size2files[getattr(file, sizeattr)].add(file)
+        filesize = getattr(file, sizeattr)
+        if filesize:
+            size2files[filesize].add(file)
     possible_matches = [files for files in size2files.values() if len(files) > 1]
     del size2files
     result = []
diff --git a/base/py/scanner.py b/base/py/scanner.py
index 3f999920..9cd6b21a 100644
--- a/base/py/scanner.py
+++ b/base/py/scanner.py
@@ -10,7 +10,7 @@
 
 import logging
 
-from hsutil import job
+from hsutil import job, io
 from hsutil.misc import dedupe
 from hsutil.str import get_file_ext, rem_file_ext
 
@@ -80,9 +80,10 @@ class Scanner(object):
         logging.info('Getting matches')
         matches = self._getmatches(files, j)
         logging.info('Found %d matches' % len(matches))
+        j.set_progress(100, 'Removing false matches')
         if not self.mix_file_kind:
-            j.set_progress(100, 'Removing false matches')
             matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
+        matches = [m for m in matches if io.exists(m.first.path) and io.exists(m.second.path)]
         if self.ignore_list:
             j = j.start_subjob(2)
             iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
diff --git a/base/py/tests/engine_test.py b/base/py/tests/engine_test.py
index 1c3366bc..48e4f1d8 100644
--- a/base/py/tests/engine_test.py
+++ b/base/py/tests/engine_test.py
@@ -15,16 +15,21 @@
 from hsutil import job
 from hsutil.decorators import log_calls
 from hsutil.testcase import TestCase
 
-from .. import engine
+from .. import engine, fs
 from ..engine import *
 
 class NamedObject(object):
-    def __init__(self, name="foobar", with_words=False):
+    def __init__(self, name="foobar", with_words=False, size=1):
         self.name = name
+        self.size = size
+        self.md5partial = name
+        self.md5 = name
         if with_words:
             self.words = getwords(name)
 
+no = NamedObject
+
 def get_match_triangle():
     o1 = NamedObject(with_words=True)
     o2 = NamedObject(with_words=True)
@@ -486,6 +491,12 @@ class GetMatches(TestCase):
         self.assertEqual(42, len(r))
 
 
+class GetMatchesByContents(TestCase):
+    def test_dont_compare_empty_files(self):
+        o1, o2 = no(size=0), no(size=0)
+        assert not getmatches_by_contents([o1, o2])
+
+
 class TCGroup(TestCase):
     def test_empy(self):
         g = Group()
diff --git a/base/py/tests/results_test.py b/base/py/tests/results_test.py
index f3602b7c..086dc91d 100644
--- a/base/py/tests/results_test.py
+++ b/base/py/tests/results_test.py
@@ -21,7 +21,6 @@
 from .. import engine
 from ..results import *
 
 class NamedObject(engine_test.NamedObject):
-    size = 1
     path = property(lambda x:Path('basepath') + x.name)
     is_ref = False
diff --git a/base/py/tests/scanner_test.py b/base/py/tests/scanner_test.py
index 1ce0f8f7..6113be7b 100644
--- a/base/py/tests/scanner_test.py
+++ b/base/py/tests/scanner_test.py
@@ -9,9 +9,11 @@
 
 from nose.tools import eq_
 
-from hsutil import job
+from hsutil import job, io
 from hsutil.path import Path
+from hsutil.testcase import TestCase
 
+from .. import fs
 from ..engine import getwords, Match
 from ..ignore import IgnoreList
 from ..scanner import *
@@ -27,412 +29,439 @@ class NamedObject(object):
 
 no = NamedObject
 
 #--- Scanner
-def test_empty():
-    s = Scanner()
-    r = s.GetDupeGroups([])
-    eq_(r, [])
-
-def test_default_settings():
-    s = Scanner()
-    eq_(s.min_match_percentage, 80)
-    eq_(s.scan_type, SCAN_TYPE_FILENAME)
-    eq_(s.mix_file_kind, True)
-    eq_(s.word_weighting, False)
-    eq_(s.match_similar_words, False)
-    assert isinstance(s.ignore_list, IgnoreList)
-
-def test_simple_with_default_settings():
-    s = Scanner()
-    f = [no('foo bar'), no('foo bar'), no('foo bleh')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    g = r[0]
-    #'foo bleh' cannot be in the group because the default min match % is 80
-    eq_(len(g), 2)
-    assert g.ref in f[:2]
-    assert g.dupes[0] in f[:2]
-
-def test_simple_with_lower_min_match():
-    s = Scanner()
-    s.min_match_percentage = 50
-    f = [no('foo bar'), no('foo bar'), no('foo bleh')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    g = r[0]
-    eq_(len(g), 3)
-
-def test_trim_all_ref_groups():
-    # When all files of a group are ref, don't include that group in the results, but also don't
-    # count the files from that group as discarded.
-    s = Scanner()
-    f = [no('foo'), no('foo'), no('bar'), no('bar')]
-    f[2].is_ref = True
-    f[3].is_ref = True
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(s.discarded_file_count, 0)
-
-def test_priorize():
-    s = Scanner()
-    f = [no('foo'), no('foo'), no('bar'), no('bar')]
-    f[1].size = 2
-    f[2].size = 3
-    f[3].is_ref = True
-    r = s.GetDupeGroups(f)
-    g1, g2 = r
-    assert f[1] in (g1.ref,g2.ref)
-    assert f[0] in (g1.dupes[0],g2.dupes[0])
-    assert f[3] in (g1.ref,g2.ref)
-    assert f[2] in (g1.dupes[0],g2.dupes[0])
-
-def test_content_scan():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT
-    f = [no('foo'), no('bar'), no('bleh')]
-    f[0].md5 = f[0].md5partial = 'foobar'
-    f[1].md5 = f[1].md5partial = 'foobar'
-    f[2].md5 = f[2].md5partial = 'bleh'
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(len(r[0]), 2)
-    eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!
 
-def test_content_scan_compare_sizes_first():
-    class MyFile(no):
-        @property
-        def md5(file):
-            raise AssertionError()
+class ScannerTestFakeFiles(TestCase):
+    def setUp(self):
+        # This is a hack to avoid invalidating all previous tests since the scanner started to test
+        # for file existence before doing the match grouping.
+        self.mock(io, 'exists', lambda _: True)
 
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT
-    f = [MyFile('foo', 1), MyFile('bar', 2)]
-    eq_(len(s.GetDupeGroups(f)), 0)
-
-def test_min_match_perc_doesnt_matter_for_content_scan():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT
-    f = [no('foo'), no('bar'), no('bleh')]
-    f[0].md5 = f[0].md5partial = 'foobar'
-    f[1].md5 = f[1].md5partial = 'foobar'
-    f[2].md5 = f[2].md5partial = 'bleh'
-    s.min_match_percentage = 101
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(len(r[0]), 2)
-    s.min_match_percentage = 0
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(len(r[0]), 2)
-
-def test_content_scan_doesnt_put_md5_in_words_at_the_end():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT
-    f = [no('foo'),no('bar')]
-    f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
-    f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
-    r = s.GetDupeGroups(f)
-    g = r[0]
-
-def test_extension_is_not_counted_in_filename_scan():
-    s = Scanner()
-    s.min_match_percentage = 100
-    f = [no('foo.bar'), no('foo.bleh')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(len(r[0]), 2)
-
-def test_job():
-    def do_progress(progress, desc=''):
-        log.append(progress)
-        return True
+    def test_empty(self):
+        s = Scanner()
+        r = s.GetDupeGroups([])
+        eq_(r, [])
 
-    s = Scanner()
-    log = []
-    f = [no('foo bar'), no('foo bar'), no('foo bleh')]
-    r = s.GetDupeGroups(f, job.Job(1, do_progress))
-    eq_(log[0], 0)
-    eq_(log[-1], 100)
-
-def test_mix_file_kind():
-    s = Scanner()
-    s.mix_file_kind = False
-    f = [no('foo.1'), no('foo.2')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 0)
-
-def test_word_weighting():
-    s = Scanner()
-    s.min_match_percentage = 75
-    s.word_weighting = True
-    f = [no('foo bar'), no('foo bar bleh')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    g = r[0]
-    m = g.get_match_of(g.dupes[0])
-    eq_(m.percentage, 75) # 16 letters, 12 matching
-
-def test_similar_words():
-    s = Scanner()
-    s.match_similar_words = True
-    f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 2)
-
-def test_fields():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_FIELDS
-    f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 0)
-
-def test_fields_no_order():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
-    f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')]
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-
-def test_tag_scan():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.artist = 'The White Stripes'
-    o1.title = 'The Air Near My Fingers'
-    o2.artist = 'The White Stripes'
-    o2.title = 'The Air Near My Fingers'
-    r = s.GetDupeGroups([o1,o2])
-    eq_(len(r), 1)
-
-def test_tag_with_album_scan():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['artist', 'album', 'title'])
-    o1 = no('foo')
-    o2 = no('bar')
-    o3 = no('bleh')
-    o1.artist = 'The White Stripes'
-    o1.title = 'The Air Near My Fingers'
-    o1.album = 'Elephant'
-    o2.artist = 'The White Stripes'
-    o2.title = 'The Air Near My Fingers'
-    o2.album = 'Elephant'
-    o3.artist = 'The White Stripes'
-    o3.title = 'The Air Near My Fingers'
-    o3.album = 'foobar'
-    r = s.GetDupeGroups([o1,o2,o3])
-    eq_(len(r), 1)
-
-def test_that_dash_in_tags_dont_create_new_fields():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['artist', 'album', 'title'])
-    s.min_match_percentage = 50
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.artist = 'The White Stripes - a'
-    o1.title = 'The Air Near My Fingers - a'
-    o1.album = 'Elephant - a'
-    o2.artist = 'The White Stripes - b'
-    o2.title = 'The Air Near My Fingers - b'
-    o2.album = 'Elephant - b'
-    r = s.GetDupeGroups([o1,o2])
-    eq_(len(r), 1)
-
-def test_tag_scan_with_different_scanned():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['track', 'year'])
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.artist = 'The White Stripes'
-    o1.title = 'some title'
-    o1.track = 'foo'
-    o1.year = 'bar'
-    o2.artist = 'The White Stripes'
-    o2.title = 'another title'
-    o2.track = 'foo'
-    o2.year = 'bar'
-    r = s.GetDupeGroups([o1, o2])
-    eq_(len(r), 1)
-
-def test_tag_scan_only_scans_existing_tags():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['artist', 'foo'])
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.artist = 'The White Stripes'
-    o1.foo = 'foo'
-    o2.artist = 'The White Stripes'
-    o2.foo = 'bar'
-    r = s.GetDupeGroups([o1, o2])
-    eq_(len(r), 1) # Because 'foo' is not scanned, they match
-
-def test_tag_scan_converts_to_str():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['track'])
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.track = 42
-    o2.track = 42
-    try:
+    def test_default_settings(self):
+        s = Scanner()
+        eq_(s.min_match_percentage, 80)
+        eq_(s.scan_type, SCAN_TYPE_FILENAME)
+        eq_(s.mix_file_kind, True)
+        eq_(s.word_weighting, False)
+        eq_(s.match_similar_words, False)
+        assert isinstance(s.ignore_list, IgnoreList)
+
+    def test_simple_with_default_settings(self):
+        s = Scanner()
+        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        g = r[0]
+        #'foo bleh' cannot be in the group because the default min match % is 80
+        eq_(len(g), 2)
+        assert g.ref in f[:2]
+        assert g.dupes[0] in f[:2]
+
+    def test_simple_with_lower_min_match(self):
+        s = Scanner()
+        s.min_match_percentage = 50
+        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        g = r[0]
+        eq_(len(g), 3)
+
+    def test_trim_all_ref_groups(self):
+        # When all files of a group are ref, don't include that group in the results, but also don't
+        # count the files from that group as discarded.
+        s = Scanner()
+        f = [no('foo'), no('foo'), no('bar'), no('bar')]
+        f[2].is_ref = True
+        f[3].is_ref = True
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(s.discarded_file_count, 0)
+
+    def test_priorize(self):
+        s = Scanner()
+        f = [no('foo'), no('foo'), no('bar'), no('bar')]
+        f[1].size = 2
+        f[2].size = 3
+        f[3].is_ref = True
+        r = s.GetDupeGroups(f)
+        g1, g2 = r
+        assert f[1] in (g1.ref,g2.ref)
+        assert f[0] in (g1.dupes[0],g2.dupes[0])
+        assert f[3] in (g1.ref,g2.ref)
+        assert f[2] in (g1.dupes[0],g2.dupes[0])
+
+    def test_content_scan(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT
+        f = [no('foo'), no('bar'), no('bleh')]
+        f[0].md5 = f[0].md5partial = 'foobar'
+        f[1].md5 = f[1].md5partial = 'foobar'
+        f[2].md5 = f[2].md5partial = 'bleh'
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(len(r[0]), 2)
+        eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!
+
+    def test_content_scan_compare_sizes_first(self):
+        class MyFile(no):
+            @property
+            def md5(file):
+                raise AssertionError()
+
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT
+        f = [MyFile('foo', 1), MyFile('bar', 2)]
+        eq_(len(s.GetDupeGroups(f)), 0)
+
+    def test_min_match_perc_doesnt_matter_for_content_scan(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT
+        f = [no('foo'), no('bar'), no('bleh')]
+        f[0].md5 = f[0].md5partial = 'foobar'
+        f[1].md5 = f[1].md5partial = 'foobar'
+        f[2].md5 = f[2].md5partial = 'bleh'
+        s.min_match_percentage = 101
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(len(r[0]), 2)
+        s.min_match_percentage = 0
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(len(r[0]), 2)
+
+    def test_content_scan_doesnt_put_md5_in_words_at_the_end(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT
+        f = [no('foo'),no('bar')]
+        f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
+        f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
+        r = s.GetDupeGroups(f)
+        g = r[0]
+
+    def test_extension_is_not_counted_in_filename_scan(self):
+        s = Scanner()
+        s.min_match_percentage = 100
+        f = [no('foo.bar'), no('foo.bleh')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(len(r[0]), 2)
+
+    def test_job(self):
+        def do_progress(progress, desc=''):
+            log.append(progress)
+            return True
+
+        s = Scanner()
+        log = []
+        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
+        r = s.GetDupeGroups(f, job.Job(1, do_progress))
+        eq_(log[0], 0)
+        eq_(log[-1], 100)
+
+    def test_mix_file_kind(self):
+        s = Scanner()
+        s.mix_file_kind = False
+        f = [no('foo.1'), no('foo.2')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 0)
+
+    def test_word_weighting(self):
+        s = Scanner()
+        s.min_match_percentage = 75
+        s.word_weighting = True
+        f = [no('foo bar'), no('foo bar bleh')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        g = r[0]
+        m = g.get_match_of(g.dupes[0])
+        eq_(m.percentage, 75) # 16 letters, 12 matching
+
+    def test_similar_words(self):
+        s = Scanner()
+        s.match_similar_words = True
+        f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 2)
+
+    def test_fields(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_FIELDS
+        f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 0)
+
+    def test_fields_no_order(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
+        f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')]
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+
+    def test_tag_scan(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.artist = 'The White Stripes'
+        o1.title = 'The Air Near My Fingers'
+        o2.artist = 'The White Stripes'
+        o2.title = 'The Air Near My Fingers'
+        r = s.GetDupeGroups([o1,o2])
+        eq_(len(r), 1)
+
+    def test_tag_with_album_scan(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['artist', 'album', 'title'])
+        o1 = no('foo')
+        o2 = no('bar')
+        o3 = no('bleh')
+        o1.artist = 'The White Stripes'
+        o1.title = 'The Air Near My Fingers'
+        o1.album = 'Elephant'
+        o2.artist = 'The White Stripes'
+        o2.title = 'The Air Near My Fingers'
+        o2.album = 'Elephant'
+        o3.artist = 'The White Stripes'
+        o3.title = 'The Air Near My Fingers'
+        o3.album = 'foobar'
+        r = s.GetDupeGroups([o1,o2,o3])
+        eq_(len(r), 1)
+
+    def test_that_dash_in_tags_dont_create_new_fields(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['artist', 'album', 'title'])
+        s.min_match_percentage = 50
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.artist = 'The White Stripes - a'
+        o1.title = 'The Air Near My Fingers - a'
+        o1.album = 'Elephant - a'
+        o2.artist = 'The White Stripes - b'
+        o2.title = 'The Air Near My Fingers - b'
+        o2.album = 'Elephant - b'
+        r = s.GetDupeGroups([o1,o2])
+        eq_(len(r), 1)
+
+    def test_tag_scan_with_different_scanned(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['track', 'year'])
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.artist = 'The White Stripes'
+        o1.title = 'some title'
+        o1.track = 'foo'
+        o1.year = 'bar'
+        o2.artist = 'The White Stripes'
+        o2.title = 'another title'
+        o2.track = 'foo'
+        o2.year = 'bar'
         r = s.GetDupeGroups([o1, o2])
-    except TypeError:
-        raise AssertionError()
-    eq_(len(r), 1)
-
-def test_tag_scan_non_ascii():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_TAG
-    s.scanned_tags = set(['title'])
-    o1 = no('foo')
-    o2 = no('bar')
-    o1.title = u'foobar\u00e9'
-    o2.title = u'foobar\u00e9'
-    try:
+        eq_(len(r), 1)
+
+    def test_tag_scan_only_scans_existing_tags(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['artist', 'foo'])
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.artist = 'The White Stripes'
+        o1.foo = 'foo'
+        o2.artist = 'The White Stripes'
+        o2.foo = 'bar'
         r = s.GetDupeGroups([o1, o2])
-    except UnicodeEncodeError:
-        raise AssertionError()
-    eq_(len(r), 1)
-
-def test_audio_content_scan():
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT_AUDIO
-    f = [no('foo'), no('bar'), no('bleh')]
-    f[0].md5 = 'foo'
-    f[1].md5 = 'bar'
-    f[2].md5 = 'bleh'
-    f[0].md5partial = 'foo'
-    f[1].md5partial = 'foo'
-    f[2].md5partial = 'bleh'
-    f[0].audiosize = 1
-    f[1].audiosize = 1
-    f[2].audiosize = 1
-    r = s.GetDupeGroups(f)
-    eq_(len(r), 1)
-    eq_(len(r[0]), 2)
+        eq_(len(r), 1) # Because 'foo' is not scanned, they match
 
-def test_audio_content_scan_compare_sizes_first():
-    class MyFile(no):
-        @property
-        def md5partial(file):
+    def test_tag_scan_converts_to_str(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['track'])
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.track = 42
+        o2.track = 42
+        try:
+            r = s.GetDupeGroups([o1, o2])
+        except TypeError:
             raise AssertionError()
+        eq_(len(r), 1)
 
-    s = Scanner()
-    s.scan_type = SCAN_TYPE_CONTENT_AUDIO
-    f = [MyFile('foo'), MyFile('bar')]
-    f[0].audiosize = 1
-    f[1].audiosize = 2
-    eq_(len(s.GetDupeGroups(f)), 0)
-
-def test_ignore_list():
-    s = Scanner()
-    f1 = no('foobar')
-    f2 = no('foobar')
-    f3 = no('foobar')
-    f1.path = Path('dir1/foobar')
-    f2.path = Path('dir2/foobar')
-    f3.path = Path('dir3/foobar')
-    s.ignore_list.Ignore(str(f1.path),str(f2.path))
-    s.ignore_list.Ignore(str(f1.path),str(f3.path))
-    r = s.GetDupeGroups([f1,f2,f3])
-    eq_(len(r), 1)
-    g = r[0]
-    eq_(len(g.dupes), 1)
-    assert f1 not in g
-    assert f2 in g
-    assert f3 in g
-    # Ignored matches are not counted as discarded
-    eq_(s.discarded_file_count, 0)
-
-def test_ignore_list_checks_for_unicode():
-    #scanner was calling path_str for ignore list checks. Since the Path changes, it must
-    #be unicode(path)
-    s = Scanner()
-    f1 = no('foobar')
-    f2 = no('foobar')
-    f3 = no('foobar')
-    f1.path = Path(u'foo1\u00e9')
-    f2.path = Path(u'foo2\u00e9')
-    f3.path = Path(u'foo3\u00e9')
-    s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
-    s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
-    r = s.GetDupeGroups([f1,f2,f3])
-    eq_(len(r), 1)
-    g = r[0]
-    eq_(len(g.dupes), 1)
-    assert f1 not in g
-    assert f2 in g
-    assert f3 in g
-
-def test_file_evaluates_to_false():
-    # A very wrong way to use any() was added at some point, causing resulting group list
-    # to be empty.
-    class FalseNamedObject(NamedObject):
-        def __nonzero__(self):
-            return False
+    def test_tag_scan_non_ascii(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_TAG
+        s.scanned_tags = set(['title'])
+        o1 = no('foo')
+        o2 = no('bar')
+        o1.title = u'foobar\u00e9'
+        o2.title = u'foobar\u00e9'
+        try:
+            r = s.GetDupeGroups([o1, o2])
+        except UnicodeEncodeError:
+            raise AssertionError()
+        eq_(len(r), 1)
+
+    def test_audio_content_scan(self):
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT_AUDIO
+        f = [no('foo'), no('bar'), no('bleh')]
+        f[0].md5 = 'foo'
+        f[1].md5 = 'bar'
+        f[2].md5 = 'bleh'
+        f[0].md5partial = 'foo'
+        f[1].md5partial = 'foo'
+        f[2].md5partial = 'bleh'
+        f[0].audiosize = 1
+        f[1].audiosize = 1
+        f[2].audiosize = 1
+        r = s.GetDupeGroups(f)
+        eq_(len(r), 1)
+        eq_(len(r[0]), 2)
+
+    def test_audio_content_scan_compare_sizes_first(self):
+        class MyFile(no):
+            @property
+            def md5partial(file):
+                raise AssertionError()
+
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT_AUDIO
+        f = [MyFile('foo'), MyFile('bar')]
+        f[0].audiosize = 1
+        f[1].audiosize = 2
+        eq_(len(s.GetDupeGroups(f)), 0)
+
+    def test_ignore_list(self):
+        s = Scanner()
+        f1 = no('foobar')
+        f2 = no('foobar')
+        f3 = no('foobar')
+        f1.path = Path('dir1/foobar')
+        f2.path = Path('dir2/foobar')
+        f3.path = Path('dir3/foobar')
+        s.ignore_list.Ignore(str(f1.path),str(f2.path))
+        s.ignore_list.Ignore(str(f1.path),str(f3.path))
+        r = s.GetDupeGroups([f1,f2,f3])
+        eq_(len(r), 1)
+        g = r[0]
+        eq_(len(g.dupes), 1)
+        assert f1 not in g
+        assert f2 in g
+        assert f3 in g
+        # Ignored matches are not counted as discarded
+        eq_(s.discarded_file_count, 0)
+
+    def test_ignore_list_checks_for_unicode(self):
+        #scanner was calling path_str for ignore list checks. Since the Path changes, it must
+        #be unicode(path)
+        s = Scanner()
+        f1 = no('foobar')
+        f2 = no('foobar')
+        f3 = no('foobar')
+        f1.path = Path(u'foo1\u00e9')
+        f2.path = Path(u'foo2\u00e9')
+        f3.path = Path(u'foo3\u00e9')
+        s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
+        s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
+        r = s.GetDupeGroups([f1,f2,f3])
+        eq_(len(r), 1)
+        g = r[0]
+        eq_(len(g.dupes), 1)
+        assert f1 not in g
+        assert f2 in g
+        assert f3 in g
+
+    def test_file_evaluates_to_false(self):
+        # A very wrong way to use any() was added at some point, causing the resulting group list
+        # to be empty.
+        class FalseNamedObject(NamedObject):
+            def __nonzero__(self):
+                return False
 
-    s = Scanner()
-    f1 = FalseNamedObject('foobar')
-    f2 = FalseNamedObject('foobar')
-    r = s.GetDupeGroups([f1, f2])
-    eq_(len(r), 1)
-
-def test_size_threshold():
-    # Only file equal or higher than the size_threshold in size are scanned
-    s = Scanner()
-    f1 = no('foo', 1)
-    f2 = no('foo', 2)
-    f3 = no('foo', 3)
-    s.size_threshold = 2
-    groups = s.GetDupeGroups([f1,f2,f3])
-    eq_(len(groups), 1)
-    [group] = groups
-    eq_(len(group), 2)
-    assert f1 not in group
-    assert f2 in group
-    assert f3 in group
-
-def test_tie_breaker_path_deepness():
-    # If there is a tie in prioritization, path deepness is used as a tie breaker
-    s = Scanner()
-    o1, o2 = no('foo'), no('foo')
-    o1.path = Path('foo')
-    o2.path = Path('foo/bar')
-    [group] = s.GetDupeGroups([o1, o2])
-    assert group.ref is o2
-
-def test_tie_breaker_copy():
-    # if copy is in the words used (even if it has a deeper path), it becomes a dupe
-    s = Scanner()
-    o1, o2 = no('foo bar Copy'), no('foo bar')
-    o1.path = Path('deeper/path')
-    o2.path = Path('foo')
-    [group] = s.GetDupeGroups([o1, o2])
-    assert group.ref is o2
-
-def test_tie_breaker_same_name_plus_digit():
-    # if ref has the same words as dupe, but has some just one extra word which is a digit, it
-    # becomes a dupe
-    s = Scanner()
-    o1, o2 = no('foo bar 42'), no('foo bar')
-    o1.path = Path('deeper/path')
-    o2.path = Path('foo')
-    [group] = s.GetDupeGroups([o1, o2])
-    assert group.ref is o2
-
-def test_partial_group_match():
-    # Count the number od discarded matches (when a file doesn't match all other dupes of the
-    # group) in Scanner.discarded_file_count
-    s = Scanner()
-    o1, o2, o3 = no('a b'), no('a'), no('b')
-    s.min_match_percentage = 50
-    [group] = s.GetDupeGroups([o1, o2, o3])
-    eq_(len(group), 2)
-    assert o1 in group
-    assert o2 in group
-    assert o3 not in group
-    eq_(s.discarded_file_count, 1)
+        s = Scanner()
+        f1 = FalseNamedObject('foobar')
+        f2 = FalseNamedObject('foobar')
+        r = s.GetDupeGroups([f1, f2])
+        eq_(len(r), 1)
+
+    def test_size_threshold(self):
+        # Only files with a size equal to or higher than size_threshold are scanned
+        s = Scanner()
+        f1 = no('foo', 1)
+        f2 = no('foo', 2)
+        f3 = no('foo', 3)
+        s.size_threshold = 2
+        groups = s.GetDupeGroups([f1,f2,f3])
+        eq_(len(groups), 1)
+        [group] = groups
+        eq_(len(group), 2)
+        assert f1 not in group
+        assert f2 in group
+        assert f3 in group
+
+    def test_tie_breaker_path_deepness(self):
+        # If there is a tie in prioritization, path deepness is used as a tie breaker
+        s = Scanner()
+        o1, o2 = no('foo'), no('foo')
+        o1.path = Path('foo')
+        o2.path = Path('foo/bar')
+        [group] = s.GetDupeGroups([o1, o2])
+        assert group.ref is o2
+
+    def test_tie_breaker_copy(self):
+        # if copy is in the words used (even if it has a deeper path), it becomes a dupe
+        s = Scanner()
+        o1, o2 = no('foo bar Copy'), no('foo bar')
+        o1.path = Path('deeper/path')
+        o2.path = Path('foo')
+        [group] = s.GetDupeGroups([o1, o2])
+        assert group.ref is o2
+
+    def test_tie_breaker_same_name_plus_digit(self):
+        # if ref has the same words as dupe, but has just one extra word which is a digit, it
+        # becomes a dupe
+        s = Scanner()
+        o1, o2 = no('foo bar 42'), no('foo bar')
+        o1.path = Path('deeper/path')
+        o2.path = Path('foo')
+        [group] = s.GetDupeGroups([o1, o2])
+        assert group.ref is o2
+
+    def test_partial_group_match(self):
+        # Count the number of discarded matches (when a file doesn't match all other dupes of the
+        # group) in Scanner.discarded_file_count
+        s = Scanner()
+        o1, o2, o3 = no('a b'), no('a'), no('b')
+        s.min_match_percentage = 50
+        [group] = s.GetDupeGroups([o1, o2, o3])
+        eq_(len(group), 2)
+        assert o1 in group
+        assert o2 in group
+        assert o3 not in group
+        eq_(s.discarded_file_count, 1)
+
 
-class ScannerTest(TestCase):
-    def test_dont_group_files_that_dont_exist(self):
-        # when creating groups, check that files exist first. It's possible that these files have
-        # been moved during the scan by the user.
-        # In this test, we have to delete one of the files between the get_matches() part and the
-        # get_groups() part.
-        s = Scanner()
-        s.scan_type = SCAN_TYPE_CONTENT
-        p = self.tmppath()
-        io.open(p + 'file1', 'w').write('foo')
-        io.open(p + 'file2', 'w').write('foo')
-        file1, file2 = fs.get_files(p)
-        def getmatches(*args, **kw):
-            io.remove(file2.path)
-            return [Match(file1, file2, 100)]
-        s._getmatches = getmatches
-
-        assert not s.GetDupeGroups([file1, file2])
-
+-def test_size_threshold():
+-    # Only file equal or higher than the size_threshold in size are scanned
+-    s = Scanner()
+-    f1 = no('foo', 1)
+-    f2 = no('foo', 2)
+-    f3 = no('foo', 3)
+-    s.size_threshold = 2
+-    groups = s.GetDupeGroups([f1,f2,f3])
+-    eq_(len(groups), 1)
+-    [group] = groups
+-    eq_(len(group), 2)
+-    assert f1 not in group
+-    assert f2 in group
+-    assert f3 in group
+-
+-def test_tie_breaker_path_deepness():
+-    # If there is a tie in prioritization, path deepness is used as a tie breaker
+-    s = Scanner()
+-    o1, o2 = no('foo'), no('foo')
+-    o1.path = Path('foo')
+-    o2.path = Path('foo/bar')
+-    [group] = s.GetDupeGroups([o1, o2])
+-    assert group.ref is o2
+-
+-def test_tie_breaker_copy():
+-    # if copy is in the words used (even if it has a deeper path), it becomes a dupe
+-    s = Scanner()
+-    o1, o2 = no('foo bar Copy'), no('foo bar')
+-    o1.path = Path('deeper/path')
+-    o2.path = Path('foo')
+-    [group] = s.GetDupeGroups([o1, o2])
+-    assert group.ref is o2
+-
+-def test_tie_breaker_same_name_plus_digit():
+-    # if ref has the same words as dupe, but has some just one extra word which is a digit, it
+-    # becomes a dupe
+-    s = Scanner()
+-    o1, o2 = no('foo bar 42'), no('foo bar')
+-    o1.path = Path('deeper/path')
+-    o2.path = Path('foo')
+-    [group] = s.GetDupeGroups([o1, o2])
+-    assert group.ref is o2
+-
+-def test_partial_group_match():
+-    # Count the number od discarded matches (when a file doesn't match all other dupes of the
+-    # group) in Scanner.discarded_file_count
+-    s = Scanner()
+-    o1, o2, o3 = no('a b'), no('a'), no('b')
+-    s.min_match_percentage = 50
+-    [group] = s.GetDupeGroups([o1, o2, o3])
+-    eq_(len(group), 2)
+-    assert o1 in group
+-    assert o2 in group
+-    assert o3 not in group
+-    eq_(s.discarded_file_count, 1)
+class ScannerTest(TestCase):
+    def test_dont_group_files_that_dont_exist(self):
+        # when creating groups, check that files exist first. It's possible that these files have
+        # been moved during the scan by the user.
+        # In this test, we have to delete one of the files between the get_matches() part and the
+        # get_groups() part.
+        s = Scanner()
+        s.scan_type = SCAN_TYPE_CONTENT
+        p = self.tmppath()
+        io.open(p + 'file1', 'w').write('foo')
+        io.open(p + 'file2', 'w').write('foo')
+        file1, file2 = fs.get_files(p)
+        def getmatches(*args, **kw):
+            io.remove(file2.path)
+            return [Match(file1, file2, 100)]
+        s._getmatches = getmatches
+
+        assert not s.GetDupeGroups([file1, file2])
+
\ No newline at end of file
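
A note on the intent of the `getmatches_by_contents()` change above: size-0 files all have identical contents, so without the new `if filesize:` guard every empty file in a scan would match every other empty file (which is what `GetMatchesByContents.test_dont_compare_empty_files` pins down). Below is a minimal, self-contained sketch of that size pre-filter; `FakeFile` is a hypothetical stand-in for dupeGuru's real `fs` file classes, not part of the codebase.

```python
from collections import defaultdict

class FakeFile(object):
    """Hypothetical stand-in for an fs file object: just a name and a size."""
    def __init__(self, name, size):
        self.name = name
        self.size = size

def possible_content_matches(files):
    # Mirrors the size pre-filter in getmatches_by_contents(): only files
    # sharing the same non-zero size can be content matches, so empty files
    # are never compared at all.
    size2files = defaultdict(set)
    for f in files:
        if f.size:  # the guard added by this patch
            size2files[f.size].add(f)
    return [group for group in size2files.values() if len(group) > 1]

files = [FakeFile('a', 0), FakeFile('b', 0), FakeFile('c', 3), FakeFile('d', 3)]
groups = possible_content_matches(files)
assert len(groups) == 1  # the two empty files produce no group
assert set(f.name for f in groups[0]) == set(['c', 'd'])
```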
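The scanner-side change addresses robustness from the other direction: the user can move or delete files while a scan is running, so matches are re-checked against the filesystem before grouping (exercised by `ScannerTest.test_dont_group_files_that_dont_exist`, with `ScannerTestFakeFiles` mocking `io.exists` so the older tests stay valid). Here is a sketch of that filter under simplifying assumptions, using `os.path.exists` in place of `hsutil`'s `io.exists` and plain path strings where the real code passes file objects with a `.path` attribute:

```python
import os
from collections import namedtuple

# In the real code, first/second are file objects and the filter tests
# io.exists(m.first.path); plain strings keep this sketch standalone.
Match = namedtuple('Match', 'first second percentage')

def drop_stale_matches(matches):
    # Keep a match only if both of its files are still present on disk,
    # mirroring the list comprehension added to Scanner.GetDupeGroups().
    return [m for m in matches
            if os.path.exists(m.first) and os.path.exists(m.second)]
```

If one file of a pair is removed between matching and grouping, its match is dropped here and no group is ever created for it, which is exactly the behavior the new test asserts.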