dupeguru/core/scanner.py

# Created By: Virgil Dupras
# Created On: 2006/03/03
# Copyright 2012 Hardcoded Software (http://www.hardcoded.net)
# 
# This software is licensed under the "BSD" License as described in the "LICENSE" file, 
# which should be included with this package. The terms are also available at 
# http://www.hardcoded.net/licenses/bsd_license

import logging
import re
import os.path as op

from jobprogress import job
from hscommon import io
from hscommon.util import dedupe, rem_file_ext, get_file_ext
from hscommon.trans import tr

from . import engine
from .ignore import IgnoreList

# It's quite ugly to have the scan types from all editions put in the same class, but if we split
# them up, there will be some nasty bugs popping up (ScanType is used in core when it should
# exclusively be used in core_*). One day I'll clean this up.

class ScanType:
    # Integer identifiers for the available scan modes. The values are plain ints so they can be
    # compared with `==` and used as set members / dict keys.

    # Modes 0 through 6, numbered in declaration order.
    (Filename, Fields, FieldsNoOrder, Tag,
     Folders, Contents, ContentsAudio) = range(7)

    #PE
    FuzzyBlock, ExifTimestamp = 10, 11

# Names of the file attributes a Tag scan can compare. Scanner only reads the ones that are also
# present in its `scanned_tags` setting, in the order listed here.
SCANNABLE_TAGS = ['track', 'artist', 'album', 'title', 'genre', 'year']

# Matches a run of digits, either bare or wrapped in (), [] or {}.
RE_DIGIT_ENDING = re.compile(r'\d+|\(\d+\)|\[\d+\]|{\d+}')

def is_same_with_digit(name, refname):
    # Returns True if name is the same as refname, but with digits (with brackets or not) at the
    # end. Whitespace between refname and the digits, and after the digits, is ignored.
    # Returns False when name doesn't start with refname or when nothing follows refname.
    if not name.startswith(refname):
        return False
    end = name[len(refname):].strip()
    match = RE_DIGIT_ENDING.match(end)
    # match() only anchors at the start of the string. We also require the match to consume the
    # whole remainder, otherwise "foo 2 bar" would count as "foo" with a digit ending.
    return match is not None and match.end() == len(end)

def remove_dupe_paths(files):
    # Returns files with duplicates-by-path removed. Files with the exact same path are considered
    # duplicates and only the first file to have a path is kept. In certain cases, we have files
    # that have the same path, but not with the same case, that's why we normalize. However, we also
    # have case-sensitive filesystems, and in those, we don't want to falsely remove duplicates,
    # that's why we have a `samefile` mechanism.
    #
    # `files` is an iterable of objects with a `path` attribute (converted with `str()`). The
    # result is a new list, in the original order.
    result = []
    path2file = {} # lower-cased path -> first file seen with that path
    for f in files:
        normalized = str(f.path).lower()
        if normalized not in path2file:
            path2file[normalized] = f
            result.append(f)
            continue
        try:
            # Compare the *real* paths. The lower-cased path is only a lookup key: on a
            # case-sensitive filesystem it may not exist at all, or may designate another file.
            if op.samefile(str(f.path), str(path2file[normalized].path)):
                continue # same file, it's a dupe
        except OSError:
            continue # File doesn't exist? Well, treat them as dupes
        # Same path up to case, but two distinct files: we don't treat them as dupes.
        result.append(f)
    return result

class Scanner:
    # Finds groups of duplicate files. The scan is configured through the settings declared as
    # class attributes at the bottom of the class, which act as defaults.
    def __init__(self):
        self.ignore_list = IgnoreList()
        # Number of matched files that ended up in no group during the last scan (see
        # get_dupe_groups()).
        self.discarded_file_count = 0
    
    def _getmatches(self, files, j):
        # Returns the matches found by the engine among `files` for the current scan type.
        # `j` is the job object used for progress reporting.
        # Raises KeyError for scan types that are neither content-based nor in the word-based
        # dispatch table below.
        if self.size_threshold:
            j = j.start_subjob([2, 8])
            for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
                f.size # pre-read, makes a smoother progress if read here (especially for bundles)
            files = [f for f in files if f.size >= self.size_threshold]
        if self.scan_type in {ScanType.Contents, ScanType.ContentsAudio, ScanType.Folders}:
            sizeattr = 'audiosize' if self.scan_type == ScanType.ContentsAudio else 'size'
            return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==ScanType.ContentsAudio, j=j)
        else:
            j = j.start_subjob([2, 8])
            kw = {}
            kw['match_similar_words'] = self.match_similar_words
            kw['weight_words'] = self.word_weighting
            kw['min_match_percentage'] = self.min_match_percentage
            # FieldsNoOrder reads words exactly like Fields, only the matching option differs.
            # Work on a local copy: overwriting self.scan_type would silently turn the scanner
            # into a Fields scanner for every subsequent scan.
            scan_type = self.scan_type
            if scan_type == ScanType.FieldsNoOrder:
                scan_type = ScanType.Fields
                kw['no_field_order'] = True
            func = {
                ScanType.Filename: lambda f: engine.getwords(rem_file_ext(f.name)),
                ScanType.Fields: lambda f: engine.getfields(rem_file_ext(f.name)),
                ScanType.Tag: lambda f: [engine.getwords(str(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
            }[scan_type]
            for f in j.iter_with_progress(files, tr("Read metadata of %d/%d files")):
                logging.debug("Reading metadata of %s", f.path)
                f.words = func(f)
            return engine.getmatches(files, j=j, **kw)
    
    @staticmethod
    def _key_func(dupe):
        # Sort key handed to group.prioritize(): the negated size, so that bigger files get a
        # smaller key.
        return -dupe.size
    
    @staticmethod
    def _tie_breaker(ref, dupe):
        # Tie breaker handed to group.prioritize(). Compares the extension-less, lower-cased names.
        # NOTE(review): presumably True means "dupe should take ref's place" -- confirm against
        # the prioritize() implementation.
        refname = rem_file_ext(ref.name).lower()
        dupename = rem_file_ext(dupe.name).lower()
        if 'copy' in dupename:
            return False
        if 'copy' in refname:
            return True
        if is_same_with_digit(dupename, refname):
            return False
        if is_same_with_digit(refname, dupename):
            return True
        return len(dupe.path) > len(ref.path)
    
    def get_dupe_groups(self, files, j=job.nulljob):
        # Scans `files` and returns the list of prioritized duplicate groups.
        # Side effects: sets `is_ref = False` on files lacking that attribute, and updates
        # `self.discarded_file_count`.
        j = j.start_subjob([8, 2])
        for f in (f for f in files if not hasattr(f, 'is_ref')):
            f.is_ref = False
        files = remove_dupe_paths(files)
        logging.info("Getting matches. Scan type: %d", self.scan_type)
        matches = self._getmatches(files, j)
        logging.info('Found %d matches', len(matches))
        j.set_progress(100, tr("Removing false matches"))
        if self.scan_type == ScanType.Folders and matches:
            # Drop matches where both sides are sub-paths of another matched path.
            # NOTE(review): relies on the path type's `in` test to tell whether `p` lies under
            # `last_parent_path` -- confirm against the path class.
            allpath = {m.first.path for m in matches}
            allpath |= {m.second.path for m in matches}
            sortedpaths = sorted(allpath)
            toremove = set()
            last_parent_path = sortedpaths[0]
            for p in sortedpaths[1:]:
                if p in last_parent_path:
                    toremove.add(p)
                else:
                    last_parent_path = p
            matches = [m for m in matches if m.first.path not in toremove or m.second.path not in toremove]
        if not self.mix_file_kind:
            matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
        # Files may have disappeared since they were listed.
        matches = [m for m in matches if io.exists(m.first.path) and io.exists(m.second.path)]
        if self.ignore_list:
            j = j.start_subjob(2)
            iter_matches = j.iter_with_progress(matches, tr("Processed %d/%d matches against the ignore list"))
            matches = [m for m in iter_matches 
                if not self.ignore_list.AreIgnored(str(m.first.path), str(m.second.path))]
        logging.info('Grouping matches')
        groups = engine.get_groups(matches, j)
        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
        if self.scan_type in {ScanType.Filename, ScanType.Fields, ScanType.FieldsNoOrder, ScanType.Tag}:
            self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
        else:
            # Ticket #195
            # To speed up the scan, we don't bother comparing contents of files that are both ref
            # files. However, this messes up "discarded" counting because there's a missing match
            # in cases where we end up with a dupe group anyway (with a non-ref file). Because it's
            # impossible to have discarded matches in exact dupe scans, we simply set it at 0, thus
            # bypassing our tricky problem.
            # Also, although ScanType.FuzzyBlock is not always doing exact comparisons, we also
            # bypass ref comparison, thus messing up with our "discarded" count. So we're
            # effectively disabling the "discarded" feature in PE, but it's better than falsely
            # reporting discarded matches.
            self.discarded_file_count = 0
        # A group made only of ref files holds nothing to act upon.
        groups = [g for g in groups if any(not f.is_ref for f in g)]
        logging.info('Created %d groups', len(groups))
        j.set_progress(100, tr("Doing group prioritization"))
        for g in groups:
            g.prioritize(self._key_func, self._tie_breaker)
        return groups
    
    # Default scan settings.
    match_similar_words  = False
    min_match_percentage = 80
    mix_file_kind        = True # when False, only files with the same extension can match
    scan_type            = ScanType.Filename
    scanned_tags         = {'artist', 'title'} # subset of SCANNABLE_TAGS read by Tag scans
    size_threshold       = 0 # when non-zero, files smaller than this are left out of the scan
    word_weighting       = False
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`# Created By: Virgil Dupras`
			`# Created On: 2006/03/03`
Changed copyright year to 2012 2012-03-15 18:28:40 +00:00			`# Copyright 2012 Hardcoded Software (http://www.hardcoded.net)`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`#`
Re-licensed to BSD 2010-09-30 10:17:41 +00:00			`# This software is licensed under the "BSD" License as described in the "LICENSE" file,`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`# which should be included with this package. The terms are also available at`
Re-licensed to BSD 2010-09-30 10:17:41 +00:00			`# http://www.hardcoded.net/licenses/bsd_license`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00
			`import logging`
[#15 state:fixed] Improved tie breaker in cases where filenames end with digits inside brackets. 2010-08-14 17:32:09 +00:00			`import re`
Ignore files in the scanning list that point to the same path as another file in the scanning list. 2012-02-21 16:14:12 +00:00			`import os.path as op`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00
Adapted to the job-related code moving to the 'jobprogress' package. 2010-11-20 11:42:15 +00:00			`from jobprogress import job`
Changed references to what has already been moved from hsutil to hscommon (io, path, testutil). 2011-01-11 10:59:53 +00:00			`from hscommon import io`
Replaced dependencies from hsutil to hscommon. 2011-01-11 12:36:05 +00:00			`from hscommon.util import dedupe, rem_file_ext, get_file_ext`
[#32] Internationalized the core and localized it to french. 2011-01-18 16:33:33 +00:00			`from hscommon.trans import tr`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00
			`from . import engine`
			`from .ignore import IgnoreList`

Added the EXIF Timestamp scan type in dgpe. --HG-- rename : core_pe/matchbase.py => core_pe/matchblock.py 2011-04-21 15:17:19 +00:00			`# It's quite ugly to have scan types from all editions all put in the same class, but because there's`
			`# there will be some nasty bugs popping up (ScanType is used in core when in should exclusively be`
			`# used in core_*). One day I'll clean this up.`

Enum-ified Scan Type constants, looks nicer. 2010-08-14 17:52:23 +00:00			`class ScanType:`
			`Filename = 0`
			`Fields = 1`
			`FieldsNoOrder = 2`
			`Tag = 3`
[#89 state:fixed] Added a Folders scan type in dgse. --HG-- rename : core_se/tests/fs_test.py => core/tests/fs_test.py 2011-04-12 11:22:29 +00:00			`Folders = 4`
Enum-ified Scan Type constants, looks nicer. 2010-08-14 17:52:23 +00:00			`Contents = 5`
			`ContentsAudio = 6`
Added the EXIF Timestamp scan type in dgpe. --HG-- rename : core_pe/matchbase.py => core_pe/matchblock.py 2011-04-21 15:17:19 +00:00
			`#PE`
			`FuzzyBlock = 10`
			`ExifTimestamp = 11`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00
			`SCANNABLE_TAGS = ['track', 'artist', 'album', 'title', 'genre', 'year']`

[#15 state:fixed] Improved tie breaker in cases where filenames end with digits inside brackets. 2010-08-14 17:32:09 +00:00			`RE_DIGIT_ENDING = re.compile(r'\d+\|\(\d+\)\|\[\d+\]\|{\d+}')`

			`def is_same_with_digit(name, refname):`
			`# Returns True if name is the same as refname, but with digits (with brackets or not) at the end`
			`if not name.startswith(refname):`
			`return False`
			`end = name[len(refname):].strip()`
			`return RE_DIGIT_ENDING.match(end) is not None`

Ignore files in the scanning list that point to the same path as another file in the scanning list. 2012-02-21 16:14:12 +00:00			`def remove_dupe_paths(files):`
			`# Returns files with duplicates-by-path removed. Files with the exact same path are considered`
			`# duplicates and only the first file to have a path is kept. In certain cases, we have files`
			`# that have the same path, but not with the same case, that's why we normalize. However, we also`
			`# have case-sensitive filesystems, and in those, we don't want to falsely remove duplicates,`
			# that's why we have a `samefile` mechanism.
			`result = []`
			`path2file = {}`
			`for f in files:`
			`normalized = str(f.path).lower()`
			`if normalized in path2file:`
			`try:`
			`if op.samefile(normalized, str(path2file[normalized].path)):`
			`continue # same file, it's a dupe`
			`else:`
			`pass # We don't treat them as dupes`
			`except OSError:`
			`continue # File doesn't exist? Well, treat them as dupes`
			`else:`
			`path2file[normalized] = f`
			`result.append(f)`
			`return result`

Changed references to what has already been moved from hsutil to hscommon (io, path, testutil). 2011-01-11 10:59:53 +00:00			`class Scanner:`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`def __init__(self):`
			`self.ignore_list = IgnoreList()`
			`self.discarded_file_count = 0`

			`def _getmatches(self, files, j):`
			`if self.size_threshold:`
			`j = j.start_subjob([2, 8])`
[#32] Internationalized the core and localized it to french. 2011-01-18 16:33:33 +00:00			`for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`f.size # pre-read, makes a smoother progress if read here (especially for bundles)`
			`files = [f for f in files if f.size >= self.size_threshold]`
[#89 state:fixed] Added a Folders scan type in dgse. --HG-- rename : core_se/tests/fs_test.py => core/tests/fs_test.py 2011-04-12 11:22:29 +00:00			`if self.scan_type in {ScanType.Contents, ScanType.ContentsAudio, ScanType.Folders}:`
			`sizeattr = 'audiosize' if self.scan_type == ScanType.ContentsAudio else 'size'`
Enum-ified Scan Type constants, looks nicer. 2010-08-14 17:52:23 +00:00			`return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==ScanType.ContentsAudio, j=j)`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`else:`
			`j = j.start_subjob([2, 8])`
			`kw = {}`
			`kw['match_similar_words'] = self.match_similar_words`
			`kw['weight_words'] = self.word_weighting`
			`kw['min_match_percentage'] = self.min_match_percentage`
Enum-ified Scan Type constants, looks nicer. 2010-08-14 17:52:23 +00:00			`if self.scan_type == ScanType.FieldsNoOrder:`
			`self.scan_type = ScanType.Fields`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`kw['no_field_order'] = True`
			`func = {`
Enum-ified Scan Type constants, looks nicer. 2010-08-14 17:52:23 +00:00			`ScanType.Filename: lambda f: engine.getwords(rem_file_ext(f.name)),`
			`ScanType.Fields: lambda f: engine.getfields(rem_file_ext(f.name)),`
			`ScanType.Tag: lambda f: [engine.getwords(str(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`}[self.scan_type]`
[#32] Internationalized the core and localized it to french. 2011-01-18 16:33:33 +00:00			`for f in j.iter_with_progress(files, tr("Read metadata of %d/%d files")):`
[#132 state:fixed] Added a debug mode preference as well as extra debug loggings. 2011-01-26 11:50:44 +00:00			`logging.debug("Reading metadata of {}".format(str(f.path)))`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`f.words = func(f)`
			`return engine.getmatches(files, j=j, **kw)`

			`@staticmethod`
			`def _key_func(dupe):`
Don't allow dupes from ref folders to step down from their ref position during reprioritization. 2011-09-23 17:14:57 +00:00			`return -dupe.size`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00
			`@staticmethod`
			`def _tie_breaker(ref, dupe):`
			`refname = rem_file_ext(ref.name).lower()`
			`dupename = rem_file_ext(dupe.name).lower()`
			`if 'copy' in dupename:`
			`return False`
			`if 'copy' in refname:`
			`return True`
[#15 state:fixed] Improved tie breaker in cases where filenames end with digits inside brackets. 2010-08-14 17:32:09 +00:00			`if is_same_with_digit(dupename, refname):`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`return False`
[#15 state:fixed] Improved tie breaker in cases where filenames end with digits inside brackets. 2010-08-14 17:32:09 +00:00			`if is_same_with_digit(refname, dupename):`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`return True`
			`return len(dupe.path) > len(ref.path)`

Ignore files in the scanning list that point to the same path as another file in the scanning list. 2012-02-21 16:14:12 +00:00			`def get_dupe_groups(self, files, j=job.nulljob):`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`j = j.start_subjob([8, 2])`
Ignore files in the scanning list that point to the same path as another file in the scanning list. 2012-02-21 16:14:12 +00:00			`for f in (f for f in files if not hasattr(f, 'is_ref')):`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`f.is_ref = False`
Ignore files in the scanning list that point to the same path as another file in the scanning list. 2012-02-21 16:14:12 +00:00			`files = remove_dupe_paths(files)`
[#89 state:fixed] Added a Folders scan type in dgse. --HG-- rename : core_se/tests/fs_test.py => core/tests/fs_test.py 2011-04-12 11:22:29 +00:00			`logging.info("Getting matches. Scan type: %d", self.scan_type)`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`matches = self._getmatches(files, j)`
			`logging.info('Found %d matches' % len(matches))`
[#32] Internationalized the core and localized it to french. 2011-01-18 16:33:33 +00:00			`j.set_progress(100, tr("Removing false matches"))`
[#89 state:fixed] Added a Folders scan type in dgse. --HG-- rename : core_se/tests/fs_test.py => core/tests/fs_test.py 2011-04-12 11:22:29 +00:00			`if self.scan_type == ScanType.Folders and matches:`
			`allpath = {m.first.path for m in matches}`
			`allpath \|= {m.second.path for m in matches}`
			`sortedpaths = sorted(allpath)`
			`toremove = set()`
			`last_parent_path = sortedpaths[0]`
			`for p in sortedpaths[1:]:`
			`if p in last_parent_path:`
			`toremove.add(p)`
			`else:`
			`last_parent_path = p`
			`matches = [m for m in matches if m.first.path not in toremove or m.second.path not in toremove]`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`if not self.mix_file_kind:`
			`matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]`
			`matches = [m for m in matches if io.exists(m.first.path) and io.exists(m.second.path)]`
			`if self.ignore_list:`
			`j = j.start_subjob(2)`
[#32] Internationalized the core and localized it to french. 2011-01-18 16:33:33 +00:00			`iter_matches = j.iter_with_progress(matches, tr("Processed %d/%d matches against the ignore list"))`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`matches = [m for m in iter_matches`
			`if not self.ignore_list.AreIgnored(str(m.first.path), str(m.second.path))]`
			`logging.info('Grouping matches')`
			`groups = engine.get_groups(matches, j)`
			`matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])`
[#195 state:fixed] Fixed bug where there would be a false reporting of discarded matches. 2012-02-26 16:18:29 +00:00			`if self.scan_type in {ScanType.Filename, ScanType.Fields, ScanType.FieldsNoOrder, ScanType.Tag}:`
			`self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)`
			`else:`
			`# Ticket #195`
			`# To speed up the scan, we don't bother comparing contents of files that are both ref`
			`# files. However, this messes up "discarded" counting because there's a missing match`
			`# in cases where we end up with a dupe group anyway (with a non-ref file). Because it's`
			`# impossible to have discarded matches in exact dupe scans, we simply set it at 0, thus`
			`# bypassing our tricky problem.`
			`# Also, although ScanType.FuzzyBlock is not always doing exact comparisons, we also`
			`# bypass ref comparison, thus messing up with our "discarded" count. So we're`
			`# effectively disabling the "discarded" feature in PE, but it's better than falsely`
			`# reporting discarded matches.`
			`self.discarded_file_count = 0`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`groups = [g for g in groups if any(not f.is_ref for f in g)]`
			`logging.info('Created %d groups' % len(groups))`
[#32] Internationalized the core and localized it to french. 2011-01-18 16:33:33 +00:00			`j.set_progress(100, tr("Doing group prioritization"))`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`for g in groups:`
			`g.prioritize(self._key_func, self._tie_breaker)`
			`return groups`

			`match_similar_words = False`
			`min_match_percentage = 80`
			`mix_file_kind = True`
Enum-ified Scan Type constants, looks nicer. 2010-08-14 17:52:23 +00:00			`scan_type = ScanType.Filename`
Ignore files in the scanning list that point to the same path as another file in the scanning list. 2012-02-21 16:14:12 +00:00			`scanned_tags = {'artist', 'title'}`
Converted to py3k. There's probably some bugs still. So far, I managed to run dupeGuru SE under pyobjc and qt. 2010-08-11 14:39:06 +00:00			`size_threshold = 0`
			`word_weighting = False`