
[#89 state:fixed] Added a Folders scan type in dgse.

--HG--
rename : core_se/tests/fs_test.py => core/tests/fs_test.py
Virgil Dupras
2011-04-12 13:22:29 +02:00
parent 0fea59007c
commit 279d44b7f3
23 changed files with 292 additions and 154 deletions


@@ -368,7 +368,10 @@ class DupeGuru(RegistrableApplication, Broadcaster):
def start_scanning(self):
def do(j):
j.set_progress(0, tr("Collecting files to scan"))
files = list(self.directories.get_files())
if self.scanner.scan_type == scanner.ScanType.Folders:
files = list(self.directories.get_folders())
else:
files = list(self.directories.get_files())
if self.options['ignore_hardlink_matches']:
files = self._remove_hardlink_dupes(files)
logging.info('Scanning %d files' % len(files))
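
The hunk above is the whole integration point: a Folders scan swaps the file-collection step for get_folders() and leaves the rest of start_scanning untouched. A minimal standalone sketch of that dispatch, reusing the ScanType values from the Scanner hunk further down (collect_scan_targets and the bare ScanType class here are illustrative stand-ins, not the real dupeGuru API):

class ScanType:
    # values as listed in the ScanType hunk below
    Fields = 1
    FieldsNoOrder = 2
    Tag = 3
    Folders = 4
    Contents = 5
    ContentsAudio = 6

def collect_scan_targets(directories, scan_type):
    """Folders scans feed fs.Folder objects into the pipeline; every other
    scan type keeps feeding plain files. Folder subclasses File, so the
    later steps (hardlink filtering, matching) need no changes."""
    if scan_type == ScanType.Folders:
        return list(directories.get_folders())
    return list(directories.get_files())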


@@ -15,9 +15,10 @@ from hscommon.util import FileOrPath
from . import fs
(STATE_NORMAL,
STATE_REFERENCE,
STATE_EXCLUDED) = range(3)
class DirectoryState:
Normal = 0
Reference = 1
Excluded = 2
class AlreadyThereError(Exception):
"""The path being added is already in the directory list"""
@@ -51,11 +52,11 @@ class Directories:
def _default_state_for_path(self, path):
# Override this in subclasses to specify the state of some special folders.
if path[-1].startswith('.'): # hidden
return STATE_EXCLUDED
return DirectoryState.Excluded
def _get_files(self, from_path):
state = self.get_state(from_path)
if state == STATE_EXCLUDED:
if state == DirectoryState.Excluded:
# Recursively getting files from folders with lots of subfolders is expensive. However, there
# might be a subfolder in this path that is not excluded. What we want to do is skim
# through self.states and see if we must continue, or whether we can stop right here to save time
@@ -63,11 +64,11 @@ class Directories:
return
try:
filepaths = set()
if state != STATE_EXCLUDED:
if state != DirectoryState.Excluded:
found_files = fs.get_files(from_path, fileclasses=self.fileclasses)
logging.debug("Collected {} files in folder {}".format(len(found_files), str(from_path)))
logging.debug("Collected %d files in folder %s", len(found_files), str(from_path))
for file in found_files:
file.is_ref = state == STATE_REFERENCE
file.is_ref = state == DirectoryState.Reference
filepaths.add(file.path)
yield file
subpaths = [from_path + name for name in io.listdir(from_path)]
@@ -79,6 +80,18 @@ class Directories:
except (EnvironmentError, fs.InvalidPath):
pass
def _get_folders(self, from_folder):
state = self.get_state(from_folder.path)
try:
for subfolder in from_folder.subfolders:
for folder in self._get_folders(subfolder):
yield folder
if state != DirectoryState.Excluded:
from_folder.is_ref = state == DirectoryState.Reference
yield from_folder
except (EnvironmentError, fs.InvalidPath):
pass
#---Public
def add_path(self, path):
"""Adds 'path' to self, if not already there.
@@ -113,6 +126,16 @@ class Directories:
for file in self._get_files(path):
yield file
def get_folders(self):
"""Returns a list of all folders that are not excluded.
Returned folders also have their 'is_ref' attr set.
"""
for path in self._dirs:
from_folder = fs.Folder(path)
for folder in self._get_folders(from_folder):
yield folder
def get_state(self, path):
"""Returns the state of 'path' (One of the STATE_* const.)
"""
@@ -125,7 +148,7 @@ class Directories:
if parent in self:
return self.get_state(parent)
else:
return STATE_NORMAL
return DirectoryState.Normal
def has_any_file(self):
try:
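
The new get_folders()/_get_folders() pair wraps each registered root in an fs.Folder and walks it depth-first: subfolders are yielded before their parent, folders whose state is Excluded are skipped but still descended into (a child may have been set back to Normal or Reference), and everything yielded gets its is_ref flag from the state. A self-contained sketch of the same traversal over pathlib paths; this is an illustration of the idea only, not the hscommon Path/io API the module actually uses, and iter_folders/state_of are hypothetical names:

from pathlib import Path

# Mirrors DirectoryState above.
NORMAL, REFERENCE, EXCLUDED = 0, 1, 2

def iter_folders(root, state_of):
    """Yield (path, is_ref) for every non-excluded folder under root,
    children before parents."""
    subdirs = (p for p in root.iterdir() if p.is_dir() and not p.is_symlink())
    for sub in sorted(subdirs):
        yield from iter_folders(sub, state_of)
    state = state_of(root)
    if state != EXCLUDED:
        yield root, state == REFERENCE

# Example: exclude hidden folders, as _default_state_for_path does.
# folders = list(iter_folders(Path('.'), lambda p: EXCLUDED if p.name.startswith('.') else NORMAL))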


@@ -63,6 +63,9 @@ class File:
self._md5partial_offset = 0x4000 #16Kb
self._md5partial_size = 0x4000 #16Kb
def __repr__(self):
return "<{} {}>".format(self.__class__.__name__, str(self.path))
def __getattr__(self, attrname):
# Only called when attr is not there
if attrname in self.INITIAL_INFO:
@@ -147,6 +150,49 @@ class File:
return self.path[-1]
class Folder(File):
"""A wrapper around a folder path.
It has the size/md5 info of a File, but its values are the sums of its subitems.
"""
def __init__(self, path):
File.__init__(self, path)
self._subfolders = None
def _all_items(self):
folders = self.subfolders
files = get_files(self.path)
return folders + files
def _read_info(self, field):
if field in {'size', 'mtime'}:
size = sum((f.size for f in self._all_items()), 0)
self.size = size
stats = io.stat(self.path)
self.mtime = nonone(stats.st_mtime, 0)
elif field in {'md5', 'md5partial'}:
# What's sensitive here is that we must make sure that subfiles'
# md5s are always added up in the same order, but we also want a
# different md5 if a file gets moved to a different subdirectory.
def get_dir_md5_concat():
items = self._all_items()
items.sort(key=lambda f:f.path)
md5s = [getattr(f, field) for f in items]
return b''.join(md5s)
md5 = hashlib.md5(get_dir_md5_concat())
digest = md5.digest()
setattr(self, field, digest)
@property
def subfolders(self):
if self._subfolders is None:
subpaths = [self.path + name for name in io.listdir(self.path)]
subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p)]
self._subfolders = [Folder(p) for p in subfolders]
return self._subfolders
def get_file(path, fileclasses=[File]):
for fileclass in fileclasses:
if fileclass.can_handle(path):
@@ -172,12 +218,3 @@ def get_files(path, fileclasses=[File]):
return result
except EnvironmentError:
raise InvalidPath(path)
def get_all_files(path, fileclasses=[File]):
files = get_files(path, fileclasses=fileclasses)
filepaths = set(f.path for f in files)
subpaths = [path + name for name in io.listdir(path)]
# it's possible that a folder (bundle) gets into the file list. in that case, we don't want to recurse into it
subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p) and p not in filepaths]
subfiles = flatten(get_all_files(subpath, fileclasses=fileclasses) for subpath in subfolders)
return subfiles + files
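
The heart of the new Folder class is _read_info: a folder's size is the sum of its items' sizes, and its md5 is the md5 of its items' digests concatenated in path order, so the digest is stable no matter how the directory is listed but changes when a file moves to a different subfolder; subfolders contribute their own aggregated digest, which makes the definition recursive. A standalone sketch of that aggregation with plain hashlib/pathlib (folder_md5 and file_md5 are hypothetical helpers, not the fs module API):

import hashlib
from pathlib import Path

def file_md5(path):
    return hashlib.md5(path.read_bytes()).digest()

def folder_md5(folder):
    """md5 of the concatenated digests of all direct children, sorted by
    path; subfolders contribute their own folder_md5, so the result is
    recursive and independent of listing order."""
    digests = []
    for child in sorted(folder.iterdir()):
        if child.is_dir() and not child.is_symlink():
            digests.append(folder_md5(child))
        elif child.is_file():
            digests.append(file_md5(child))
    return hashlib.md5(b''.join(digests)).digest()

# Example: folder_md5(Path('some_dir')) follows the same ordering that fs_test.py below checks by hand.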


@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2010-02-06
# Copyright 2011 Hardcoded Software (http://www.hardcoded.net)
@@ -9,10 +8,10 @@
from hscommon.gui.tree import Tree, Node
from ..directories import STATE_NORMAL, STATE_REFERENCE, STATE_EXCLUDED
from ..directories import DirectoryState
from .base import GUIObject
STATE_ORDER = [STATE_NORMAL, STATE_REFERENCE, STATE_EXCLUDED]
STATE_ORDER = [DirectoryState.Normal, DirectoryState.Reference, DirectoryState.Excluded]
# Lazily loads children
class DirectoryNode(Node):


@@ -22,7 +22,7 @@ class ScanType:
Fields = 1
FieldsNoOrder = 2
Tag = 3
# number 4 is obsolete
Folders = 4
Contents = 5
ContentsAudio = 6
@@ -48,8 +48,8 @@ class Scanner:
for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
f.size # pre-read, makes a smoother progress if read here (especially for bundles)
files = [f for f in files if f.size >= self.size_threshold]
if self.scan_type in (ScanType.Contents, ScanType.ContentsAudio):
sizeattr = 'size' if self.scan_type == ScanType.Contents else 'audiosize'
if self.scan_type in {ScanType.Contents, ScanType.ContentsAudio, ScanType.Folders}:
sizeattr = 'audiosize' if self.scan_type == ScanType.ContentsAudio else 'size'
return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==ScanType.ContentsAudio, j=j)
else:
j = j.start_subjob([2, 8])
@@ -92,10 +92,22 @@ class Scanner:
j = j.start_subjob([8, 2])
for f in [f for f in files if not hasattr(f, 'is_ref')]:
f.is_ref = False
logging.info('Getting matches')
logging.info("Getting matches. Scan type: %d", self.scan_type)
matches = self._getmatches(files, j)
logging.info('Found %d matches' % len(matches))
j.set_progress(100, tr("Removing false matches"))
if self.scan_type == ScanType.Folders and matches:
allpath = {m.first.path for m in matches}
allpath |= {m.second.path for m in matches}
sortedpaths = sorted(allpath)
toremove = set()
last_parent_path = sortedpaths[0]
for p in sortedpaths[1:]:
if p in last_parent_path:
toremove.add(p)
else:
last_parent_path = p
matches = [m for m in matches if m.first.path not in toremove or m.second.path not in toremove]
if not self.mix_file_kind:
matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
matches = [m for m in matches if io.exists(m.first.path) and io.exists(m.second.path)]
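
This post-processing block is what keeps a Folders scan from reporting a subfolder pair when the parent folders already match: all matched paths are sorted, any path sitting under a previously kept path is marked (p in last_parent_path relies on hscommon Path's "is under" containment), and a match is then dropped only when both of its sides are marked, so a standalone folder that happens to match someone's subfolder survives, as the scanner test at the end of this page checks. A self-contained sketch of the same pruning with explicit parent checks (prune_subfolder_matches is a hypothetical name; matches are plain path pairs here):

from pathlib import PurePosixPath

def prune_subfolder_matches(matches):
    """matches: list of (path_a, path_b) folder pairs. Drop a pair only when
    both of its folders lie inside folders that are part of a kept match."""
    allpaths = sorted({p for pair in matches for p in pair})
    covered = set()
    if allpaths:
        last_parent = allpaths[0]
        for p in allpaths[1:]:
            if last_parent in p.parents:  # p is under the last kept path
                covered.add(p)
            else:
                last_parent = p
    return [pair for pair in matches if pair[0] not in covered or pair[1] not in covered]

# With the paths from the test below, only the top-level pair survives:
# prune_subfolder_matches([(PurePosixPath('/topf1'), PurePosixPath('/topf2')),
#                          (PurePosixPath('/topf1/sub'), PurePosixPath('/topf2/sub'))])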


@@ -122,52 +122,52 @@ def test_states():
d = Directories()
p = testpath + 'onefile'
d.add_path(p)
eq_(STATE_NORMAL,d.get_state(p))
d.set_state(p,STATE_REFERENCE)
eq_(STATE_REFERENCE,d.get_state(p))
eq_(STATE_REFERENCE,d.get_state(p + 'dir1'))
eq_(DirectoryState.Normal, d.get_state(p))
d.set_state(p, DirectoryState.Reference)
eq_(DirectoryState.Reference, d.get_state(p))
eq_(DirectoryState.Reference, d.get_state(p + 'dir1'))
eq_(1,len(d.states))
eq_(p,list(d.states.keys())[0])
eq_(STATE_REFERENCE,d.states[p])
eq_(DirectoryState.Reference, d.states[p])
def test_get_state_with_path_not_there():
# When the path's not there, just return STATE_NORMAL
# When the path's not there, just return DirectoryState.Normal
d = Directories()
d.add_path(testpath + 'onefile')
eq_(d.get_state(testpath), STATE_NORMAL)
eq_(d.get_state(testpath), DirectoryState.Normal)
def test_states_remain_when_larger_directory_eat_smaller_ones():
d = Directories()
p = testpath + 'onefile'
d.add_path(p)
d.set_state(p,STATE_EXCLUDED)
d.set_state(p, DirectoryState.Excluded)
d.add_path(testpath)
d.set_state(testpath,STATE_REFERENCE)
eq_(STATE_EXCLUDED,d.get_state(p))
eq_(STATE_EXCLUDED,d.get_state(p + 'dir1'))
eq_(STATE_REFERENCE,d.get_state(testpath))
d.set_state(testpath, DirectoryState.Reference)
eq_(DirectoryState.Excluded, d.get_state(p))
eq_(DirectoryState.Excluded, d.get_state(p + 'dir1'))
eq_(DirectoryState.Reference, d.get_state(testpath))
def test_set_state_keep_state_dict_size_to_minimum():
d = Directories()
p = testpath + 'fs'
d.add_path(p)
d.set_state(p,STATE_REFERENCE)
d.set_state(p + 'dir1',STATE_REFERENCE)
d.set_state(p, DirectoryState.Reference)
d.set_state(p + 'dir1', DirectoryState.Reference)
eq_(1,len(d.states))
eq_(STATE_REFERENCE,d.get_state(p + 'dir1'))
d.set_state(p + 'dir1',STATE_NORMAL)
eq_(DirectoryState.Reference, d.get_state(p + 'dir1'))
d.set_state(p + 'dir1', DirectoryState.Normal)
eq_(2,len(d.states))
eq_(STATE_NORMAL,d.get_state(p + 'dir1'))
d.set_state(p + 'dir1',STATE_REFERENCE)
eq_(DirectoryState.Normal, d.get_state(p + 'dir1'))
d.set_state(p + 'dir1', DirectoryState.Reference)
eq_(1,len(d.states))
eq_(STATE_REFERENCE,d.get_state(p + 'dir1'))
eq_(DirectoryState.Reference, d.get_state(p + 'dir1'))
def test_get_files():
d = Directories()
p = testpath + 'fs'
d.add_path(p)
d.set_state(p + 'dir1',STATE_REFERENCE)
d.set_state(p + 'dir2',STATE_EXCLUDED)
d.set_state(p + 'dir1', DirectoryState.Reference)
d.set_state(p + 'dir2', DirectoryState.Excluded)
files = list(d.get_files())
eq_(5, len(files))
for f in files:
@@ -176,11 +176,26 @@ def test_get_files():
else:
assert not f.is_ref
def test_get_folders():
d = Directories()
p = testpath + 'fs'
d.add_path(p)
d.set_state(p + 'dir1', DirectoryState.Reference)
d.set_state(p + 'dir2', DirectoryState.Excluded)
folders = list(d.get_folders())
eq_(len(folders), 3)
ref = [f for f in folders if f.is_ref]
not_ref = [f for f in folders if not f.is_ref]
eq_(len(ref), 1)
eq_(ref[0].path, p + 'dir1')
eq_(len(not_ref), 2)
eq_(ref[0].size, 1)
def test_get_files_with_inherited_exclusion():
d = Directories()
p = testpath + 'onefile'
d.add_path(p)
d.set_state(p,STATE_EXCLUDED)
d.set_state(p, DirectoryState.Excluded)
eq_([], list(d.get_files()))
def test_save_and_load(tmpdir):
@@ -192,14 +207,14 @@ def test_save_and_load(tmpdir):
io.mkdir(p2)
d1.add_path(p1)
d1.add_path(p2)
d1.set_state(p1, STATE_REFERENCE)
d1.set_state(p1 + 'dir1',STATE_EXCLUDED)
d1.set_state(p1, DirectoryState.Reference)
d1.set_state(p1 + 'dir1', DirectoryState.Excluded)
tmpxml = str(tmpdir.join('directories_testunit.xml'))
d1.save_to_file(tmpxml)
d2.load_from_file(tmpxml)
eq_(2, len(d2))
eq_(STATE_REFERENCE,d2.get_state(p1))
eq_(STATE_EXCLUDED,d2.get_state(p1 + 'dir1'))
eq_(DirectoryState.Reference, d2.get_state(p1))
eq_(DirectoryState.Excluded, d2.get_state(p1 + 'dir1'))
def test_invalid_path():
d = Directories()
@@ -211,7 +226,7 @@ def test_invalid_path():
def test_set_state_on_invalid_path():
d = Directories()
try:
d.set_state(Path('foobar',),STATE_NORMAL)
d.set_state(Path('foobar',), DirectoryState.Normal)
except LookupError:
assert False
@@ -237,7 +252,7 @@ def test_unicode_save(tmpdir):
io.mkdir(p1)
io.mkdir(p1 + 'foo\xe9')
d.add_path(p1)
d.set_state(p1 + 'foo\xe9', STATE_EXCLUDED)
d.set_state(p1 + 'foo\xe9', DirectoryState.Excluded)
tmpxml = str(tmpdir.join('directories_testunit.xml'))
try:
d.save_to_file(tmpxml)
@@ -268,17 +283,17 @@ def test_get_state_returns_excluded_by_default_for_hidden_directories(tmpdir):
hidden_dir_path = p + '.foo'
io.mkdir(p + '.foo')
d.add_path(p)
eq_(d.get_state(hidden_dir_path), STATE_EXCLUDED)
eq_(d.get_state(hidden_dir_path), DirectoryState.Excluded)
# But it can be overriden
d.set_state(hidden_dir_path, STATE_NORMAL)
eq_(d.get_state(hidden_dir_path), STATE_NORMAL)
d.set_state(hidden_dir_path, DirectoryState.Normal)
eq_(d.get_state(hidden_dir_path), DirectoryState.Normal)
def test_default_path_state_override(tmpdir):
# It's possible for a subclass to override the default state of a path
class MyDirectories(Directories):
def _default_state_for_path(self, path):
if 'foobar' in path:
return STATE_EXCLUDED
return DirectoryState.Excluded
d = MyDirectories()
p1 = Path(str(tmpdir))
@@ -287,11 +302,11 @@ def test_default_path_state_override(tmpdir):
io.mkdir(p1 + 'foobaz')
io.open(p1 + 'foobaz/somefile', 'w').close()
d.add_path(p1)
eq_(d.get_state(p1 + 'foobaz'), STATE_NORMAL)
eq_(d.get_state(p1 + 'foobar'), STATE_EXCLUDED)
eq_(d.get_state(p1 + 'foobaz'), DirectoryState.Normal)
eq_(d.get_state(p1 + 'foobar'), DirectoryState.Excluded)
eq_(len(list(d.get_files())), 1) # only the 'foobaz' file is there
# However, the default state can be changed
d.set_state(p1 + 'foobar', STATE_NORMAL)
eq_(d.get_state(p1 + 'foobar'), STATE_NORMAL)
d.set_state(p1 + 'foobar', DirectoryState.Normal)
eq_(d.get_state(p1 + 'foobar'), DirectoryState.Normal)
eq_(len(list(d.get_files())), 2)

core/tests/fs_test.py (new file, 45 lines)

@@ -0,0 +1,45 @@
# Created By: Virgil Dupras
# Created On: 2009-10-23
# Copyright 2011 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "BSD" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/bsd_license
import hashlib
from hscommon.path import Path
from hscommon.testutil import eq_
from core.tests.directories_test import create_fake_fs
from .. import fs
def test_size_aggregates_subfiles(tmpdir):
p = create_fake_fs(Path(str(tmpdir)))
b = fs.Folder(p)
eq_(b.size, 12)
def test_md5_aggregate_subfiles_sorted(tmpdir):
# dir.allfiles can return children in any order. Thus, bundle.md5 must aggregate
# all files' md5s it contains, but it must make sure that it does so in the
# same order every time.
p = create_fake_fs(Path(str(tmpdir)))
b = fs.Folder(p)
md51 = fs.File(p + ('dir1', 'file1.test')).md5
md52 = fs.File(p + ('dir2', 'file2.test')).md5
md53 = fs.File(p + ('dir3', 'file3.test')).md5
md54 = fs.File(p + 'file1.test').md5
md55 = fs.File(p + 'file2.test').md5
md56 = fs.File(p + 'file3.test').md5
# The expected md5: folders contribute the md5 of their children's md5s, files their md5 directly
folder_md51 = hashlib.md5(md51).digest()
folder_md52 = hashlib.md5(md52).digest()
folder_md53 = hashlib.md5(md53).digest()
md5 = hashlib.md5(folder_md51+folder_md52+folder_md53+md54+md55+md56)
eq_(b.md5, md5.digest())
def test_has_file_attrs(tmpdir):
# A Folder must behave like a File, so it must have mtime and extension attributes
b = fs.Folder(Path(str(tmpdir)))
assert b.mtime > 0
eq_(b.extension, '')
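
The expected digest in test_md5_aggregate_subfiles_sorted is spelled out by hand: each dirN contributes md5(md5 of its single file), the root files contribute their md5 directly, and the pieces are concatenated in path order (dir1, dir2, dir3, file1.test, file2.test, file3.test). A quick self-contained check of the property the comment above describes, namely that sorting by name makes the aggregate independent of discovery order (the sample digests are made up for illustration):

import hashlib
import random

child_md5s = {'dir1': b'a' * 16, 'dir2': b'b' * 16, 'file1.test': b'c' * 16}

def aggregate(names):
    return hashlib.md5(b''.join(child_md5s[n] for n in sorted(names))).hexdigest()

names = list(child_md5s)
random.shuffle(names)
assert aggregate(names) == aggregate(sorted(child_md5s))  # listing order doesn't matter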


@@ -471,3 +471,27 @@ def test_dont_group_files_that_dont_exist(tmpdir):
s._getmatches = getmatches
assert not s.GetDupeGroups([file1, file2])
def test_folder_scan_exclude_subfolder_matches(fake_fileexists):
# When doing a Folders scan type, don't include matches for folders whose parent folders already
# match.
s = Scanner()
s.scan_type = ScanType.Folders
topf1 = no("top folder 1", size=42)
topf1.md5 = topf1.md5partial = b"some_md5_1"
topf1.path = Path('/topf1')
topf2 = no("top folder 2", size=42)
topf2.md5 = topf2.md5partial = b"some_md5_1"
topf2.path = Path('/topf2')
subf1 = no("sub folder 1", size=41)
subf1.md5 = subf1.md5partial = b"some_md5_2"
subf1.path = Path('/topf1/sub')
subf2 = no("sub folder 2", size=41)
subf2.md5 = subf2.md5partial = b"some_md5_2"
subf2.path = Path('/topf2/sub')
eq_(len(s.GetDupeGroups([topf1, topf2, subf1, subf2])), 1) # only top folders
# however, if another folder matches a subfolder, keep it in the matches
otherf = no("other folder", size=41)
otherf.md5 = otherf.md5partial = b"some_md5_2"
otherf.path = Path('/otherfolder')
eq_(len(s.GetDupeGroups([topf1, topf2, subf1, subf2, otherf])), 2)