
[#89 state:fixed] Added a Folders scan type in dgse.

--HG--
rename : core_se/tests/fs_test.py => core/tests/fs_test.py
Virgil Dupras
2011-04-12 13:22:29 +02:00
parent 0fea59007c
commit 279d44b7f3
23 changed files with 292 additions and 154 deletions


@@ -368,7 +368,10 @@ class DupeGuru(RegistrableApplication, Broadcaster):
def start_scanning(self):
def do(j):
j.set_progress(0, tr("Collecting files to scan"))
files = list(self.directories.get_files())
if self.scanner.scan_type == scanner.ScanType.Folders:
files = list(self.directories.get_folders())
else:
files = list(self.directories.get_files())
if self.options['ignore_hardlink_matches']:
files = self._remove_hardlink_dupes(files)
logging.info('Scanning %d files' % len(files))
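
The hunk above is the whole integration point: a Folders scan swaps the file-collection step for get_folders() and leaves the rest of start_scanning untouched. A minimal standalone sketch of that dispatch, reusing the ScanType values from the Scanner hunk further down (collect_scan_targets and the bare ScanType class here are illustrative stand-ins, not the real dupeGuru API):

class ScanType:
    # values as listed in the ScanType hunk below
    Fields = 1
    FieldsNoOrder = 2
    Tag = 3
    Folders = 4
    Contents = 5
    ContentsAudio = 6

def collect_scan_targets(directories, scan_type):
    """Folders scans feed fs.Folder objects into the pipeline; every other
    scan type keeps feeding plain files. Folder subclasses File, so the
    later steps (hardlink filtering, matching) need no changes."""
    if scan_type == ScanType.Folders:
        return list(directories.get_folders())
    return list(directories.get_files())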


@@ -15,9 +15,10 @@ from hscommon.util import FileOrPath
from . import fs
(STATE_NORMAL,
STATE_REFERENCE,
STATE_EXCLUDED) = range(3)
class DirectoryState:
Normal = 0
Reference = 1
Excluded = 2
class AlreadyThereError(Exception):
"""The path being added is already in the directory list"""
@@ -51,11 +52,11 @@ class Directories:
def _default_state_for_path(self, path):
# Override this in subclasses to specify the state of some special folders.
if path[-1].startswith('.'): # hidden
return STATE_EXCLUDED
return DirectoryState.Excluded
def _get_files(self, from_path):
state = self.get_state(from_path)
if state == STATE_EXCLUDED:
if state == DirectoryState.Excluded:
# Recursively getting files from folders with lots of subfolders is expensive. However, there
# might be a subfolder in this path that is not excluded. What we want to do is skim
# through self.states and see if we must continue, or whether we can stop right here to save time
@@ -63,11 +64,11 @@ class Directories:
return
try:
filepaths = set()
if state != STATE_EXCLUDED:
if state != DirectoryState.Excluded:
found_files = fs.get_files(from_path, fileclasses=self.fileclasses)
logging.debug("Collected {} files in folder {}".format(len(found_files), str(from_path)))
logging.debug("Collected %d files in folder %s", len(found_files), str(from_path))
for file in found_files:
file.is_ref = state == STATE_REFERENCE
file.is_ref = state == DirectoryState.Reference
filepaths.add(file.path)
yield file
subpaths = [from_path + name for name in io.listdir(from_path)]
@@ -79,6 +80,18 @@ class Directories:
except (EnvironmentError, fs.InvalidPath):
pass
def _get_folders(self, from_folder):
state = self.get_state(from_folder.path)
try:
for subfolder in from_folder.subfolders:
for folder in self._get_folders(subfolder):
yield folder
if state != DirectoryState.Excluded:
from_folder.is_ref = state == DirectoryState.Reference
yield from_folder
except (EnvironmentError, fs.InvalidPath):
pass
#---Public
def add_path(self, path):
"""Adds 'path' to self, if not already there.
@@ -113,6 +126,16 @@ class Directories:
for file in self._get_files(path):
yield file
def get_folders(self):
"""Returns a list of all folders that are not excluded.
Returned folders also have their 'is_ref' attr set.
"""
for path in self._dirs:
from_folder = fs.Folder(path)
for folder in self._get_folders(from_folder):
yield folder
def get_state(self, path):
"""Returns the state of 'path' (One of the STATE_* const.)
"""
@@ -125,7 +148,7 @@ class Directories:
if parent in self:
return self.get_state(parent)
else:
return STATE_NORMAL
return DirectoryState.Normal
def has_any_file(self):
try:
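
The new get_folders()/_get_folders() pair wraps each registered root in an fs.Folder and walks it depth-first: subfolders are yielded before their parent, folders whose state is Excluded are skipped but still descended into (a child may have been set back to Normal or Reference), and everything yielded gets its is_ref flag from the state. A self-contained sketch of the same traversal over pathlib paths; this is an illustration of the idea only, not the hscommon Path/io API the module actually uses, and iter_folders/state_of are hypothetical names:

from pathlib import Path

# Mirrors DirectoryState above.
NORMAL, REFERENCE, EXCLUDED = 0, 1, 2

def iter_folders(root, state_of):
    """Yield (path, is_ref) for every non-excluded folder under root,
    children before parents."""
    subdirs = (p for p in root.iterdir() if p.is_dir() and not p.is_symlink())
    for sub in sorted(subdirs):
        yield from iter_folders(sub, state_of)
    state = state_of(root)
    if state != EXCLUDED:
        yield root, state == REFERENCE

# Example: exclude hidden folders, as _default_state_for_path does.
# folders = list(iter_folders(Path('.'), lambda p: EXCLUDED if p.name.startswith('.') else NORMAL))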


@@ -63,6 +63,9 @@ class File:
self._md5partial_offset = 0x4000 #16Kb
self._md5partial_size = 0x4000 #16Kb
def __repr__(self):
return "<{} {}>".format(self.__class__.__name__, str(self.path))
def __getattr__(self, attrname):
# Only called when attr is not there
if attrname in self.INITIAL_INFO:
@@ -147,6 +150,49 @@ class File:
return self.path[-1]
class Folder(File):
"""A wrapper around a folder path.
It has the size/md5 info of a File, but its values are the sums of its subitems.
"""
def __init__(self, path):
File.__init__(self, path)
self._subfolders = None
def _all_items(self):
folders = self.subfolders
files = get_files(self.path)
return folders + files
def _read_info(self, field):
if field in {'size', 'mtime'}:
size = sum((f.size for f in self._all_items()), 0)
self.size = size
stats = io.stat(self.path)
self.mtime = nonone(stats.st_mtime, 0)
elif field in {'md5', 'md5partial'}:
# What's sensitive here is that we must make sure that subfiles'
# md5s are always added up in the same order, but we also want a
# different md5 if a file gets moved to a different subdirectory.
def get_dir_md5_concat():
items = self._all_items()
items.sort(key=lambda f:f.path)
md5s = [getattr(f, field) for f in items]
return b''.join(md5s)
md5 = hashlib.md5(get_dir_md5_concat())
digest = md5.digest()
setattr(self, field, digest)
@property
def subfolders(self):
if self._subfolders is None:
subpaths = [self.path + name for name in io.listdir(self.path)]
subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p)]
self._subfolders = [Folder(p) for p in subfolders]
return self._subfolders
def get_file(path, fileclasses=[File]):
for fileclass in fileclasses:
if fileclass.can_handle(path):
@@ -172,12 +218,3 @@ def get_files(path, fileclasses=[File]):
return result
except EnvironmentError:
raise InvalidPath(path)
def get_all_files(path, fileclasses=[File]):
files = get_files(path, fileclasses=fileclasses)
filepaths = set(f.path for f in files)
subpaths = [path + name for name in io.listdir(path)]
# it's possible that a folder (bundle) gets into the file list. in that case, we don't want to recurse into it
subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p) and p not in filepaths]
subfiles = flatten(get_all_files(subpath, fileclasses=fileclasses) for subpath in subfolders)
return subfiles + files
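
The heart of the new Folder class is _read_info: a folder's size is the sum of its items' sizes, and its md5 is the md5 of its items' digests concatenated in path order, so the digest is stable no matter how the directory is listed but changes when a file moves to a different subfolder; subfolders contribute their own aggregated digest, which makes the definition recursive. A standalone sketch of that aggregation with plain hashlib/pathlib (folder_md5 and file_md5 are hypothetical helpers, not the fs module API):

import hashlib
from pathlib import Path

def file_md5(path):
    return hashlib.md5(path.read_bytes()).digest()

def folder_md5(folder):
    """md5 of the concatenated digests of all direct children, sorted by
    path; subfolders contribute their own folder_md5, so the result is
    recursive and independent of listing order."""
    digests = []
    for child in sorted(folder.iterdir()):
        if child.is_dir() and not child.is_symlink():
            digests.append(folder_md5(child))
        elif child.is_file():
            digests.append(file_md5(child))
    return hashlib.md5(b''.join(digests)).digest()

# Example: folder_md5(Path('some_dir')) follows the same ordering that fs_test.py below checks by hand.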


@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2010-02-06
# Copyright 2011 Hardcoded Software (http://www.hardcoded.net)
@@ -9,10 +8,10 @@
from hscommon.gui.tree import Tree, Node
from ..directories import STATE_NORMAL, STATE_REFERENCE, STATE_EXCLUDED
from ..directories import DirectoryState
from .base import GUIObject
STATE_ORDER = [STATE_NORMAL, STATE_REFERENCE, STATE_EXCLUDED]
STATE_ORDER = [DirectoryState.Normal, DirectoryState.Reference, DirectoryState.Excluded]
# Lazily loads children
class DirectoryNode(Node):


@@ -22,7 +22,7 @@ class ScanType:
Fields = 1
FieldsNoOrder = 2
Tag = 3
# number 4 is obsolete
Folders = 4
Contents = 5
ContentsAudio = 6
@@ -48,8 +48,8 @@ class Scanner:
for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
f.size # pre-read, makes a smoother progress if read here (especially for bundles)
files = [f for f in files if f.size >= self.size_threshold]
if self.scan_type in (ScanType.Contents, ScanType.ContentsAudio):
sizeattr = 'size' if self.scan_type == ScanType.Contents else 'audiosize'
if self.scan_type in {ScanType.Contents, ScanType.ContentsAudio, ScanType.Folders}:
sizeattr = 'audiosize' if self.scan_type == ScanType.ContentsAudio else 'size'
return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==ScanType.ContentsAudio, j=j)
else:
j = j.start_subjob([2, 8])
@@ -92,10 +92,22 @@ class Scanner:
j = j.start_subjob([8, 2])
for f in [f for f in files if not hasattr(f, 'is_ref')]:
f.is_ref = False
logging.info('Getting matches')
logging.info("Getting matches. Scan type: %d", self.scan_type)
matches = self._getmatches(files, j)
logging.info('Found %d matches' % len(matches))
j.set_progress(100, tr("Removing false matches"))
if self.scan_type == ScanType.Folders and matches:
allpath = {m.first.path for m in matches}
allpath |= {m.second.path for m in matches}
sortedpaths = sorted(allpath)
toremove = set()
last_parent_path = sortedpaths[0]
for p in sortedpaths[1:]:
if p in last_parent_path:
toremove.add(p)
else:
last_parent_path = p
matches = [m for m in matches if m.first.path not in toremove or m.second.path not in toremove]
if not self.mix_file_kind:
matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
matches = [m for m in matches if io.exists(m.first.path) and io.exists(m.second.path)]
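
This post-processing block is what keeps a Folders scan from reporting a subfolder pair when the parent folders already match: all matched paths are sorted, any path sitting under a previously kept path is marked (p in last_parent_path relies on hscommon Path's "is under" containment), and a match is then dropped only when both of its sides are marked, so a standalone folder that happens to match someone's subfolder survives, as the scanner test at the end of this page checks. A self-contained sketch of the same pruning with explicit parent checks (prune_subfolder_matches is a hypothetical name; matches are plain path pairs here):

from pathlib import PurePosixPath

def prune_subfolder_matches(matches):
    """matches: list of (path_a, path_b) folder pairs. Drop a pair only when
    both of its folders lie inside folders that are part of a kept match."""
    allpaths = sorted({p for pair in matches for p in pair})
    covered = set()
    if allpaths:
        last_parent = allpaths[0]
        for p in allpaths[1:]:
            if last_parent in p.parents:  # p is under the last kept path
                covered.add(p)
            else:
                last_parent = p
    return [pair for pair in matches if pair[0] not in covered or pair[1] not in covered]

# With the paths from the test below, only the top-level pair survives:
# prune_subfolder_matches([(PurePosixPath('/topf1'), PurePosixPath('/topf2')),
#                          (PurePosixPath('/topf1/sub'), PurePosixPath('/topf2/sub'))])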


@@ -122,52 +122,52 @@ def test_states():
d = Directories()
p = testpath + 'onefile'
d.add_path(p)
eq_(STATE_NORMAL,d.get_state(p))
d.set_state(p,STATE_REFERENCE)
eq_(STATE_REFERENCE,d.get_state(p))
eq_(STATE_REFERENCE,d.get_state(p + 'dir1'))
eq_(DirectoryState.Normal, d.get_state(p))
d.set_state(p, DirectoryState.Reference)
eq_(DirectoryState.Reference, d.get_state(p))
eq_(DirectoryState.Reference, d.get_state(p + 'dir1'))
eq_(1,len(d.states))
eq_(p,list(d.states.keys())[0])
eq_(STATE_REFERENCE,d.states[p])
eq_(DirectoryState.Reference, d.states[p])
def test_get_state_with_path_not_there():
# When the path's not there, just return STATE_NORMAL
# When the path's not there, just return DirectoryState.Normal
d = Directories()
d.add_path(testpath + 'onefile')
eq_(d.get_state(testpath), STATE_NORMAL)
eq_(d.get_state(testpath), DirectoryState.Normal)
def test_states_remain_when_larger_directory_eat_smaller_ones():
d = Directories()
p = testpath + 'onefile'
d.add_path(p)
d.set_state(p,STATE_EXCLUDED)
d.set_state(p, DirectoryState.Excluded)
d.add_path(testpath)
d.set_state(testpath,STATE_REFERENCE)
eq_(STATE_EXCLUDED,d.get_state(p))
eq_(STATE_EXCLUDED,d.get_state(p + 'dir1'))
eq_(STATE_REFERENCE,d.get_state(testpath))
d.set_state(testpath, DirectoryState.Reference)
eq_(DirectoryState.Excluded, d.get_state(p))
eq_(DirectoryState.Excluded, d.get_state(p + 'dir1'))
eq_(DirectoryState.Reference, d.get_state(testpath))
def test_set_state_keep_state_dict_size_to_minimum():
d = Directories()
p = testpath + 'fs'
d.add_path(p)
d.set_state(p,STATE_REFERENCE)
d.set_state(p + 'dir1',STATE_REFERENCE)
d.set_state(p, DirectoryState.Reference)
d.set_state(p + 'dir1', DirectoryState.Reference)
eq_(1,len(d.states))
eq_(STATE_REFERENCE,d.get_state(p + 'dir1'))
d.set_state(p + 'dir1',STATE_NORMAL)
eq_(DirectoryState.Reference, d.get_state(p + 'dir1'))
d.set_state(p + 'dir1', DirectoryState.Normal)
eq_(2,len(d.states))
eq_(STATE_NORMAL,d.get_state(p + 'dir1'))
d.set_state(p + 'dir1',STATE_REFERENCE)
eq_(DirectoryState.Normal, d.get_state(p + 'dir1'))
d.set_state(p + 'dir1', DirectoryState.Reference)
eq_(1,len(d.states))
eq_(STATE_REFERENCE,d.get_state(p + 'dir1'))
eq_(DirectoryState.Reference, d.get_state(p + 'dir1'))
def test_get_files():
d = Directories()
p = testpath + 'fs'
d.add_path(p)
d.set_state(p + 'dir1',STATE_REFERENCE)
d.set_state(p + 'dir2',STATE_EXCLUDED)
d.set_state(p + 'dir1', DirectoryState.Reference)
d.set_state(p + 'dir2', DirectoryState.Excluded)
files = list(d.get_files())
eq_(5, len(files))
for f in files:
@@ -176,11 +176,26 @@ def test_get_files():
else:
assert not f.is_ref
def test_get_folders():
d = Directories()
p = testpath + 'fs'
d.add_path(p)
d.set_state(p + 'dir1', DirectoryState.Reference)
d.set_state(p + 'dir2', DirectoryState.Excluded)
folders = list(d.get_folders())
eq_(len(folders), 3)
ref = [f for f in folders if f.is_ref]
not_ref = [f for f in folders if not f.is_ref]
eq_(len(ref), 1)
eq_(ref[0].path, p + 'dir1')
eq_(len(not_ref), 2)
eq_(ref[0].size, 1)
def test_get_files_with_inherited_exclusion():
d = Directories()
p = testpath + 'onefile'
d.add_path(p)
d.set_state(p,STATE_EXCLUDED)
d.set_state(p, DirectoryState.Excluded)
eq_([], list(d.get_files()))
def test_save_and_load(tmpdir):
@@ -192,14 +207,14 @@ def test_save_and_load(tmpdir):
io.mkdir(p2)
d1.add_path(p1)
d1.add_path(p2)
d1.set_state(p1, STATE_REFERENCE)
d1.set_state(p1 + 'dir1',STATE_EXCLUDED)
d1.set_state(p1, DirectoryState.Reference)
d1.set_state(p1 + 'dir1', DirectoryState.Excluded)
tmpxml = str(tmpdir.join('directories_testunit.xml'))
d1.save_to_file(tmpxml)
d2.load_from_file(tmpxml)
eq_(2, len(d2))
eq_(STATE_REFERENCE,d2.get_state(p1))
eq_(STATE_EXCLUDED,d2.get_state(p1 + 'dir1'))
eq_(DirectoryState.Reference, d2.get_state(p1))
eq_(DirectoryState.Excluded, d2.get_state(p1 + 'dir1'))
def test_invalid_path():
d = Directories()
@@ -211,7 +226,7 @@ def test_invalid_path():
def test_set_state_on_invalid_path():
d = Directories()
try:
d.set_state(Path('foobar',),STATE_NORMAL)
d.set_state(Path('foobar',), DirectoryState.Normal)
except LookupError:
assert False
@@ -237,7 +252,7 @@ def test_unicode_save(tmpdir):
io.mkdir(p1)
io.mkdir(p1 + 'foo\xe9')
d.add_path(p1)
d.set_state(p1 + 'foo\xe9', STATE_EXCLUDED)
d.set_state(p1 + 'foo\xe9', DirectoryState.Excluded)
tmpxml = str(tmpdir.join('directories_testunit.xml'))
try:
d.save_to_file(tmpxml)
@@ -268,17 +283,17 @@ def test_get_state_returns_excluded_by_default_for_hidden_directories(tmpdir):
hidden_dir_path = p + '.foo'
io.mkdir(p + '.foo')
d.add_path(p)
eq_(d.get_state(hidden_dir_path), STATE_EXCLUDED)
eq_(d.get_state(hidden_dir_path), DirectoryState.Excluded)
# But it can be overriden
d.set_state(hidden_dir_path, STATE_NORMAL)
eq_(d.get_state(hidden_dir_path), STATE_NORMAL)
d.set_state(hidden_dir_path, DirectoryState.Normal)
eq_(d.get_state(hidden_dir_path), DirectoryState.Normal)
def test_default_path_state_override(tmpdir):
# It's possible for a subclass to override the default state of a path
class MyDirectories(Directories):
def _default_state_for_path(self, path):
if 'foobar' in path:
return STATE_EXCLUDED
return DirectoryState.Excluded
d = MyDirectories()
p1 = Path(str(tmpdir))
@@ -287,11 +302,11 @@ def test_default_path_state_override(tmpdir):
io.mkdir(p1 + 'foobaz')
io.open(p1 + 'foobaz/somefile', 'w').close()
d.add_path(p1)
eq_(d.get_state(p1 + 'foobaz'), STATE_NORMAL)
eq_(d.get_state(p1 + 'foobar'), STATE_EXCLUDED)
eq_(d.get_state(p1 + 'foobaz'), DirectoryState.Normal)
eq_(d.get_state(p1 + 'foobar'), DirectoryState.Excluded)
eq_(len(list(d.get_files())), 1) # only the 'foobaz' file is there
# However, the default state can be changed
d.set_state(p1 + 'foobar', STATE_NORMAL)
eq_(d.get_state(p1 + 'foobar'), STATE_NORMAL)
d.set_state(p1 + 'foobar', DirectoryState.Normal)
eq_(d.get_state(p1 + 'foobar'), DirectoryState.Normal)
eq_(len(list(d.get_files())), 2)

core/tests/fs_test.py (new file, 45 lines)

@@ -0,0 +1,45 @@
# Created By: Virgil Dupras
# Created On: 2009-10-23
# Copyright 2011 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "BSD" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/bsd_license
import hashlib
from hscommon.path import Path
from hscommon.testutil import eq_
from core.tests.directories_test import create_fake_fs
from .. import fs
def test_size_aggregates_subfiles(tmpdir):
p = create_fake_fs(Path(str(tmpdir)))
b = fs.Folder(p)
eq_(b.size, 12)
def test_md5_aggregate_subfiles_sorted(tmpdir):
# dir.allfiles can return children in any order. Thus, bundle.md5 must aggregate
# all files' md5s it contains, but it must make sure that it does so in the
# same order every time.
p = create_fake_fs(Path(str(tmpdir)))
b = fs.Folder(p)
md51 = fs.File(p + ('dir1', 'file1.test')).md5
md52 = fs.File(p + ('dir2', 'file2.test')).md5
md53 = fs.File(p + ('dir3', 'file3.test')).md5
md54 = fs.File(p + 'file1.test').md5
md55 = fs.File(p + 'file2.test').md5
md56 = fs.File(p + 'file3.test').md5
# The expected md5: folders contribute the md5 of their children's md5s, files their md5 directly
folder_md51 = hashlib.md5(md51).digest()
folder_md52 = hashlib.md5(md52).digest()
folder_md53 = hashlib.md5(md53).digest()
md5 = hashlib.md5(folder_md51+folder_md52+folder_md53+md54+md55+md56)
eq_(b.md5, md5.digest())
def test_has_file_attrs(tmpdir):
# A Folder must behave like a File, so it must have mtime and extension attributes
b = fs.Folder(Path(str(tmpdir)))
assert b.mtime > 0
eq_(b.extension, '')
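
The expected digest in test_md5_aggregate_subfiles_sorted is spelled out by hand: each dirN contributes md5(md5 of its single file), the root files contribute their md5 directly, and the pieces are concatenated in path order (dir1, dir2, dir3, file1.test, file2.test, file3.test). A quick self-contained check of the property the comment above describes, namely that sorting by name makes the aggregate independent of discovery order (the sample digests are made up for illustration):

import hashlib
import random

child_md5s = {'dir1': b'a' * 16, 'dir2': b'b' * 16, 'file1.test': b'c' * 16}

def aggregate(names):
    return hashlib.md5(b''.join(child_md5s[n] for n in sorted(names))).hexdigest()

names = list(child_md5s)
random.shuffle(names)
assert aggregate(names) == aggregate(sorted(child_md5s))  # listing order doesn't matter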


@@ -471,3 +471,27 @@ def test_dont_group_files_that_dont_exist(tmpdir):
s._getmatches = getmatches
assert not s.GetDupeGroups([file1, file2])
def test_folder_scan_exclude_subfolder_matches(fake_fileexists):
# When doing a Folders scan type, don't include matches for folders whose parent folders already
# match.
s = Scanner()
s.scan_type = ScanType.Folders
topf1 = no("top folder 1", size=42)
topf1.md5 = topf1.md5partial = b"some_md5_1"
topf1.path = Path('/topf1')
topf2 = no("top folder 2", size=42)
topf2.md5 = topf2.md5partial = b"some_md5_1"
topf2.path = Path('/topf2')
subf1 = no("sub folder 1", size=41)
subf1.md5 = subf1.md5partial = b"some_md5_2"
subf1.path = Path('/topf1/sub')
subf2 = no("sub folder 2", size=41)
subf2.md5 = subf2.md5partial = b"some_md5_2"
subf2.path = Path('/topf2/sub')
eq_(len(s.GetDupeGroups([topf1, topf2, subf1, subf2])), 1) # only top folders
# however, if another folder matches a subfolder, keep it in the matches
otherf = no("other folder", size=41)
otherf.md5 = otherf.md5partial = b"some_md5_2"
otherf.path = Path('/otherfolder')
eq_(len(s.GetDupeGroups([topf1, topf2, subf1, subf2, otherf])), 2)