1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2025-05-07 09:19:50 +00:00

Compare commits

...

5 Commits

Author SHA1 Message Date
a37b5b0eeb
Fix #988 2022-03-30 01:06:51 -05:00
efd500ecc1
Update directory scanning to use os.scandir()
- Change to use os.scandir() instead of os.walk() to leverage DirEntry objects.
- Avoids extra calls to stat() on files during fs.can_handle()
- Yields a 3x speed improvement on Windows in some cases
2022-03-29 23:37:56 -05:00
43fcc52291
Replace pathlib.glob() with os.scandir() in fs.py 2022-03-29 22:35:38 -05:00
50f5db1543
Update fs to support DirEntry on get_file() 2022-03-29 22:32:36 -05:00
a5b0ccdd02
Improve performance of Directories.get_state() 2022-03-29 21:48:14 -05:00
3 changed files with 55 additions and 56 deletions

View File

@ -248,7 +248,7 @@ class DupeGuru(Broadcaster):
ref = group.ref ref = group.ref
linkfunc = os.link if use_hardlinks else os.symlink linkfunc = os.link if use_hardlinks else os.symlink
linkfunc(str(ref.path), str_path) linkfunc(str(ref.path), str_path)
self.clean_empty_dirs(dupe.path.parent()) self.clean_empty_dirs(dupe.path.parent)
def _create_file(self, path): def _create_file(self, path):
# We add fs.Folder to fileclasses in case the file we're loading contains folder paths. # We add fs.Folder to fileclasses in case the file we're loading contains folder paths.

View File

@ -90,47 +90,45 @@ class Directories:
return DirectoryState.EXCLUDED return DirectoryState.EXCLUDED
def _get_files(self, from_path, fileclasses, j): def _get_files(self, from_path, fileclasses, j):
for root, dirs, files in os.walk(str(from_path)): try:
j.check_if_cancelled() with os.scandir(from_path) as iter:
root_path = Path(root) root_path = Path(from_path)
state = self.get_state(root_path) state = self.get_state(root_path)
if state == DirectoryState.EXCLUDED and not any( # if we have no un-excluded dirs under this directory skip going deeper
p.parts[: len(root_path.parts)] == root_path.parts for p in self.states skip_dirs = state == DirectoryState.EXCLUDED and not any(
): p.parts[: len(root_path.parts)] == root_path.parts for p in self.states
# Recursively getting files from folders with lots of subfolders is expensive. However, there )
# might be a subfolder in this path that is not excluded. What we want to do is to skim count = 0
# through self.states and see if we must continue, or we can stop right here to save time for item in iter:
del dirs[:] j.check_if_cancelled()
try: try:
if state != DirectoryState.EXCLUDED: if item.is_dir():
# Old logic if skip_dirs:
if self._exclude_list is None or not self._exclude_list.mark_count: continue
found_files = [fs.get_file(root_path.joinpath(f), fileclasses=fileclasses) for f in files] yield from self._get_files(item.path, fileclasses, j)
else: continue
found_files = [] elif state == DirectoryState.EXCLUDED:
# print(f"len of files: {len(files)} {files}") continue
for f in files: # File excluding or not
if not self._exclude_list.is_excluded(root, f): if (
found_files.append(fs.get_file(root_path.joinpath(f), fileclasses=fileclasses)) self._exclude_list is None
found_files = [f for f in found_files if f is not None] or not self._exclude_list.mark_count
# In some cases, directories can be considered as files by dupeGuru, which is or not self._exclude_list.is_excluded(str(from_path), item.name)
# why we have this line below. In fact, there is only one case: Bundle files under ):
# OS X... In other situations, this for loop will do nothing. file = fs.get_file(item, fileclasses=fileclasses)
for d in dirs[:]: if file:
f = fs.get_file(root_path.joinpath(d), fileclasses=fileclasses) file.is_ref = state == DirectoryState.REFERENCE
if f is not None: count += 1
found_files.append(f) yield file
dirs.remove(d) except (EnvironmentError, OSError, fs.InvalidPath):
logging.debug( pass
"Collected %d files in folder %s", logging.debug(
len(found_files), "Collected %d files in folder %s",
str(root_path), count,
) str(root_path),
for file in found_files: )
file.is_ref = state == DirectoryState.REFERENCE except OSError:
yield file pass
except (EnvironmentError, fs.InvalidPath):
pass
def _get_folders(self, from_folder, j): def _get_folders(self, from_folder, j):
j.check_if_cancelled() j.check_if_cancelled()
@ -222,14 +220,11 @@ class Directories:
if state != DirectoryState.NORMAL: if state != DirectoryState.NORMAL:
self.states[path] = state self.states[path] = state
return state return state
# find the longest parent path that is in states and return that state if found
prevlen = 0 # NOTE: path.parents is ordered longest to shortest
# we loop through the states to find the longest matching prefix for parent_path in path.parents:
# if the parent has a state in cache, return that state if parent_path in self.states:
for p, s in self.states.items(): return self.states[parent_path]
if p in path.parents and len(p.parts) > prevlen:
prevlen = len(p.parts)
state = s
return state return state
def has_any_file(self): def has_any_file(self):

View File

@ -377,8 +377,9 @@ class Folder(File):
@property @property
def subfolders(self): def subfolders(self):
if self._subfolders is None: if self._subfolders is None:
subfolders = [p for p in self.path.glob("*") if not p.is_symlink() and p.is_dir()] with os.scandir(self.path) as iter:
self._subfolders = [self.__class__(p) for p in subfolders] subfolders = [p.path for p in iter if not p.is_symlink() and p.is_dir()]
self._subfolders = [self.__class__(Path(p)) for p in subfolders]
return self._subfolders return self._subfolders
@classmethod @classmethod
@ -396,6 +397,8 @@ def get_file(path, fileclasses=[File]):
""" """
for fileclass in fileclasses: for fileclass in fileclasses:
if fileclass.can_handle(path): if fileclass.can_handle(path):
if type(path) is os.DirEntry:
return fileclass(Path(path.path))
return fileclass(path) return fileclass(path)
@ -408,10 +411,11 @@ def get_files(path, fileclasses=[File]):
assert all(issubclass(fileclass, File) for fileclass in fileclasses) assert all(issubclass(fileclass, File) for fileclass in fileclasses)
try: try:
result = [] result = []
for path in path.glob("*"): with os.scandir(path) as iter:
file = get_file(path, fileclasses=fileclasses) for item in iter:
if file is not None: file = get_file(item, fileclasses=fileclasses)
result.append(file) if file is not None:
result.append(file)
return result return result
except EnvironmentError: except EnvironmentError:
raise InvalidPath(path) raise InvalidPath(path)