1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2025-05-08 01:39:50 +00:00

Update directory scanning to use os.scandir()

- Change to use os.scandir() instead of os.walk() to leverage DirEntry objects.
- Avoids extra calls to stat() on files during fs.can_handle()
- Yields a 3x speed improvement on Windows in some cases
This commit is contained in:
Andrew Senetar 2022-03-29 23:37:56 -05:00
parent 43fcc52291
commit efd500ecc1
Signed by: arsenetar
GPG Key ID: C63300DCE48AB2F1
2 changed files with 41 additions and 43 deletions

View File

@ -90,46 +90,44 @@ class Directories:
return DirectoryState.EXCLUDED return DirectoryState.EXCLUDED
def _get_files(self, from_path, fileclasses, j): def _get_files(self, from_path, fileclasses, j):
for root, dirs, files in os.walk(str(from_path)):
j.check_if_cancelled()
root_path = Path(root)
state = self.get_state(root_path)
if state == DirectoryState.EXCLUDED and not any(
p.parts[: len(root_path.parts)] == root_path.parts for p in self.states
):
# Recursively get files from folders with lots of subfolder is expensive. However, there
# might be a subfolder in this path that is not excluded. What we want to do is to skim
# through self.states and see if we must continue, or we can stop right here to save time
del dirs[:]
try: try:
if state != DirectoryState.EXCLUDED: with os.scandir(from_path) as iter:
# Old logic root_path = Path(from_path)
if self._exclude_list is None or not self._exclude_list.mark_count: state = self.get_state(root_path)
found_files = [fs.get_file(root_path.joinpath(f), fileclasses=fileclasses) for f in files] # if we have no un-excluded dirs under this directory skip going deeper
else: skip_dirs = state == DirectoryState.EXCLUDED and not any(
found_files = [] p.parts[: len(root_path.parts)] == root_path.parts for p in self.states
# print(f"len of files: {len(files)} {files}") )
for f in files: count = 0
if not self._exclude_list.is_excluded(root, f): for item in iter:
found_files.append(fs.get_file(root_path.joinpath(f), fileclasses=fileclasses)) j.check_if_cancelled()
found_files = [f for f in found_files if f is not None] try:
# In some cases, directories can be considered as files by dupeGuru, which is if item.is_dir():
# why we have this line below. In fact, there only one case: Bundle files under if skip_dirs:
# OS X... In other situations, this forloop will do nothing. continue
for d in dirs[:]: yield from self._get_files(item.path, fileclasses, j)
f = fs.get_file(root_path.joinpath(d), fileclasses=fileclasses) continue
if f is not None: elif state == DirectoryState.EXCLUDED:
found_files.append(f) continue
dirs.remove(d) # File excluding or not
if (
self._exclude_list is None
or not self._exclude_list.mark_count
or not self._exclude_list.is_excluded(str(from_path), item.name)
):
file = fs.get_file(item, fileclasses=fileclasses)
if file:
file.is_ref = state == DirectoryState.REFERENCE
count += 1
yield file
except (EnvironmentError, OSError, fs.InvalidPath):
pass
logging.debug( logging.debug(
"Collected %d files in folder %s", "Collected %d files in folder %s",
len(found_files), count,
str(root_path), str(root_path),
) )
for file in found_files: except OSError:
file.is_ref = state == DirectoryState.REFERENCE
yield file
except (EnvironmentError, fs.InvalidPath):
pass pass
def _get_folders(self, from_folder, j): def _get_folders(self, from_folder, j):

View File

@ -379,7 +379,7 @@ class Folder(File):
if self._subfolders is None: if self._subfolders is None:
with os.scandir(self.path) as iter: with os.scandir(self.path) as iter:
subfolders = [p.path for p in iter if not p.is_symlink() and p.is_dir()] subfolders = [p.path for p in iter if not p.is_symlink() and p.is_dir()]
self._subfolders = [self.__class__(p) for p in subfolders] self._subfolders = [self.__class__(Path(p)) for p in subfolders]
return self._subfolders return self._subfolders
@classmethod @classmethod
@ -398,7 +398,7 @@ def get_file(path, fileclasses=[File]):
for fileclass in fileclasses: for fileclass in fileclasses:
if fileclass.can_handle(path): if fileclass.can_handle(path):
if type(path) is os.DirEntry: if type(path) is os.DirEntry:
return fileclass(path.path) return fileclass(Path(path.path))
return fileclass(path) return fileclass(path)