ci: Merge artifacts

- Merge the resulting artifacts - Use only the .so files from build
chore: Update VS Code configuration
2024-05-11 01:21:58 -07:00 · 2024-05-11 00:12:19 -07:00 · 2024-05-11 00:11:27 -07:00 · 2024-02-19 14:39:04 -08:00 · 2024-02-19 14:22:55 -08:00 · 2024-02-19 14:17:40 -08:00
310 changed files with 14600 additions and 10950 deletions
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: arsenetar
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@ -0,0 +1,50 @@
+name: "CodeQL"
+
+on:
+  push:
+    branches: [master]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [master]
+  schedule:
+    - cron: "24 20 * * 2"
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: ["cpp", "python"]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # Initializes the CodeQL tools for scanning.
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v3
+        with:
+          languages: ${{ matrix.language }}
+          # If you wish to specify custom queries, you can do so here or in a config file.
+          # By default, queries listed here will override any specified in a config file.
+          # Prefix the list here with "+" to use these queries and those in the config file.
+          # queries: ./path/to/local/query, your-org/your-repo/queries@main
+      # The C extensions need a manual build so CodeQL can trace the compiler.
+      - if: matrix.language == 'cpp'
+        name: Build Cpp
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y python3-pyqt5
+          make modules
+      - if: matrix.language == 'python'
+        name: Autobuild
+        uses: github/codeql-action/autobuild@v3
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v3
--- a/.github/workflows/default.yml
+++ b/.github/workflows/default.yml
@ -0,0 +1,65 @@
+# Workflow lints, and checks format in parallel then runs tests on all platforms
+
+name: Default CI/CD
+
+on:
+  push:
+  pull_request:
+    branches: [master]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - uses: pre-commit/action@v3.0.1
+  test:
+    needs: [pre-commit]
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]  # strings, so 3.10 is never read as 3.1
+        include:
+          - os: windows-latest
+            python-version: "3.12"
+          - os: macos-latest
+            python-version: "3.12"
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools
+          pip install -r requirements.txt -r requirements-extra.txt
+      - name: Build python modules
+        run: |
+          python build.py --modules
+      - name: Run tests
+        run: |
+          pytest core hscommon
+      # Only the Linux builds produce the .so files that get published.
+      - name: Upload Artifacts
+        if: matrix.os == 'ubuntu-latest'
+        uses: actions/upload-artifact@v4
+        with:
+          name: modules ${{ matrix.python-version }}
+          path: build/**/*.so
+  # Collapse the per-version "modules <version>" artifacts into a single "modules" artifact.
+  merge-artifacts:
+    needs: [test]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Merge Artifacts
+        uses: actions/upload-artifact/merge@v4
+        with:
+          name: modules
+          pattern: modules*
+          delete-merged: true
--- a/.github/workflows/tx-push.yml
+++ b/.github/workflows/tx-push.yml
@ -0,0 +1,26 @@
+# Push translation source to Transifex
+name: Transifex Sync
+
+on:
+  push:
+    branches:
+      - master
+    paths:
+      - locale/*.pot
+
+env:
+  TX_VERSION: "v1.6.10"
+
+jobs:
+  push-source:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Get Transifex Client
+        run: |
+          curl -o- https://raw.githubusercontent.com/transifex/cli/master/install.sh | bash -s -- $TX_VERSION
+      - name: Update & Push Translation Sources
+        env:
+          TX_TOKEN: ${{ secrets.TX_TOKEN }}
+        run: |
+          ./tx push -s --use-git-timestamps
--- a/.gitignore
+++ b/.gitignore
@ -1,28 +1,111 @@
-.DS_Store
-__pycache__
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
 *.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
 *.mo
-*.waf*
-.lock-waf*
-.tox
-/tags
+#*.pot

-build
-dist
-env*
-/deps
-cocoa/autogen
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/

-/run.py
-/cocoa/*/Info.plist
-/cocoa/*/build
+# Environments
+.env
+.venv
+env*/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# macOS
+.DS_Store
+
+# Visual Studio Code
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+# dupeGuru Specific
 /qt/*_rc.py
 /help/*/conf.py
 /help/*/changelog.rst
-/transifex
+cocoa/autogen
+/cocoa/*/Info.plist
+/cocoa/*/build

-*.pyd
-*.exe
-*.spec
-
-.vscode
+*.waf*
+.lock-waf*
+/tags
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,24 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: check-yaml
+      - id: check-toml
+      - id: end-of-file-fixer
+        # Single quotes keep the backslash literal; anchored so only *.json files are skipped.
+        exclude: '.*\.json$'
+      - id: trailing-whitespace
+  - repo: https://github.com/psf/black
+    rev: 24.2.0
+    hooks:
+      - id: black
+  - repo: https://github.com/PyCQA/flake8
+    rev: 7.0.0
+    hooks:
+      - id: flake8
+        exclude: ^(\.tox|env|build|dist|help|qt/dg_rc\.py|pkg).*
+  - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
+    rev: v9.11.0
+    hooks:
+      - id: commitlint
+        stages: [commit-msg]
+        additional_dependencies: ["@commitlint/config-conventional"]
--- a/.sonarcloud.properties
+++ b/.sonarcloud.properties
@ -0,0 +1 @@
+sonar.python.version=3.7, 3.8, 3.9, 3.10, 3.11, 3.12
--- a/.travis.yml
+++ b/.travis.yml
@ -1,27 +0,0 @@
-sudo: false
-language: python
-install: 
-        - pip3 install -r requirements.txt -r requirements-extra.txt
-script: tox
-matrix:
-        include:
-                - os: "linux"
-                  dist: "xenial"
-                  python: "3.6"
-                - os: "linux"
-                  dist: "xenial"
-                  python: "3.7"
-                - os: "linux"
-                  dist: "focal"
-                  python: "3.8"
-                - os: "linux"
-                  dist: "focal"
-                  python: "3.9"
-                - os: "windows"
-                  language: shell
-                  python: "3.8"
-                  env: "PATH=/c/python38:/c/python38/Scripts:$PATH"
-                  before_install:
-                          - choco install python --version=3.8.6
-                          - cp /c/python38/python.exe /c/python38/python3.exe
-                  script: tox -e py38
--- a/.tx/config
+++ b/.tx/config
@ -1,21 +1,20 @@
 [main]
 host = https://www.transifex.com

-[dupeguru.core]
-file_filter = locale/<lang>/LC_MESSAGES/core.po
-source_file = locale/core.pot
-source_lang = en
-type = PO
-
-[dupeguru.columns]
+[o:voltaicideas:p:dupeguru-1:r:columns]
 file_filter = locale/<lang>/LC_MESSAGES/columns.po
 source_file = locale/columns.pot
 source_lang = en
-type = PO
+type        = PO

-[dupeguru.ui]
+[o:voltaicideas:p:dupeguru-1:r:core]
+file_filter = locale/<lang>/LC_MESSAGES/core.po
+source_file = locale/core.pot
+source_lang = en
+type        = PO
+
+[o:voltaicideas:p:dupeguru-1:r:ui]
 file_filter = locale/<lang>/LC_MESSAGES/ui.po
 source_file = locale/ui.pot
 source_lang = en
-type = PO
-
+type        = PO
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@ -0,0 +1,12 @@
+{
+    // List of extensions which should be recommended for users of this workspace.
+    "recommendations": [
+        "redhat.vscode-yaml",
+        "ms-python.vscode-pylance",
+        "ms-python.python",
+        "ms-python.black-formatter",
+    ],
+    // List of extensions recommended by VS Code that should not be recommended for
+    // users of this workspace.
+    "unwantedRecommendations": []
+}
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -0,0 +1,17 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "dupeGuru",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "run.py",
+            "console": "integratedTerminal",
+            "subProcess": true,
+            "justMyCode": false
+        },
+    ]
+}
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -0,0 +1,17 @@
+{
+    "cSpell.words": [
+        "Dupras",
+        "hscommon"
+    ],
+    "editor.rulers": [
+        88,
+        120
+    ],
+    "python.languageServer": "Pylance",
+    "yaml.schemaStore.enable": true,
+    "[python]": {
+        "editor.formatOnSave": true,
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    },
+    "python.testing.pytestEnabled": true
+}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,88 @@
+# Contributing to dupeGuru
+
+The following is a set of guidelines and information for contributing to dupeGuru.
+
+#### Table of Contents
+
+[Things to Know Before Starting](#things-to-know-before-starting)
+
+[Ways to Contribute](#ways-to-contribute)
+  * [Reporting Bugs](#reporting-bugs)
+  * [Suggesting Enhancements](#suggesting-enhancements)
+  * [Localization](#localization)
+  * [Code Contribution](#code-contribution)
+  * [Pull Requests](#pull-requests)
+
+[Style Guides](#style-guides)
+  * [Git Commit Messages](#git-commit-messages)
+  * [Python Style Guide](#python-style-guide)
+  * [Documentation Style Guide](#documentation-style-guide)
+
+[Additional Notes](#additional-notes)
+  * [Issue and Pull Request Labels](#issue-and-pull-request-labels)
+
+## Things to Know Before Starting
+**TODO**
+## Ways to Contribute
+### Reporting Bugs
+**TODO**
+### Suggesting Enhancements
+**TODO**
+### Localization
+**TODO**
+### Code Contribution
+**TODO**
+### Pull Requests
+Please follow these steps to have your contribution considered by the maintainers:
+
+1. Keep each pull request specific to one feature or bug.
+2. Follow the [style guides](#style-guides)
+3. After you submit your pull request, verify that all [status checks](https://help.github.com/articles/about-status-checks/) are passing <details><summary>What if the status checks are failing?</summary>If a status check is failing, and you believe that the failure is unrelated to your change, please leave a comment on the pull request explaining why you believe the failure is unrelated. A maintainer will re-run the status check for you. If we conclude that the failure was a false positive, then we will open an issue to track that problem with our status check suite.</details>
+
+While the prerequisites above must be satisfied prior to having your pull request reviewed, the reviewer(s) may ask you to complete additional design work, tests, or other changes before your pull request can be ultimately accepted.
+
+## Style Guides
+### Git Commit Messages
+- Use the present tense ("Add feature" not "Added feature")
+- Use the imperative mood ("Move cursor to..." not "Moves cursor to...")
+- Limit the first line to 72 characters or less
+- Reference issues and pull requests liberally after the first line
+
+### Python Style Guide
+- All files are formatted with [Black](https://github.com/psf/black)
+- Follow [PEP 8](https://peps.python.org/pep-0008/) as much as practical
+- Pass [flake8](https://flake8.pycqa.org/en/latest/) linting
+- Include [PEP 484](https://peps.python.org/pep-0484/) type hints (new code)
+
+### Documentation Style Guide
+**TODO**
+
+## Additional Notes
+### Issue and Pull Request Labels
+This section lists and describes the various labels used with issues and pull requests.  Each of the labels is listed with a search link as well.
+
+#### Issue Type and Status
+| Label name | Search | Description |
+|------------|--------|-------------|
+| `enhancement` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Aenhancement) | Feature requests and enhancements. |
+| `bug` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Abug) | Bug reports. |
+| `duplicate` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Aduplicate) | Issue is a duplicate of existing issue. |
+| `needs-reproduction` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Aneeds-reproduction) | A bug that has not been able to be reproduced. |
+| `needs-information` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Aneeds-information) | More information needs to be collected about these problems or feature requests (e.g. steps to reproduce). |
+| `blocked` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Ablocked) | Issue blocked by other issues. |
+| `beginner` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Abeginner) | Less complex issues for users who want to start contributing. |
+
+#### Category Labels
+| Label name | Search | Description |
+|------------|--------|-------------|
+| `3rd party` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3A%223rd%20party%22)  | Related to a 3rd party dependency. |
+| `crash` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Acrash) | Related to crashes (complete, or unhandled). |
+| `documentation` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Adocumentation) | Related to any documentation. |
+| `linux` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Alinux) | Related to running on Linux. |
+| `mac` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Amac) | Related to running on macOS. |
+| `performance` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Aperformance) | Related to the performance. |
+| `ui` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Aui)| Related to the visual design. |
+| `windows` | [search](https://github.com/arsenetar/dupeguru/issues?q=is%3Aopen+is%3Aissue+label%3Awindows) | Related to running on Windows. |
+
+#### Pull Request Labels
+None at this time. If the volume of pull requests increases, labels may be added to help manage them.
--- a/1
+++ b/1
@ -619,4 +619,3 @@ Program, unless a warranty or assumption of liability accompanies a
 copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS
-
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -0,0 +1,5 @@
+recursive-include core *.h
+recursive-include core *.m
+include run.py
+graft locale
+graft help
--- a/14
+++ b/14
@ -1,10 +1,10 @@
 PYTHON ?= python3
 PYTHON_VERSION_MINOR := $(shell ${PYTHON} -c "import sys; print(sys.version_info.minor)")
 PYRCC5 ?= pyrcc5
-REQ_MINOR_VERSION = 6
+REQ_MINOR_VERSION = 7
 PREFIX ?= /usr/local

-# Window compatability via Msys2 
+# Windows compatibility via Msys2
 # - venv creates Scripts instead of bin
 # - compile generates .pyd instead of .so
 # - venv with --sytem-site-packages has issues on windows as well...
@ -12,7 +12,7 @@ PREFIX ?= /usr/local
 ifeq ($(shell ${PYTHON} -c "import platform; print(platform.system())"), Windows)
 	BIN = Scripts
 	SO = *.pyd
-	VENV_OPTIONS = 
+	VENV_OPTIONS =
 else
 	BIN = bin
 	SO = *.so
@ -35,7 +35,7 @@ endif
 # Our build scripts are not very "make like" yet and perform their task in a bundle. For now, we
 # use one of each file to act as a representative, a target, of these groups.

-packages = hscommon qtlib core qt
+packages = hscommon core qt
 localedirs = $(wildcard locale/*/LC_MESSAGES)
 pofiles = $(wildcard locale/*/LC_MESSAGES/*.po)
 mofiles = $(patsubst %.po,%.mo,$(pofiles))
@ -43,7 +43,7 @@ mofiles = $(patsubst %.po,%.mo,$(pofiles))
 vpath %.po $(localedirs)
 vpath %.mo $(localedirs)

-all: | env i18n modules qt/dg_rc.py 
+all: | env i18n modules qt/dg_rc.py
 	@echo "Build complete! You can run dupeGuru with 'make run'"

 run:
@ -53,7 +53,7 @@ pyc: | env
 	${VENV_PYTHON} -m compileall ${packages}

 reqs:
-ifneq ($(shell test $(PYTHON_VERSION_MINOR) -gt $(REQ_MINOR_VERSION); echo $$?),0)
+ifneq ($(shell test $(PYTHON_VERSION_MINOR) -ge $(REQ_MINOR_VERSION); echo $$?),0)
 	$(error "Python 3.${REQ_MINOR_VERSION}+ required. Aborting.")
 endif
 ifndef NO_VENV
@ -82,7 +82,7 @@ qt/dg_rc.py: qt/dg.qrc
 i18n: $(mofiles)

 %.mo: %.po
-	msgfmt -o $@ $<	
+	msgfmt -o $@ $<

 modules: | env
 	$(VENV_PYTHON) build.py --modules
--- a/README.md
+++ b/README.md
@ -1,16 +1,12 @@
 # dupeGuru

 [dupeGuru][dupeguru] is a cross-platform (Linux, OS X, Windows) GUI tool to find duplicate files in
-a system. It is written mostly in Python 3 and has the peculiarity of using
-[multiple GUI toolkits][cross-toolkit], all using the same core Python code. On OS X, the UI layer
-is written in Objective-C and uses Cocoa. On Linux, it is written in Python and uses Qt5.
-
-The Cocoa UI of dupeGuru is hosted in a separate repo: https://github.com/arsenetar/dupeguru-cocoa
+a system. It is written mostly in Python 3 and uses [qt](https://www.qt.io/) for the UI.

 ## Current status
 Still looking for additional help especially with regards to:
-* OSX maintenance: reproducing bugs & cocoa version, building package with Cocoa UI.
-* Linux maintenance: reproducing bugs, maintaining PPA repository, Debian package.
+* OSX maintenance: reproducing bugs, packaging verification.
+* Linux maintenance: reproducing bugs, maintaining PPA repository, Debian package, rpm package.
 * Translations: updating missing strings, transifex project at https://www.transifex.com/voltaicideas/dupeguru-1
 * Documentation: keeping it up-to-date.

@ -26,7 +22,6 @@ This folder contains the source for dupeGuru. Its documentation is in `help`, bu
 * help: Help document, written for Sphinx.
 * locale: .po files for localization.
 * hscommon: A collection of helpers used across HS applications.
-* qtlib: A collection of helpers used across Qt UI codebases of HS applications.

 ## How to build dupeGuru from source

@ -36,19 +31,17 @@ For windows instructions see the [Windows Instructions](Windows.md).
 For macos instructions (qt version) see the [macOS Instructions](macos.md).

 ### Prerequisites
-* [Python 3.6+][python]
+* [Python 3.7+][python]
 * PyQt5

 ### System Setup
 When running in a linux based environment the following system packages or equivalents are needed to build:
 * python3-pyqt5
 * pyqt5-dev-tools (on some systems, see note)
-* python3-wheel (for hsaudiotag3k)
 * python3-venv (only if using a virtual environment)
 * python3-dev
 * build-essential

-
 Note: On some linux systems pyrcc5 is not put on the path when installing python3-pyqt5, this will cause some issues with the resource files (and icons). These systems should have a respective pyqt5-dev-tools package, which should also be installed. The presence of pyrcc5 can be checked with `which pyrcc5`.  Debian based systems need the extra package, and Arch does not.

 To create packages the following are also needed:
@ -70,7 +63,7 @@ dupeGuru comes with a makefile that can be used to build and run:
    $ python run.py

 ### Generating Debian/Ubuntu package
-To generate packages the extra requirements in requirements-extra.txt must be installed, the 
+To generate packages the extra requirements in requirements-extra.txt must be installed, the
 steps are as follows:

    $ cd <dupeGuru directory>
--- a/Windows.md
+++ b/Windows.md
@ -2,7 +2,7 @@

 ### Prerequisites

- [Python 3.6+][python]
+- [Python 3.7+][python]
 - [Visual Studio 2019][vs] or [Visual Studio Build Tools 2019][vsBuildTools] with the Windows 10 SDK
 - [nsis][nsis] (for installer creation)
 - [msys2][msys2] (for using makefile method)
@ -16,7 +16,7 @@ After installing python it is recommended to update setuptools before compiling
 More details on setting up python for compiling packages on windows can be found on the [python wiki][pythonWindowsCompilers] Take note of the required vc++ versions.

 ### With build.py (preferred)
-To build with a different python version 3.6 vs 3.8 or 32 bit vs 64 bit specify that version instead of -3.8 to the `py` command below.  If you want to build additional versions while keeping all virtual environments setup use a different location for each virtual environment.
+To build with a different python version 3.7 vs 3.8 or 32 bit vs 64 bit specify that version instead of -3.8 to the `py` command below.  If you want to build additional versions while keeping all virtual environments setup use a different location for each virtual environment.

    $ cd <dupeGuru directory>
    $ py -3.8 -m venv .\env
@ -29,7 +29,7 @@ To build with a different python version 3.6 vs 3.8 or 32 bit vs 64 bit specify
 It is possible to build dupeGuru with the makefile on windows using a compatable POSIX environment.  The following steps have been tested using [msys2][msys2]. Before running make:
 1. Install msys2 or other POSIX environment
 2. Install PyQt5 globally via pip
-3. Use the respective console for msys2 it is `msys2 msys` 
+3. Use the respective console for msys2 it is `msys2 msys`

 Then the following execution of the makefile should work.  Pass the correct value for PYTHON to the makefile if not on the path as python3.

--- a/build.py
+++ b/build.py
@ -4,19 +4,17 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-import os
-import os.path as op
+from pathlib import Path
+import sys
 from optparse import OptionParser
 import shutil
-from pathlib import Path
-
-from setuptools import setup, Extension
+from multiprocessing import Pool

+from setuptools import sandbox
 from hscommon import sphinxgen
 from hscommon.build import (
    add_to_pythonpath,
    print_and_do,
-    move_all,
    fix_qt_resource_file,
 )
 from hscommon import loc
@ -31,12 +29,9 @@ def parse_args():
        dest="clean",
        help="Clean build folder before building",
    )
-    parser.add_option(
-        "--doc", action="store_true", dest="doc", help="Build only the help file"
-    )
-    parser.add_option(
-        "--loc", action="store_true", dest="loc", help="Build only localization"
-    )
+    parser.add_option("--doc", action="store_true", dest="doc", help="Build only the help file (en)")
+    parser.add_option("--alldoc", action="store_true", dest="all_doc", help="Build only the help file in all languages")
+    parser.add_option("--loc", action="store_true", dest="loc", help="Build only localization")
    parser.add_option(
        "--updatepot",
        action="store_true",
@ -61,26 +56,20 @@ def parse_args():
        dest="modules",
        help="Build the python modules.",
    )
-    parser.add_option(
-        "--importpo",
-        action="store_true",
-        dest="importpo",
-        help="Import all PO files downloaded from transifex.",
-    )
    (options, args) = parser.parse_args()
    return options


-def build_help():
-    print("Generating Help")
-    current_path = op.abspath(".")
-    help_basepath = op.join(current_path, "help", "en")
-    help_destpath = op.join(current_path, "build", "help")
-    changelog_path = op.join(current_path, "help", "changelog")
+def build_one_help(language):
+    print(f"Generating Help in {language}")
+    current_path = Path(".").absolute()
+    changelog_path = current_path.joinpath("help", "changelog")
    tixurl = "https://github.com/arsenetar/dupeguru/issues/{}"
-    confrepl = {"language": "en"}
-    changelogtmpl = op.join(current_path, "help", "changelog.tmpl")
-    conftmpl = op.join(current_path, "help", "conf.tmpl")
+    changelogtmpl = current_path.joinpath("help", "changelog.tmpl")
+    conftmpl = current_path.joinpath("help", "conf.tmpl")
+    help_basepath = current_path.joinpath("help", language)
+    help_destpath = current_path.joinpath("build", "help", language)
+    confrepl = {"language": language}
    sphinxgen.gen(
        help_basepath,
        help_destpath,
@ -92,103 +81,44 @@ def build_help():
    )


-def build_qt_localizations():
-    loc.compile_all_po(op.join("qtlib", "locale"))
-    loc.merge_locale_dir(op.join("qtlib", "locale"), "locale")
+def build_help():
+    languages = ["en", "de", "fr", "hy", "ru", "uk"]
+    # Running with Pools as for some reason sphinx seems to cross contaminate the output otherwise
+    with Pool(len(languages)) as p:
+        p.map(build_one_help, languages)


 def build_localizations():
    loc.compile_all_po("locale")
-    build_qt_localizations()
-    locale_dest = op.join("build", "locale")
-    if op.exists(locale_dest):
+    locale_dest = Path("build", "locale")
+    if locale_dest.exists():
        shutil.rmtree(locale_dest)
-    shutil.copytree(
-        "locale", locale_dest, ignore=shutil.ignore_patterns("*.po", "*.pot")
-    )
+    shutil.copytree("locale", locale_dest, ignore=shutil.ignore_patterns("*.po", "*.pot"))


 def build_updatepot():
    print("Building .pot files from source files")
    print("Building core.pot")
-    loc.generate_pot(["core"], op.join("locale", "core.pot"), ["tr"])
+    loc.generate_pot(["core"], Path("locale", "core.pot"), ["tr"])
    print("Building columns.pot")
-    loc.generate_pot(["core"], op.join("locale", "columns.pot"), ["coltr"])
+    loc.generate_pot(["core"], Path("locale", "columns.pot"), ["coltr"])
    print("Building ui.pot")
-    # When we're not under OS X, we don't want to overwrite ui.pot because it contains Cocoa locs
-    # We want to merge the generated pot with the old pot in the most preserving way possible.
-    ui_packages = ["qt", op.join("cocoa", "inter")]
-    loc.generate_pot(ui_packages, op.join("locale", "ui.pot"), ["tr"], merge=True)
-    print("Building qtlib.pot")
-    loc.generate_pot(["qtlib"], op.join("qtlib", "locale", "qtlib.pot"), ["tr"])
+    loc.generate_pot(["qt"], Path("locale", "ui.pot"), ["tr"], merge=True)


 def build_mergepot():
    print("Updating .po files using .pot files")
    loc.merge_pots_into_pos("locale")
-    loc.merge_pots_into_pos(op.join("qtlib", "locale"))
-    # loc.merge_pots_into_pos(op.join("cocoalib", "locale"))


 def build_normpo():
    loc.normalize_all_pos("locale")
-    loc.normalize_all_pos(op.join("qtlib", "locale"))
-    # loc.normalize_all_pos(op.join("cocoalib", "locale"))
-
-
-def build_importpo():
-    basePath = Path.cwd()
-    # expect a folder named transifex with all the .po files from the exports
-    translationsPath = basePath.joinpath("transifex")
-    # locations where the translation files go
-    qtlibPath = basePath.joinpath("qtlib", "locale")
-    localePath = basePath.joinpath("locale")
-    for translation in translationsPath.iterdir():
-        # transifex files are named resource_lang.po so split on first '_'
-        parts = translation.stem.split("_", 1)
-        resource = parts[0]
-        language = parts[1]
-        # make sure qtlib resources go to dedicated folder
-        if resource == "qtlib":
-            outputPath = qtlibPath
-        else:
-            outputPath = localePath
-        outputFolder = outputPath.joinpath(language, "LC_MESSAGES")
-        # create the language folder if it is new
-        if not outputFolder.exists():
-            outputFolder.mkdir(parents=True)
-        # copy the po file over
-        shutil.copy(translation, outputFolder.joinpath(resource + ".po"))
-    # normalize files after complete
-    build_normpo()


 def build_pe_modules():
    print("Building PE Modules")
-    exts = [
-        Extension(
-            "_block",
-            [
-                op.join("core", "pe", "modules", "block.c"),
-                op.join("core", "pe", "modules", "common.c"),
-            ],
-        ),
-        Extension(
-            "_cache",
-            [
-                op.join("core", "pe", "modules", "cache.c"),
-                op.join("core", "pe", "modules", "common.c"),
-            ],
-        ),
-    ]
-    exts.append(Extension("_block_qt", [op.join("qt", "pe", "modules", "block.c")]))
-    setup(
-        script_args=["build_ext", "--inplace"],
-        ext_modules=exts,
-    )
-    move_all("_block_qt*", op.join("qt", "pe"))
-    move_all("_block*", op.join("core", "pe"))
-    move_all("_cache*", op.join("core", "pe"))
+    # Leverage setup.py to build modules
+    sandbox.run_setup("setup.py", ["build_ext", "--inplace"])


 def build_normal():
@ -199,21 +129,23 @@ def build_normal():
    print("Building localizations")
    build_localizations()
    print("Building Qt stuff")
-    print_and_do(
-        "pyrcc5 {0} > {1}".format(op.join("qt", "dg.qrc"), op.join("qt", "dg_rc.py"))
-    )
-    fix_qt_resource_file(op.join("qt", "dg_rc.py"))
+    Path("qt", "dg_rc.py").unlink(missing_ok=True)
+    print_and_do("pyrcc5 {} > {}".format(Path("qt", "dg.qrc"), Path("qt", "dg_rc.py")))
+    fix_qt_resource_file(Path("qt", "dg_rc.py"))
    build_help()


 def main():
+    if sys.version_info < (3, 7):
+        sys.exit("Python < 3.7 is unsupported.")
    options = parse_args()
-    if options.clean:
-        if op.exists("build"):
-            shutil.rmtree("build")
-    if not op.exists("build"):
-        os.mkdir("build")
+    if options.clean and Path("build").exists():
+        shutil.rmtree("build")
+    if not Path("build").exists():
+        Path("build").mkdir()
    if options.doc:
+        build_one_help("en")
+    elif options.all_doc:
        build_help()
    elif options.loc:
        build_localizations()
@ -225,8 +157,6 @@ def main():
        build_normpo()
    elif options.modules:
        build_pe_modules()
-    elif options.importpo:
-        build_importpo()
    else:
        build_normal()

--- a/commitlint.config.js
+++ b/commitlint.config.js
@ -0,0 +1,17 @@
+const Configuration = {
+    /*
+     * Resolve and load @commitlint/config-conventional from node_modules.
+     * Referenced packages must be installed
+     */
+    extends: ['@commitlint/config-conventional'],
+    /*
+     * Any rules defined here will override rules from @commitlint/config-conventional
+     */
+    rules: {
+        'header-max-length': [2, 'always', 72],
+        'subject-case': [2, 'always', 'sentence-case'],
+        'scope-enum': [2, 'always'],
+    },
+};
+
+module.exports = Configuration;
--- a/core/init.py
+++ b/core/init.py
@ -1,2 +1,2 @@
-__version__ = "4.1.1"
+__version__ = "4.3.1"
 __appname__ = "dupeGuru"
--- a/core/app.py
+++ b/core/app.py
@ -4,37 +4,39 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

+import cProfile
+import datetime
 import os
 import os.path as op
 import logging
 import subprocess
 import re
 import shutil
+from pathlib import Path

 from send2trash import send2trash
 from hscommon.jobprogress import job
 from hscommon.notify import Broadcaster
-from hscommon.path import Path
 from hscommon.conflict import smart_move, smart_copy
 from hscommon.gui.progress_window import ProgressWindow
 from hscommon.util import delete_if_empty, first, escape, nonone, allsame
 from hscommon.trans import tr
 from hscommon import desktop

-from . import se, me, pe
-from .pe.photo import get_delta_dimensions
-from .util import cmp_value, fix_surrogate_encoding
-from . import directories, results, export, fs, prioritize
-from .ignore import IgnoreList
-from .exclude import ExcludeDict as ExcludeList
-from .scanner import ScanType
-from .gui.deletion_options import DeletionOptions
-from .gui.details_panel import DetailsPanel
-from .gui.directory_tree import DirectoryTree
-from .gui.ignore_list_dialog import IgnoreListDialog
-from .gui.exclude_list_dialog import ExcludeListDialogCore
-from .gui.problem_dialog import ProblemDialog
-from .gui.stats_label import StatsLabel
+from core import se, me, pe
+from core.pe.photo import get_delta_dimensions
+from core.util import cmp_value, fix_surrogate_encoding
+from core import directories, results, export, fs, prioritize
+from core.ignore import IgnoreList
+from core.exclude import ExcludeDict as ExcludeList
+from core.scanner import ScanType
+from core.gui.deletion_options import DeletionOptions
+from core.gui.details_panel import DetailsPanel
+from core.gui.directory_tree import DirectoryTree
+from core.gui.ignore_list_dialog import IgnoreListDialog
+from core.gui.exclude_list_dialog import ExcludeListDialogCore
+from core.gui.problem_dialog import ProblemDialog
+from core.gui.stats_label import StatsLabel

 HAD_FIRST_LAUNCH_PREFERENCE = "HadFirstLaunch"
 DEBUG_MODE_PREFERENCE = "DebugMode"
@ -48,31 +50,31 @@ MSG_MANY_FILES_TO_OPEN = tr(


 class DestType:
-    Direct = 0
-    Relative = 1
-    Absolute = 2
+    DIRECT = 0
+    RELATIVE = 1
+    ABSOLUTE = 2


 class JobType:
-    Scan = "job_scan"
-    Load = "job_load"
-    Move = "job_move"
-    Copy = "job_copy"
-    Delete = "job_delete"
+    SCAN = "job_scan"
+    LOAD = "job_load"
+    MOVE = "job_move"
+    COPY = "job_copy"
+    DELETE = "job_delete"


 class AppMode:
-    Standard = 0
-    Music = 1
-    Picture = 2
+    STANDARD = 0
+    MUSIC = 1
+    PICTURE = 2


 JOBID2TITLE = {
-    JobType.Scan: tr("Scanning for duplicates"),
-    JobType.Load: tr("Loading"),
-    JobType.Move: tr("Moving"),
-    JobType.Copy: tr("Copying"),
-    JobType.Delete: tr("Sending to Trash"),
+    JobType.SCAN: tr("Scanning for duplicates"),
+    JobType.LOAD: tr("Loading"),
+    JobType.MOVE: tr("Moving"),
+    JobType.COPY: tr("Copying"),
+    JobType.DELETE: tr("Sending to Trash"),
 }


@ -124,22 +126,20 @@ class DupeGuru(Broadcaster):

    NAME = PROMPT_NAME = "dupeGuru"

-    PICTURE_CACHE_TYPE = "sqlite"  # set to 'shelve' for a ShelveCache
-
-    def __init__(self, view):
+    def __init__(self, view, portable=False):
        if view.get_default(DEBUG_MODE_PREFERENCE):
            logging.getLogger().setLevel(logging.DEBUG)
            logging.debug("Debug mode enabled")
        Broadcaster.__init__(self)
        self.view = view
-        self.appdata = desktop.special_folder_path(
-            desktop.SpecialFolder.AppData, appname=self.NAME
-        )
+        self.appdata = desktop.special_folder_path(desktop.SpecialFolder.APPDATA, portable=portable)
        if not op.exists(self.appdata):
            os.makedirs(self.appdata)
-        self.app_mode = AppMode.Standard
+        self.app_mode = AppMode.STANDARD
        self.discarded_file_count = 0
        self.exclude_list = ExcludeList()
+        hash_cache_file = op.join(self.appdata, "hash_cache.db")
+        fs.filesdb.connect(hash_cache_file)
        self.directories = directories.Directories(self.exclude_list)
        self.results = results.Results(self)
        self.ignore_list = IgnoreList()
@ -150,8 +150,9 @@ class DupeGuru(Broadcaster):
            "escape_filter_regexp": True,
            "clean_empty_dirs": False,
            "ignore_hardlink_matches": False,
-            "copymove_dest_type": DestType.Relative,
-            "picture_cache_type": self.PICTURE_CACHE_TYPE,
+            "copymove_dest_type": DestType.RELATIVE,
+            "include_exists_check": True,
+            "rehash_ignore_mtime": False,
        }
        self.selected_dupes = []
        self.details_panel = DetailsPanel(self)
@ -171,9 +172,9 @@ class DupeGuru(Broadcaster):
    def _recreate_result_table(self):
        if self.result_table is not None:
            self.result_table.disconnect()
-        if self.app_mode == AppMode.Picture:
+        if self.app_mode == AppMode.PICTURE:
            self.result_table = pe.result_table.ResultTable(self)
-        elif self.app_mode == AppMode.Music:
+        elif self.app_mode == AppMode.MUSIC:
            self.result_table = me.result_table.ResultTable(self)
        else:
            self.result_table = se.result_table.ResultTable(self)
@ -181,24 +182,17 @@ class DupeGuru(Broadcaster):
        self.view.create_results_window()

    def _get_picture_cache_path(self):
-        cache_type = self.options["picture_cache_type"]
-        cache_name = (
-            "cached_pictures.shelve" if cache_type == "shelve" else "cached_pictures.db"
-        )
+        cache_name = "cached_pictures.db"
        return op.join(self.appdata, cache_name)

    def _get_dupe_sort_key(self, dupe, get_group, key, delta):
-        if self.app_mode in (AppMode.Music, AppMode.Picture):
-            if key == "folder_path":
-                dupe_folder_path = getattr(
-                    dupe, "display_folder_path", dupe.folder_path
-                )
-                return str(dupe_folder_path).lower()
-        if self.app_mode == AppMode.Picture:
-            if delta and key == "dimensions":
-                r = cmp_value(dupe, key)
-                ref_value = cmp_value(get_group().ref, key)
-                return get_delta_dimensions(r, ref_value)
+        if self.app_mode in (AppMode.MUSIC, AppMode.PICTURE) and key == "folder_path":
+            dupe_folder_path = getattr(dupe, "display_folder_path", dupe.folder_path)
+            return str(dupe_folder_path).lower()
+        if self.app_mode == AppMode.PICTURE and delta and key == "dimensions":
+            r = cmp_value(dupe, key)
+            ref_value = cmp_value(get_group().ref, key)
+            return get_delta_dimensions(r, ref_value)
        if key == "marked":
            return self.results.is_marked(dupe)
        if key == "percentage":
@ -218,12 +212,9 @@ class DupeGuru(Broadcaster):
        return result

    def _get_group_sort_key(self, group, key):
-        if self.app_mode in (AppMode.Music, AppMode.Picture):
-            if key == "folder_path":
-                dupe_folder_path = getattr(
-                    group.ref, "display_folder_path", group.ref.folder_path
-                )
-                return str(dupe_folder_path).lower()
+        if self.app_mode in (AppMode.MUSIC, AppMode.PICTURE) and key == "folder_path":
+            dupe_folder_path = getattr(group.ref, "display_folder_path", group.ref.folder_path)
+            return str(dupe_folder_path).lower()
        if key == "percentage":
            return group.percentage
        if key == "dupe_count":
@ -235,9 +226,7 @@ class DupeGuru(Broadcaster):
    def _do_delete(self, j, link_deleted, use_hardlinks, direct_deletion):
        def op(dupe):
            j.add_progress()
-            return self._do_delete_dupe(
-                dupe, link_deleted, use_hardlinks, direct_deletion
-            )
+            return self._do_delete_dupe(dupe, link_deleted, use_hardlinks, direct_deletion)

        j.start_job(self.results.mark_count)
        self.results.perform_on_marked(op, True)
@ -259,7 +248,7 @@ class DupeGuru(Broadcaster):
            ref = group.ref
            linkfunc = os.link if use_hardlinks else os.symlink
            linkfunc(str(ref.path), str_path)
-        self.clean_empty_dirs(dupe.path.parent())
+        self.clean_empty_dirs(dupe.path.parent)

    def _create_file(self, path):
        # We add fs.Folder to fileclasses in case the file we're loading contains folder paths.
@ -273,15 +262,11 @@ class DupeGuru(Broadcaster):
        try:
            f._read_all_info(attrnames=self.METADATA_TO_READ)
            return f
-        except EnvironmentError:
+        except OSError:
            return None

    def _get_export_data(self):
-        columns = [
-            col
-            for col in self.result_table.columns.ordered_columns
-            if col.visible and col.name != "marked"
-        ]
+        columns = [col for col in self.result_table._columns.ordered_columns if col.visible and col.name != "marked"]
        colnames = [col.display for col in columns]
        rows = []
        for group_id, group in enumerate(self.results.groups):
@ -293,11 +278,7 @@ class DupeGuru(Broadcaster):
        return colnames, rows

    def _results_changed(self):
-        self.selected_dupes = [
-            d
-            for d in self.selected_dupes
-            if self.results.get_group_of_duplicate(d) is not None
-        ]
+        self.selected_dupes = [d for d in self.selected_dupes if self.results.get_group_of_duplicate(d) is not None]
        self.notify("results_changed")

    def _start_job(self, jobid, func, args=()):
@ -312,34 +293,36 @@ class DupeGuru(Broadcaster):
            self.view.show_message(msg)

    def _job_completed(self, jobid):
-        if jobid == JobType.Scan:
+        if jobid == JobType.SCAN:
            self._results_changed()
+            fs.filesdb.commit()
            if not self.results.groups:
                self.view.show_message(tr("No duplicates found."))
            else:
                self.view.show_results_window()
-        if jobid in {JobType.Move, JobType.Delete}:
+        if jobid in {JobType.MOVE, JobType.DELETE}:
            self._results_changed()
-        if jobid == JobType.Load:
+        if jobid == JobType.LOAD:
            self._recreate_result_table()
            self._results_changed()
            self.view.show_results_window()
-        if jobid in {JobType.Copy, JobType.Move, JobType.Delete}:
+        if jobid in {JobType.COPY, JobType.MOVE, JobType.DELETE}:
            if self.results.problems:
                self.problem_dialog.refresh()
                self.view.show_problem_dialog()
            else:
-                msg = {
-                    JobType.Copy: tr("All marked files were copied successfully."),
-                    JobType.Move: tr("All marked files were moved successfully."),
-                    JobType.Delete: tr(
-                        "All marked files were successfully sent to Trash."
-                    ),
-                }[jobid]
+                if jobid == JobType.COPY:
+                    msg = tr("All marked files were copied successfully.")
+                elif jobid == JobType.MOVE:
+                    msg = tr("All marked files were moved successfully.")
+                elif jobid == JobType.DELETE and self.deletion_options.direct:
+                    msg = tr("All marked files were deleted successfully.")
+                else:
+                    msg = tr("All marked files were successfully sent to Trash.")
                self.view.show_message(msg)

    def _job_error(self, jobid, err):
-        if jobid == JobType.Load:
+        if jobid == JobType.LOAD:
            msg = tr("Could not load file: {}").format(err)
            self.view.show_message(msg)
            return False
@ -369,17 +352,17 @@ class DupeGuru(Broadcaster):

    # --- Protected
    def _get_fileclasses(self):
-        if self.app_mode == AppMode.Picture:
+        if self.app_mode == AppMode.PICTURE:
            return [pe.photo.PLAT_SPECIFIC_PHOTO_CLASS]
-        elif self.app_mode == AppMode.Music:
+        elif self.app_mode == AppMode.MUSIC:
            return [me.fs.MusicFile]
        else:
            return [se.fs.File]

    def _prioritization_categories(self):
-        if self.app_mode == AppMode.Picture:
+        if self.app_mode == AppMode.PICTURE:
            return pe.prioritize.all_categories()
-        elif self.app_mode == AppMode.Music:
+        elif self.app_mode == AppMode.MUSIC:
            return me.prioritize.all_categories()
        else:
            return prioritize.all_categories()
@ -401,41 +384,38 @@ class DupeGuru(Broadcaster):
            self.view.show_message(tr("'{}' does not exist.").format(d))

    def add_selected_to_ignore_list(self):
-        """Adds :attr:`selected_dupes` to :attr:`ignore_list`.
-        """
+        """Adds :attr:`selected_dupes` to :attr:`ignore_list`."""
        dupes = self.without_ref(self.selected_dupes)
        if not dupes:
            self.view.show_message(MSG_NO_SELECTED_DUPES)
            return
-        msg = tr(
-            "All selected %d matches are going to be ignored in all subsequent scans. Continue?"
-        )
+        msg = tr("All selected %d matches are going to be ignored in all subsequent scans. Continue?")
        if not self.view.ask_yes_no(msg % len(dupes)):
            return
        for dupe in dupes:
            g = self.results.get_group_of_duplicate(dupe)
            for other in g:
                if other is not dupe:
-                    self.ignore_list.Ignore(str(other.path), str(dupe.path))
+                    self.ignore_list.ignore(str(other.path), str(dupe.path))
        self.remove_duplicates(dupes)
        self.ignore_list_dialog.refresh()

-    def apply_filter(self, filter):
+    def apply_filter(self, result_filter):
        """Apply a filter ``filter`` to the results so that it shows only dupe groups that match it.

        :param str filter: filter to apply
        """
        self.results.apply_filter(None)
        if self.options["escape_filter_regexp"]:
-            filter = escape(filter, set("()[]\\.|+?^"))
-            filter = escape(filter, "*", ".")
-        self.results.apply_filter(filter)
+            result_filter = escape(result_filter, set("()[]\\.|+?^"))
+            result_filter = escape(result_filter, "*", ".")
+        self.results.apply_filter(result_filter)
        self._results_changed()

    def clean_empty_dirs(self, path):
        if self.options["clean_empty_dirs"]:
            while delete_if_empty(path, [".DS_Store"]):
-                path = path.parent()
+                path = path.parent

    def clear_picture_cache(self):
        try:
@ -443,27 +423,30 @@ class DupeGuru(Broadcaster):
        except FileNotFoundError:
            pass  # we don't care

+    def clear_hash_cache(self):
+        fs.filesdb.clear()
+
    def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType):
        source_path = dupe.path
-        location_path = first(p for p in self.directories if dupe.path in p)
+        location_path = first(p for p in self.directories if p in dupe.path.parents)
        dest_path = Path(destination)
-        if dest_type in {DestType.Relative, DestType.Absolute}:
+        if dest_type in {DestType.RELATIVE, DestType.ABSOLUTE}:
            # no filename, no windows drive letter
-            source_base = source_path.remove_drive_letter().parent()
-            if dest_type == DestType.Relative:
-                source_base = source_base[location_path:]
-            dest_path = dest_path[source_base]
+            source_base = source_path.relative_to(source_path.anchor).parent
+            if dest_type == DestType.RELATIVE:
+                source_base = source_base.relative_to(location_path.relative_to(location_path.anchor))
+            dest_path = dest_path.joinpath(source_base)
        if not dest_path.exists():
-            dest_path.makedirs()
+            dest_path.mkdir(parents=True)
        # Add filename to dest_path. For file move/copy, it's not required, but for folders, yes.
-        dest_path = dest_path[source_path.name]
+        dest_path = dest_path.joinpath(source_path.name)
        logging.debug("Copy/Move operation from '%s' to '%s'", source_path, dest_path)
        # Raises an EnvironmentError if there's a problem
        if copy:
            smart_copy(source_path, dest_path)
        else:
            smart_move(source_path, dest_path)
-            self.clean_empty_dirs(source_path.parent())
+            self.clean_empty_dirs(source_path.parent)

    def copy_or_move_marked(self, copy):
        """Start an async move (or copy) job on marked duplicates.
@ -483,16 +466,17 @@ class DupeGuru(Broadcaster):
            self.view.show_message(MSG_NO_MARKED_DUPES)
            return
        destination = self.view.select_dest_folder(
-            tr("Select a directory to copy marked files to") if copy
-            else tr("Select a directory to move marked files to"))
+            tr("Select a directory to copy marked files to")
+            if copy
+            else tr("Select a directory to move marked files to")
+        )
        if destination:
            desttype = self.options["copymove_dest_type"]
-            jobid = JobType.Copy if copy else JobType.Move
+            jobid = JobType.COPY if copy else JobType.MOVE
            self._start_job(jobid, do)

    def delete_marked(self):
-        """Start an async job to send marked duplicates to the trash.
-        """
+        """Start an async job to send marked duplicates to the trash."""
        if not self.results.mark_count:
            self.view.show_message(MSG_NO_MARKED_DUPES)
            return
@ -504,7 +488,7 @@ class DupeGuru(Broadcaster):
            self.deletion_options.direct,
        ]
        logging.debug("Starting deletion job with args %r", args)
-        self._start_job(JobType.Delete, self._do_delete, args=args)
+        self._start_job(JobType.DELETE, self._do_delete, args=args)

    def export_to_xhtml(self):
        """Export current results to XHTML.
@ -523,9 +507,7 @@ class DupeGuru(Broadcaster):
        The columns and their order in the resulting CSV file is determined in the same way as in
        :meth:`export_to_xhtml`.
        """
-        dest_file = self.view.select_dest_file(
-            tr("Select a destination for your exported CSV"), "csv"
-        )
+        dest_file = self.view.select_dest_file(tr("Select a destination for your exported CSV"), "csv")
        if dest_file:
            colnames, rows = self._get_export_data()
            try:
@ -542,9 +524,7 @@ class DupeGuru(Broadcaster):
        try:
            return dupe.get_display_info(group, delta)
        except Exception as e:
-            logging.warning(
-                "Exception (type: %s) on GetDisplayInfo for %s: %s",
-                type(e), str(dupe.path), str(e))
+            logging.warning("Exception (type: %s) on GetDisplayInfo for %s: %s", type(e), str(dupe.path), str(e))
            return empty_data()

    def invoke_custom_command(self):
@ -556,28 +536,32 @@ class DupeGuru(Broadcaster):
        """
        cmd = self.view.get_default("CustomCommand")
        if not cmd:
-            msg = tr(
-                "You have no custom command set up. Set it up in your preferences."
-            )
+            msg = tr("You have no custom command set up. Set it up in your preferences.")
            self.view.show_message(msg)
            return
        if not self.selected_dupes:
            return
-        dupe = self.selected_dupes[0]
-        group = self.results.get_group_of_duplicate(dupe)
-        ref = group.ref
-        cmd = cmd.replace("%d", str(dupe.path))
-        cmd = cmd.replace("%r", str(ref.path))
-        match = re.match(r'"([^"]+)"(.*)', cmd)
-        if match is not None:
-            # This code here is because subprocess. Popen doesn't seem to accept, under Windows,
-            # executable paths with spaces in it, *even* when they're enclosed in "". So this is
-            # a workaround to make the damn thing work.
-            exepath, args = match.groups()
-            path, exename = op.split(exepath)
-            subprocess.Popen(exename + args, shell=True, cwd=path)
-        else:
-            subprocess.Popen(cmd, shell=True)
+        dupes = self.selected_dupes
+        refs = [self.results.get_group_of_duplicate(dupe).ref for dupe in dupes]
+        for dupe, ref in zip(dupes, refs):
+            dupe_cmd = cmd.replace("%d", str(dupe.path))
+            dupe_cmd = dupe_cmd.replace("%r", str(ref.path))
+            match = re.match(r'"([^"]+)"(.*)', dupe_cmd)
+            if match is not None:
+                # This workaround exists because subprocess.Popen doesn't seem to accept, under Windows,
+                # executable paths with spaces in them, *even* when they're enclosed in "". So we split
+                # the executable's directory from its name and run the command from that directory.
+                exepath, args = match.groups()
+                path, exename = op.split(exepath)
+                p = subprocess.Popen(
+                    exename + args, shell=True, cwd=path, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+                )
+                output = p.stdout.read()
+                logging.info("Custom command %s %s: %s", exename, args, output)
+            else:
+                p = subprocess.Popen(dupe_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+                output = p.stdout.read()
+                logging.info("Custom command %s: %s", dupe_cmd, output)

    def load(self):
        """Load directory selection and ignore list from files in appdata.
@ -610,7 +594,7 @@ class DupeGuru(Broadcaster):
        def do(j):
            self.results.load_from_xml(filename, self._get_file, j)

-        self._start_job(JobType.Load, do)
+        self._start_job(JobType.LOAD, do)

    def make_selected_reference(self):
        """Promote :attr:`selected_dupes` to reference position within their respective groups.
@ -623,9 +607,8 @@ class DupeGuru(Broadcaster):
        changed_groups = set()
        for dupe in dupes:
            g = self.results.get_group_of_duplicate(dupe)
-            if g not in changed_groups:
-                if self.results.make_ref(dupe):
-                    changed_groups.add(g)
+            if g not in changed_groups and self.results.make_ref(dupe):
+                changed_groups.add(g)
        # It's not always obvious to users what this action does, so to make it a bit clearer,
        # we change our selection to the ref of all changed groups. However, we also want to keep
        # the files that were ref before and weren't changed by the action. In effect, what this
@ -634,9 +617,7 @@ class DupeGuru(Broadcaster):
        if not self.result_table.power_marker:
            if changed_groups:
                self.selected_dupes = [
-                    d
-                    for d in self.selected_dupes
-                    if self.results.get_group_of_duplicate(d).ref is d
+                    d for d in self.selected_dupes if self.results.get_group_of_duplicate(d).ref is d
                ]
            self.notify("results_changed")
        else:
@ -648,20 +629,17 @@ class DupeGuru(Broadcaster):
            self.notify("results_changed_but_keep_selection")

    def mark_all(self):
-        """Set all dupes in the results as marked.
-        """
+        """Set all dupes in the results as marked."""
        self.results.mark_all()
        self.notify("marking_changed")

    def mark_none(self):
-        """Set all dupes in the results as unmarked.
-        """
+        """Set all dupes in the results as unmarked."""
        self.results.mark_none()
        self.notify("marking_changed")

    def mark_invert(self):
-        """Invert the marked state of all dupes in the results.
-        """
+        """Invert the marked state of all dupes in the results."""
        self.results.mark_invert()
        self.notify("marking_changed")

@ -679,18 +657,15 @@ class DupeGuru(Broadcaster):
        self.notify("marking_changed")

    def open_selected(self):
-        """Open :attr:`selected_dupes` with their associated application.
-        """
-        if len(self.selected_dupes) > 10:
-            if not self.view.ask_yes_no(MSG_MANY_FILES_TO_OPEN):
-                return
+        """Open :attr:`selected_dupes` with their associated application."""
+        if len(self.selected_dupes) > 10 and not self.view.ask_yes_no(MSG_MANY_FILES_TO_OPEN):
+            return
        for dupe in self.selected_dupes:
            desktop.open_path(dupe.path)

    def purge_ignore_list(self):
-        """Remove files that don't exist from :attr:`ignore_list`.
-        """
-        self.ignore_list.Filter(lambda f, s: op.exists(f) and op.exists(s))
+        """Remove files that don't exist from :attr:`ignore_list`."""
+        self.ignore_list.filter(lambda f, s: op.exists(f) and op.exists(s))
        self.ignore_list_dialog.refresh()

    def remove_directories(self, indexes):
@ -719,8 +694,7 @@ class DupeGuru(Broadcaster):
        self.notify("results_changed_but_keep_selection")

    def remove_marked(self):
-        """Removed marked duplicates from the results (without touching the files themselves).
-        """
+        """Remove marked duplicates from the results (without touching the files themselves)."""
        if not self.results.mark_count:
            self.view.show_message(MSG_NO_MARKED_DUPES)
            return
@ -731,8 +705,7 @@ class DupeGuru(Broadcaster):
        self._results_changed()

    def remove_selected(self):
-        """Removed :attr:`selected_dupes` from the results (without touching the files themselves).
-        """
+        """Remove :attr:`selected_dupes` from the results (without touching the files themselves)."""
        dupes = self.without_ref(self.selected_dupes)
        if not dupes:
            self.view.show_message(MSG_NO_SELECTED_DUPES)
@ -770,10 +743,10 @@ class DupeGuru(Broadcaster):
        for group in self.results.groups:
            if group.prioritize(key_func=sort_key):
                count += 1
+        if count:
+            self.results.refresh_required = True
        self._results_changed()
-        msg = tr("{} duplicate groups were changed by the re-prioritization.").format(
-            count
-        )
+        msg = tr("{} duplicate groups were changed by the re-prioritization.").format(count)
        self.view.show_message(msg)

    def reveal_selected(self):
@ -790,6 +763,9 @@ class DupeGuru(Broadcaster):
        self.exclude_list.save_to_xml(p)
        self.notify("save_session")

+    def close(self):
+        fs.filesdb.close()
+
    def save_as(self, filename):
        """Save results in ``filename``.

@ -810,44 +786,45 @@ class DupeGuru(Broadcaster):
        except OSError as e:
            self.view.show_message(tr("Couldn't write to file: {}").format(str(e)))

-    def start_scanning(self):
+    def start_scanning(self, profile_scan=False):
        """Starts an async job to scan for duplicates.

        Scans folders selected in :attr:`directories` and put the results in :attr:`results`
        """
        scanner = self.SCANNER_CLASS()
+        fs.filesdb.ignore_mtime = self.options["rehash_ignore_mtime"] is True
        if not self.directories.has_any_file():
-            self.view.show_message(
-                tr("The selected directories contain no scannable file.")
-            )
+            self.view.show_message(tr("The selected directories contain no scannable file."))
            return
        # Send relevant options down to the scanner instance
        for k, v in self.options.items():
            if hasattr(scanner, k):
                setattr(scanner, k, v)
-        if self.app_mode == AppMode.Picture:
+        if self.app_mode == AppMode.PICTURE:
            scanner.cache_path = self._get_picture_cache_path()
        self.results.groups = []
        self._recreate_result_table()
        self._results_changed()

        def do(j):
+            if profile_scan:
+                pr = cProfile.Profile()
+                pr.enable()
            j.set_progress(0, tr("Collecting files to scan"))
-            if scanner.scan_type == ScanType.Folders:
-                files = list(
-                    self.directories.get_folders(folderclass=se.fs.Folder, j=j)
-                )
+            if scanner.scan_type == ScanType.FOLDERS:
+                files = list(self.directories.get_folders(folderclass=se.fs.Folder, j=j))
            else:
-                files = list(
-                    self.directories.get_files(fileclasses=self.fileclasses, j=j)
-                )
+                files = list(self.directories.get_files(fileclasses=self.fileclasses, j=j))
            if self.options["ignore_hardlink_matches"]:
                files = self._remove_hardlink_dupes(files)
            logging.info("Scanning %d files" % len(files))
            self.results.groups = scanner.get_dupe_groups(files, self.ignore_list, j)
            self.discarded_file_count = scanner.discarded_file_count
+            if profile_scan:
+                pr.disable()
+                pr.dump_stats(op.join(self.appdata, f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.profile"))

-        self._start_job(JobType.Scan, do)
+        self._start_job(JobType.SCAN, do)

    def toggle_selected_mark_state(self):
        selected = self.without_ref(self.selected_dupes)
@ -862,13 +839,8 @@ class DupeGuru(Broadcaster):
        self.notify("marking_changed")

    def without_ref(self, dupes):
-        """Returns ``dupes`` with all reference elements removed.
-        """
-        return [
-            dupe
-            for dupe in dupes
-            if self.results.get_group_of_duplicate(dupe).ref is not dupe
-        ]
+        """Returns ``dupes`` with all reference elements removed."""
+        return [dupe for dupe in dupes if self.results.get_group_of_duplicate(dupe).ref is not dupe]

    def get_default(self, key, fallback_value=None):
        result = nonone(self.view.get_default(key), fallback_value)
@ -897,18 +869,18 @@ class DupeGuru(Broadcaster):

    @property
    def SCANNER_CLASS(self):
-        if self.app_mode == AppMode.Picture:
+        if self.app_mode == AppMode.PICTURE:
            return pe.scanner.ScannerPE
-        elif self.app_mode == AppMode.Music:
+        elif self.app_mode == AppMode.MUSIC:
            return me.scanner.ScannerME
        else:
            return se.scanner.ScannerSE

    @property
    def METADATA_TO_READ(self):
-        if self.app_mode == AppMode.Picture:
+        if self.app_mode == AppMode.PICTURE:
            return ["size", "mtime", "dimensions", "exif_timestamp"]
-        elif self.app_mode == AppMode.Music:
+        elif self.app_mode == AppMode.MUSIC:
            return [
                "size",
                "mtime",
--- a/core/directories.py
+++ b/core/directories.py
@ -7,12 +7,13 @@
 import os
 from xml.etree import ElementTree as ET
 import logging
+from pathlib import Path

 from hscommon.jobprogress import job
-from hscommon.path import Path
 from hscommon.util import FileOrPath
+from hscommon.trans import tr

-from . import fs
+from core import fs

 __all__ = [
    "Directories",
@ -30,9 +31,9 @@ class DirectoryState:
    * DirectoryState.Excluded: Don't scan this folder
    """

-    Normal = 0
-    Reference = 1
-    Excluded = 2
+    NORMAL = 0
+    REFERENCE = 1
+    EXCLUDED = 2


 class AlreadyThereError(Exception):
@ -62,7 +63,7 @@ class Directories:

    def __contains__(self, path):
        for p in self._dirs:
-            if path in p:
+            if path == p or p in path.parents:
                return True
        return False

@ -82,76 +83,65 @@ class Directories:
            # We iterate even if we only have one item here
            for denied_path_re in self._exclude_list.compiled:
                if denied_path_re.match(str(path.name)):
-                    return DirectoryState.Excluded
-            # return # We still use the old logic to force state on hidden dirs
+                    return DirectoryState.EXCLUDED
+            return DirectoryState.NORMAL
        # Override this in subclasses to specify the state of some special folders.
        if path.name.startswith("."):
-            return DirectoryState.Excluded
+            return DirectoryState.EXCLUDED
+        return DirectoryState.NORMAL

    def _get_files(self, from_path, fileclasses, j):
-        for root, dirs, files in os.walk(str(from_path)):
-            j.check_if_cancelled()
-            rootPath = Path(root)
-            state = self.get_state(rootPath)
-            if state == DirectoryState.Excluded:
-                # Recursively get files from folders with lots of subfolder is expensive. However, there
-                # might be a subfolder in this path that is not excluded. What we want to do is to skim
-                # through self.states and see if we must continue, or we can stop right here to save time
-                if not any(p[: len(rootPath)] == rootPath for p in self.states):
-                    del dirs[:]
-            try:
-                if state != DirectoryState.Excluded:
-                    # Old logic
-                    if self._exclude_list is None or not self._exclude_list.mark_count:
-                        found_files = [fs.get_file(rootPath + f, fileclasses=fileclasses) for f in files]
-                    else:
-                        found_files = []
-                        # print(f"len of files: {len(files)} {files}")
-                        for f in files:
-                            found = False
-                            for expr in self._exclude_list.compiled_files:
-                                if expr.match(f):
-                                    found = True
-                                    break
-                            if not found:
-                                for expr in self._exclude_list.compiled_paths:
-                                    if expr.match(root + os.sep + f):
-                                        found = True
-                                        break
-                            if not found:
-                                found_files.append(fs.get_file(rootPath + f, fileclasses=fileclasses))
-                    found_files = [f for f in found_files if f is not None]
-                    # In some cases, directories can be considered as files by dupeGuru, which is
-                    # why we have this line below. In fact, there only one case: Bundle files under
-                    # OS X... In other situations, this forloop will do nothing.
-                    for d in dirs[:]:
-                        f = fs.get_file(rootPath + d, fileclasses=fileclasses)
-                        if f is not None:
-                            found_files.append(f)
-                            dirs.remove(d)
-                    logging.debug(
-                        "Collected %d files in folder %s",
-                        len(found_files),
-                        str(rootPath),
-                    )
-                    for file in found_files:
-                        file.is_ref = state == DirectoryState.Reference
-                        yield file
-            except (EnvironmentError, fs.InvalidPath):
-                pass
+        """Yield the files found under ``from_path``, recursing into its subdirectories.
+
+        :param from_path: directory to list; handed to ``os.scandir`` and ``Path``.
+        :param fileclasses: forwarded to ``fs.get_file`` for every candidate entry.
+        :param j: job object; ``check_if_cancelled()`` is called once per directory entry.
+
+        Files directly inside an excluded directory are not yielded, and entries
+        rejected by the exclude list or for which ``fs.get_file`` returns a falsy
+        value are skipped. ``OSError``/``fs.InvalidPath`` raised for a single entry,
+        and ``OSError`` raised while listing the directory, are swallowed.
+        """
+        try:
+            # "entries" rather than "iter": do not shadow the builtin
+            with os.scandir(from_path) as entries:
+                root_path = Path(from_path)
+                state = self.get_state(root_path)
+                # if we have no un-excluded dirs under this directory skip going deeper
+                skip_dirs = state == DirectoryState.EXCLUDED and not any(
+                    p.parts[: len(root_path.parts)] == root_path.parts for p in self.states
+                )
+                count = 0
+                for item in entries:
+                    j.check_if_cancelled()
+                    try:
+                        if item.is_dir():
+                            if skip_dirs:
+                                continue
+                            yield from self._get_files(item.path, fileclasses, j)
+                            continue
+                        elif state == DirectoryState.EXCLUDED:
+                            continue
+                        # File excluding or not
+                        if (
+                            self._exclude_list is None
+                            or not self._exclude_list.mark_count
+                            or not self._exclude_list.is_excluded(str(from_path), item.name)
+                        ):
+                            file = fs.get_file(item, fileclasses=fileclasses)
+                            if file:
+                                file.is_ref = state == DirectoryState.REFERENCE
+                                count += 1
+                                yield file
+                    except (OSError, fs.InvalidPath):
+                        pass
+                logging.debug(
+                    "Collected %d files in folder %s",
+                    count,
+                    str(root_path),
+                )
+        except OSError:
+            pass

    def _get_folders(self, from_folder, j):
        j.check_if_cancelled()
        try:
            for subfolder in from_folder.subfolders:
-                for folder in self._get_folders(subfolder, j):
-                    yield folder
+                yield from self._get_folders(subfolder, j)
            state = self.get_state(from_folder.path)
-            if state != DirectoryState.Excluded:
-                from_folder.is_ref = state == DirectoryState.Reference
+            if state != DirectoryState.EXCLUDED:
+                from_folder.is_ref = state == DirectoryState.REFERENCE
                logging.debug("Yielding Folder %r state: %d", from_folder, state)
                yield from_folder
-        except (EnvironmentError, fs.InvalidPath):
+        except (OSError, fs.InvalidPath):
            pass

    # ---Public
@ -169,7 +159,7 @@ class Directories:
            raise AlreadyThereError()
        if not path.exists():
            raise InvalidPathError()
-        self._dirs = [p for p in self._dirs if p not in path]
+        self._dirs = [p for p in self._dirs if path not in p.parents]
        self._dirs.append(path)

    @staticmethod
@ -180,10 +170,10 @@ class Directories:
        :rtype: list of Path
        """
        try:
-            subpaths = [p for p in path.listdir() if p.isdir()]
+            subpaths = [p for p in path.glob("*") if p.is_dir()]
            subpaths.sort(key=lambda x: x.name.lower())
            return subpaths
-        except EnvironmentError:
+        except OSError:
            return []

    def get_files(self, fileclasses=None, j=job.nulljob):
@ -193,8 +183,12 @@ class Directories:
        """
        if fileclasses is None:
            fileclasses = [fs.File]
+        file_count = 0
        for path in self._dirs:
            for file in self._get_files(path, fileclasses=fileclasses, j=j):
+                file_count += 1
+                if not isinstance(j, job.NullJob):
+                    j.set_progress(-1, tr("Collected {} files to scan").format(file_count))
                yield file

    def get_folders(self, folderclass=None, j=job.nulljob):
@ -204,9 +198,13 @@ class Directories:
        """
        if folderclass is None:
            folderclass = fs.Folder
+        folder_count = 0
        for path in self._dirs:
            from_folder = folderclass(path)
            for folder in self._get_folders(from_folder, j):
+                folder_count += 1
+                if not isinstance(j, job.NullJob):
+                    j.set_progress(-1, tr("Collected {} folders to scan").format(folder_count))
                yield folder

    def get_state(self, path):
@ -217,19 +215,16 @@ class Directories:
        # direct match? easy result.
        if path in self.states:
            return self.states[path]
-        state = self._default_state_for_path(path) or DirectoryState.Normal
+        state = self._default_state_for_path(path)
        # Save non-default states in cache, necessary for _get_files()
-        if state != DirectoryState.Normal:
+        if state != DirectoryState.NORMAL:
            self.states[path] = state
            return state
-
-        prevlen = 0
-        # we loop through the states to find the longest matching prefix
-        # if the parent has a state in cache, return that state
-        for p, s in self.states.items():
-            if p.is_parent_of(path) and len(p) > prevlen:
-                prevlen = len(p)
-                state = s
+        # find the longest parent path that is in states and return that state if found
+        # NOTE: path.parents is ordered longest to shortest
+        for parent_path in path.parents:
+            if parent_path in self.states:
+                return self.states[parent_path]
        return state

    def has_any_file(self):
@ -298,6 +293,6 @@ class Directories:
        if self.get_state(path) == state:
            return
        for iter_path in list(self.states.keys()):
-            if path.is_parent_of(iter_path):
+            if path in iter_path.parents:
                del self.states[iter_path]
        self.states[path] = state
--- a/core/engine.py
+++ b/core/engine.py
@ -17,17 +17,31 @@ from hscommon.util import flatten, multi_replace
 from hscommon.trans import tr
 from hscommon.jobprogress import job

-(WEIGHT_WORDS, MATCH_SIMILAR_WORDS, NO_FIELD_ORDER,) = range(3)
+(
+    WEIGHT_WORDS,
+    MATCH_SIMILAR_WORDS,
+    NO_FIELD_ORDER,
+) = range(3)

 JOB_REFRESH_RATE = 100
+PROGRESS_MESSAGE = tr("%d matches found from %d groups")


 def getwords(s):
+    """Return the list of lowercase words found in ``s``.
+
+    ``s`` is NFD-normalized, the punctuation characters listed below are replaced
+    with spaces and the result is lowercased. Characters up to U+037E that are not
+    ASCII letters, digits or whitespace are then dropped, while characters above
+    U+037E are kept as is. The string is finally split on single spaces and empty
+    pieces are discarded.
+    """
    # We decompose the string so that ascii letters with accents can be part of the word.
    s = normalize("NFD", s)
    s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", " ").lower()
+    # logging.debug(f"DEBUG chars for: {s}\n"
+    #               f"{[c for c in s if ord(c) != 32]}\n"
+    #               f"{[ord(c) for c in s if ord(c) != 32]}")
+    # HACK We shouldn't ignore non-ascii characters altogether. Any Unicode char
+    # above common European characters that cannot be "sanitized" (i.e. stripped
+    # of their accents, etc.) is preserved as is. The arbitrary limit is
+    # ord("\u037e") == 894, GREEK QUESTION MARK.
+    # Every allowed character is ASCII (ord <= 127), so "c in allowed" already
+    # implies ord(c) <= 894 and no separate lower-bound test is needed.
+    allowed = string.ascii_letters + string.digits + string.whitespace
    s = "".join(
-        c for c in s if c in string.ascii_letters + string.digits + string.whitespace
+        c for c in s if c in allowed or ord(c) > 894
    )
    return [_f for _f in s.split(" ") if _f]  # remove empty elements

@ -93,20 +107,18 @@ def compare_fields(first, second, flags=()):
        # We don't want to remove field directly in the list. We must work on a copy.
        second = second[:]
        for field1 in first:
-            max = 0
+            max_score = 0
            matched_field = None
            for field2 in second:
                r = compare(field1, field2, flags)
-                if r > max:
-                    max = r
+                if r > max_score:
+                    max_score = r
                    matched_field = field2
-            results.append(max)
+            results.append(max_score)
            if matched_field:
                second.remove(matched_field)
    else:
-        results = [
-            compare(field1, field2, flags) for field1, field2 in zip(first, second)
-        ]
+        results = [compare(field1, field2, flags) for field1, field2 in zip(first, second)]
    return min(results) if results else 0


@ -119,9 +131,7 @@ def build_word_dict(objects, j=job.nulljob):
    The result will be a dict with words as keys, lists of objects as values.
    """
    result = defaultdict(set)
-    for object in j.iter_with_progress(
-        objects, "Prepared %d/%d files", JOB_REFRESH_RATE
-    ):
+    for object in j.iter_with_progress(objects, "Prepared %d/%d files", JOB_REFRESH_RATE):
        for word in unpack_fields(object.words):
            result[word].add(object)
    return result
@ -156,9 +166,7 @@ def reduce_common_words(word_dict, threshold):
    The exception to this removal are the objects where all the words of the object are common.
    Because if we remove them, we will miss some duplicates!
    """
-    uncommon_words = set(
-        word for word, objects in word_dict.items() if len(objects) < threshold
-    )
+    uncommon_words = {word for word, objects in word_dict.items() if len(objects) < threshold}
    for word, objects in list(word_dict.items()):
        if len(objects) < threshold:
            continue
@ -241,10 +249,11 @@ def getmatches(
        match_flags.append(MATCH_SIMILAR_WORDS)
    if no_field_order:
        match_flags.append(NO_FIELD_ORDER)
-    j.start_job(len(word_dict), tr("0 matches found"))
+    j.start_job(len(word_dict), PROGRESS_MESSAGE % (0, 0))
    compared = defaultdict(set)
    result = []
    try:
+        word_count = 0
        # This whole 'popping' thing is there to avoid taking too much memory at the same time.
        while word_dict:
            items = word_dict.popitem()[1]
@ -259,41 +268,51 @@ def getmatches(
                        result.append(m)
                        if len(result) >= LIMIT:
                            return result
-            j.add_progress(desc=tr("%d matches found") % len(result))
+            word_count += 1
+            j.add_progress(desc=PROGRESS_MESSAGE % (len(result), word_count))
    except MemoryError:
        # This is the place where the memory usage is at its peak during the scan.
        # Just continue the process with an incomplete list of matches.
        del compared  # This should give us enough room to call logging.
-        logging.warning(
-            "Memory Overflow. Matches: %d. Word dict: %d"
-            % (len(result), len(word_dict))
-        )
+        logging.warning("Memory Overflow. Matches: %d. Word dict: %d" % (len(result), len(word_dict)))
        return result
    return result


-def getmatches_by_contents(files, j=job.nulljob):
+def getmatches_by_contents(files, bigsize=0, j=job.nulljob):
    """Returns a list of :class:`Match` within ``files`` if their contents is the same.

+    :param bigsize: The size in bytes over which we consider files big enough to
+                    justify taking samples of the file for hashing. If 0, compute digest as usual.
    :param j: A :ref:`job progress instance <jobs>`.
    """
    size2files = defaultdict(set)
    for f in files:
-        if f.size:
-            size2files[f.size].add(f)
+        size2files[f.size].add(f)
    del files
    possible_matches = [files for files in size2files.values() if len(files) > 1]
    del size2files
    result = []
-    j.start_job(len(possible_matches), tr("0 matches found"))
+    j.start_job(len(possible_matches), PROGRESS_MESSAGE % (0, 0))
+    group_count = 0
    for group in possible_matches:
        for first, second in itertools.combinations(group, 2):
            if first.is_ref and second.is_ref:
                continue  # Don't spend time comparing two ref pics together.
-            if first.md5partial == second.md5partial:
-                if first.md5 == second.md5:
-                    result.append(Match(first, second, 100))
-        j.add_progress(desc=tr("%d matches found") % len(result))
+            if first.size == 0 and second.size == 0:
+                # skip hashing for zero length files
+                result.append(Match(first, second, 100))
+                continue
+            # if digests are the same (and not None) then files match
+            if first.digest_partial is not None and first.digest_partial == second.digest_partial:
+                if bigsize > 0 and first.size > bigsize:
+                    if first.digest_samples is not None and first.digest_samples == second.digest_samples:
+                        result.append(Match(first, second, 100))
+                else:
+                    if first.digest is not None and first.digest == second.digest:
+                        result.append(Match(first, second, 100))
+        group_count += 1
+        j.add_progress(desc=PROGRESS_MESSAGE % (len(result), group_count))
    return result


@ -391,18 +410,13 @@ class Group:

        You can call this after the duplicate scanning process to free a bit of memory.
        """
-        discarded = set(
-            m
-            for m in self.matches
-            if not all(obj in self.unordered for obj in [m.first, m.second])
-        )
+        discarded = {m for m in self.matches if not all(obj in self.unordered for obj in [m.first, m.second])}
        self.matches -= discarded
        self.candidates = defaultdict(set)
        return discarded

    def get_match_of(self, item):
-        """Returns the match pair between ``item`` and :attr:`ref`.
-        """
+        """Returns the match pair between ``item`` and :attr:`ref`."""
        if item is self.ref:
            return
        for m in self._get_matches_for_ref():
@ -418,8 +432,7 @@ class Group:
        """
        # tie_breaker(ref, dupe) --> True if dupe should be ref
        # Returns True if anything changed during prioritization.
-        master_key_func = lambda x: (-x.is_ref, key_func(x))
-        new_order = sorted(self.ordered, key=master_key_func)
+        new_order = sorted(self.ordered, key=lambda x: (-x.is_ref, key_func(x)))
        changed = new_order != self.ordered
        self.ordered = new_order
        if tie_breaker is None:
@ -442,19 +455,16 @@ class Group:
            self.unordered.remove(item)
            self._percentage = None
            self._matches_for_ref = None
-            if (len(self) > 1) and any(
-                not getattr(item, "is_ref", False) for item in self
-            ):
+            if (len(self) > 1) and any(not getattr(item, "is_ref", False) for item in self):
                if discard_matches:
-                    self.matches = set(m for m in self.matches if item not in m)
+                    self.matches = {m for m in self.matches if item not in m}
            else:
                self._clear()
        except ValueError:
            pass

    def switch_ref(self, with_dupe):
-        """Make the :attr:`ref` dupe of the group switch position with ``with_dupe``.
-        """
+        """Make the :attr:`ref` dupe of the group switch position with ``with_dupe``."""
        if self.ref.is_ref:
            return False
        try:
@ -473,9 +483,7 @@ class Group:
        if self._percentage is None:
            if self.dupes:
                matches = self._get_matches_for_ref()
-                self._percentage = sum(match.percentage for match in matches) // len(
-                    matches
-                )
+                self._percentage = sum(match.percentage for match in matches) // len(matches)
            else:
                self._percentage = 0
        return self._percentage
@ -522,7 +530,7 @@ def get_groups(matches):
        del dupe2group
        del matches
        # should free enough memory to continue
-        logging.warning("Memory Overflow. Groups: {0}".format(len(groups)))
+        logging.warning(f"Memory Overflow. Groups: {len(groups)}")
    # Now that we have a group, we have to discard groups' matches and see if there're any "orphan"
    # matches, that is, matches that were candidate in a group but that none of their 2 files were
    # accepted in the group. With these orphan groups, it's safe to build additional groups
@ -530,12 +538,8 @@ def get_groups(matches):
    orphan_matches = []
    for group in groups:
        orphan_matches += {
-            m
-            for m in group.discard_matches()
-            if not any(obj in matched_files for obj in [m.first, m.second])
+            m for m in group.discard_matches() if not any(obj in matched_files for obj in [m.first, m.second])
        }
    if groups and orphan_matches:
-        groups += get_groups(
-            orphan_matches
-        )  # no job, as it isn't supposed to take a long time
+        groups += get_groups(orphan_matches)  # no job, as it isn't supposed to take a long time
    return groups
--- a/core/exclude.py
+++ b/core/exclude.py
@ -2,8 +2,9 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-from .markable import Markable
+from core.markable import Markable
 from xml.etree import ElementTree as ET
+
 # TODO: perhaps use regex module for better Unicode support? https://pypi.org/project/regex/
 # also https://pypi.org/project/re2/
 # TODO update the Result list with newly added regexes if possible
@ -15,13 +16,14 @@ from hscommon.util import FileOrPath
 from hscommon.plat import ISWINDOWS
 import time

-default_regexes = [r"^thumbs\.db$",  # Obsolete after WindowsXP
-                   r"^desktop\.ini$",  # Windows metadata
-                   r"^\.DS_Store$",  # MacOS metadata
-                   r"^\.Trash\-.*",  # Linux trash directories
-                   r"^\$Recycle\.Bin$",  # Windows
-                   r"^\..*",  # Hidden files on Unix-like
-                   ]
+default_regexes = [
+    r"^thumbs\.db$",  # Obsolete after WindowsXP
+    r"^desktop\.ini$",  # Windows metadata
+    r"^\.DS_Store$",  # MacOS metadata
+    r"^\.Trash\-.*",  # Linux trash directories
+    r"^\$Recycle\.Bin$",  # Windows
+    r"^\..*",  # Hidden files on Unix-like
+]
 # These are too broad
 forbidden_regexes = [r".*", r"\/.*", r".*\/.*", r".*\\\\.*", r".*\..*"]

@ -34,6 +36,7 @@ def timer(func):
        end = time.perf_counter_ns()
        print(f"DEBUG: func {func.__name__!r} took {end - start} ns.")
        return value
+
    return wrapper_timer


@ -45,11 +48,13 @@ def memoize(func):
        if args not in func.cache:
            func.cache[args] = func(*args)
        return func.cache[args]
+
    return _memoize


 class AlreadyThereException(Exception):
    """Expression already in the list"""
+
    def __init__(self, arg="Expression is already in excluded list."):
        super().__init__(arg)

@ -81,7 +86,7 @@ class ExcludeList(Markable):
            yield self.is_marked(regex), regex

    def __contains__(self, item):
-        return self.isExcluded(item)
+        return self.has_entry(item)

    def __len__(self):
        """Returns the total number of regexes regardless of mark status."""
@ -145,10 +150,7 @@ class ExcludeList(Markable):
    # @timer
    @memoize
    def _do_compile(self, expr):
-        try:
-            return re.compile(expr)
-        except Exception as e:
-            raise(e)
+        return re.compile(expr)

    # @timer
    # @memoize  # probably not worth memoizing this one if we memoize the above
@ -169,11 +171,11 @@ class ExcludeList(Markable):

    def build_compiled_caches(self, union=False):
        if not union:
-            self._cached_compiled_files =\
-                [x for x in self._excluded_compiled if not has_sep(x.pattern)]
-            self._cached_compiled_paths =\
-                [x for x in self._excluded_compiled if has_sep(x.pattern)]
+            self._cached_compiled_files = [x for x in self._excluded_compiled if not has_sep(x.pattern)]
+            self._cached_compiled_paths = [x for x in self._excluded_compiled if has_sep(x.pattern)]
+            self._dirty = False
            return
+
        marked_count = [x for marked, x in self if marked]
        # If there is no item, the compiled Pattern will be '' and match everything!
        if not marked_count:
@ -183,28 +185,25 @@ class ExcludeList(Markable):
        else:
            # HACK returned as a tuple to get a free iterator and keep interface
            # the same regardless of whether the client asked for union or not
-            self._cached_compiled_union_all =\
-                (re.compile('|'.join(marked_count)),)
+            self._cached_compiled_union_all = (re.compile("|".join(marked_count)),)
            files_marked = [x for x in marked_count if not has_sep(x)]
            if not files_marked:
                self._cached_compiled_union_files = tuple()
            else:
-                self._cached_compiled_union_files =\
-                    (re.compile('|'.join(files_marked)),)
+                self._cached_compiled_union_files = (re.compile("|".join(files_marked)),)
            paths_marked = [x for x in marked_count if has_sep(x)]
            if not paths_marked:
                self._cached_compiled_union_paths = tuple()
            else:
-                self._cached_compiled_union_paths =\
-                    (re.compile('|'.join(paths_marked)),)
+                self._cached_compiled_union_paths = (re.compile("|".join(paths_marked)),)
+        self._dirty = False

    @property
    def compiled(self):
        """Should be used by other classes to retrieve the up-to-date list of patterns."""
        if self._use_union:
            if self._dirty:
-                self.build_compiled_caches(True)
-                self._dirty = False
+                self.build_compiled_caches(self._use_union)
            return self._cached_compiled_union_all
        return self._excluded_compiled

@ -215,29 +214,25 @@ class ExcludeList(Markable):
        The interface should be expected to be a generator, even if it returns only
        one item (one Pattern in the union case)."""
        if self._dirty:
-            self.build_compiled_caches(True if self._use_union else False)
-            self._dirty = False
-        return self._cached_compiled_union_files if self._use_union\
-            else self._cached_compiled_files
+            self.build_compiled_caches(self._use_union)
+        return self._cached_compiled_union_files if self._use_union else self._cached_compiled_files

    @property
    def compiled_paths(self):
        """Returns patterns with only separators in them, for more precise filtering."""
        if self._dirty:
-            self.build_compiled_caches(True if self._use_union else False)
-            self._dirty = False
-        return self._cached_compiled_union_paths if self._use_union\
-            else self._cached_compiled_paths
+            self.build_compiled_caches(self._use_union)
+        return self._cached_compiled_union_paths if self._use_union else self._cached_compiled_paths

    # ---Public
    def add(self, regex, forced=False):
        """This interface should throw exceptions if there is an error during
        regex compilation"""
-        if self.isExcluded(regex):
+        if self.has_entry(regex):
            # This exception should never be ignored
            raise AlreadyThereException()
        if regex in forbidden_regexes:
-            raise Exception("Forbidden (dangerous) expression.")
+            raise ValueError("Forbidden (dangerous) expression.")

        iscompilable, exception, compiled = self.compile_re(regex)
        if not iscompilable and not forced:
@ -256,12 +251,27 @@ class ExcludeList(Markable):
        """Returns the number of marked regexes only."""
        return len([x for marked, x in self if marked])

-    def isExcluded(self, regex):
+    def has_entry(self, regex):
        for item in self._excluded:
            if regex == item[0]:
                return True
        return False

+    def is_excluded(self, dirname, filename):
+        """Tell whether a file must be filtered out.
+
+        ``filename`` is first tested against the patterns in ``compiled_files``.
+        Only when none of them matches is ``dirname + sep + filename`` tested
+        against the patterns in ``compiled_paths``. A pattern has to match the
+        whole string (``fullmatch``). Returns True on the first match found,
+        False otherwise.
+        """
+        if any(pattern.fullmatch(filename) for pattern in self.compiled_files):
+            return True
+        return any(pattern.fullmatch(dirname + sep + filename) for pattern in self.compiled_paths)
+
    def remove(self, regex):
        for item in self._excluded:
            if item[0] == regex:
@ -280,15 +290,16 @@ class ExcludeList(Markable):
                was_marked = self.is_marked(regex)
                is_compilable, exception, compiled = self.compile_re(newregex)
                # We overwrite the found entry
-                self._excluded[self._excluded.index(item)] =\
-                    [newregex, is_compilable, exception, compiled]
+                self._excluded[self._excluded.index(item)] = [newregex, is_compilable, exception, compiled]
                self._remove_compiled(regex)
                break
        if not found:
            return
-        if is_compilable and was_marked:
-            # Not marked by default when added, add it back
-            self.mark(newregex)
+        if is_compilable:
+            self._add_compiled(newregex)
+            if was_marked:
+                # Not marked by default when added, add it back
+                self.mark(newregex)

    # def change_index(self, regex, new_index):
    # """Internal list must be a list, not dict."""
@ -300,7 +311,7 @@ class ExcludeList(Markable):
            if regex not in default_regexes:
                self.unmark(regex)
        for default_regex in default_regexes:
-            if not self.isExcluded(default_regex):
+            if not self.has_entry(default_regex):
                self.add(default_regex)
            self.mark(default_regex)

@ -326,8 +337,10 @@ class ExcludeList(Markable):
                # "forced" avoids compilation exceptions and adds anyway
                self.add(regex_string, forced=True)
            except AlreadyThereException:
-                logging.error(f"Regex \"{regex_string}\" \
-loaded from XML was already present in the list.")
+                logging.error(
+                    f'Regex "{regex_string}" \
+loaded from XML was already present in the list.'
+                )
                continue
            if exclude_item.get("marked") == "y":
                marked.add(regex_string)
@ -352,6 +365,7 @@ loaded from XML was already present in the list.")
 class ExcludeDict(ExcludeList):
    """Exclusion list holding a set of regular expressions as keys, the compiled
    Pattern, compilation error and compilable boolean as values."""
+
-    # Implemntation around a dictionary instead of a list, which implies
+    # Implementation around a dictionary instead of a list, which implies
    # to keep the index of each string-key as its sub-element and keep it updated
    # whenever insert/remove is done.
@ -399,9 +413,9 @@ class ExcludeDict(ExcludeList):
        if self._use_union:
            return
        try:
-            self._excluded_compiled.add(self._excluded[regex]["compiled"])
+            self._excluded_compiled.add(self._excluded.get(regex).get("compiled"))
        except Exception as e:
-            logging.warning(f"Exception while adding regex {regex} to compiled set: {e}")
+            logging.error(f"Exception while adding regex {regex} to compiled set: {e}")
            return

    def is_compilable(self, regex):
@ -418,14 +432,9 @@ class ExcludeDict(ExcludeList):
        # and other indices should be pushed by one
        for value in self._excluded.values():
            value["index"] += 1
-        self._excluded[regex] = {
-            "index": 0,
-            "compilable": iscompilable,
-            "error": exception,
-            "compiled": compiled
-        }
+        self._excluded[regex] = {"index": 0, "compilable": iscompilable, "error": exception, "compiled": compiled}

-    def isExcluded(self, regex):
+    def has_entry(self, regex):
        if regex in self._excluded.keys():
            return True
        return False
@ -451,14 +460,16 @@ class ExcludeDict(ExcludeList):
        previous = self._excluded.pop(regex)
        iscompilable, error, compiled = self.compile_re(newregex)
        self._excluded[newregex] = {
-            "index": previous["index"],
+            "index": previous.get("index"),
            "compilable": iscompilable,
            "error": error,
-            "compiled": compiled
+            "compiled": compiled,
        }
        self._remove_compiled(regex)
-        if was_marked and iscompilable:
-            self.mark(newregex)
+        if iscompilable:
+            self._add_compiled(newregex)
+            if was_marked:
+                self.mark(newregex)

    def save_to_xml(self, outfile):
        """Create a XML file that can be used by load_from_xml.
@ -492,8 +503,11 @@ def ordered_keys(_dict):


 if ISWINDOWS:
-    def has_sep(x):
-        return '\\' + sep in x
+
+    def has_sep(regexp):
+        return "\\" + sep in regexp
+
 else:
-    def has_sep(x):
-        return sep in x
+
+    def has_sep(regexp):
+        return sep in regexp
--- a/core/export.py
+++ b/core/export.py
@ -131,15 +131,11 @@ def export_to_xhtml(colnames, rows):
            indented = "indented"
        filename = row[1]
        cells = "".join(CELL_TEMPLATE.format(value=value) for value in row[2:])
-        rendered_rows.append(
-            ROW_TEMPLATE.format(indented=indented, filename=filename, cells=cells)
-        )
+        rendered_rows.append(ROW_TEMPLATE.format(indented=indented, filename=filename, cells=cells))
        previous_group_id = row[0]
    rendered_rows = "".join(rendered_rows)
    # The main template can't use format because the css code uses {}
-    content = MAIN_TEMPLATE.replace("$colheaders", colheaders).replace(
-        "$rows", rendered_rows
-    )
+    content = MAIN_TEMPLATE.replace("$colheaders", colheaders).replace("$rows", rendered_rows)
    folder = mkdtemp()
    destpath = op.join(folder, "export.htm")
    fp = open(destpath, "wt", encoding="utf-8")
--- a/core/fs.py
+++ b/core/fs.py
@ -11,11 +11,28 @@
 # resulting needless complexity and memory usage. It's been a while since I wanted to do that fork,
 # and I'm doing it now.

-import hashlib
-import logging
+import os

+from math import floor
+import logging
+import sqlite3
+from sys import platform
+from threading import Lock
+from typing import Any, AnyStr, Union, Callable
+
+from pathlib import Path
 from hscommon.util import nonone, get_file_ext

+hasher: Callable
+try:
+    import xxhash
+
+    hasher = xxhash.xxh128
+except ImportError:
+    import hashlib
+
+    hasher = hashlib.md5
+
 __all__ = [
    "File",
    "Folder",
@ -30,6 +47,17 @@ __all__ = [

 NOT_SET = object()

+# The goal here is to not run out of memory on really big files. However, the chunk
+# size has to be large enough so that the python loop isn't too costly in terms of
+# CPU.
+CHUNK_SIZE = 1024 * 1024  # 1 MiB
+
+# Minimum size below which partial hashing is not used
+MIN_FILE_SIZE = 3 * CHUNK_SIZE  # 3MiB, because we take 3 samples
+
+# Partial hashing offset and size
+PARTIAL_OFFSET_SIZE = (0x4000, 0x4000)
+

 class FSError(Exception):
    cls_message = "An error has occured on '{name}' in '{parent}'"
@ -69,28 +97,131 @@ class OperationError(FSError):
    cls_message = "Operation on '{name}' failed."


-class File:
-    """Represents a file and holds metadata to be used for scanning.
+class FilesDB:
+    schema_version = 1
+    schema_version_description = "Changed from md5 to xxhash if available."
+
+    create_table_query = """CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER,
+        entry_dt DATETIME, digest BLOB, digest_partial BLOB, digest_samples BLOB)"""
+    drop_table_query = "DROP TABLE IF EXISTS files;"
+    select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
+    select_query_ignore_mtime = "SELECT {key} FROM files WHERE path=:path AND size=:size"
+    insert_query = """
+        INSERT INTO files (path, size, mtime_ns, entry_dt, {key})
+        VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
+        ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value;
    """

-    INITIAL_INFO = {
-        "size": 0,
-        "mtime": 0,
-        "md5": "",
-        "md5partial": "",
-    }
+    ignore_mtime = False
+
+    def __init__(self):
+        """Create an unconnected instance; ``connect()`` fills in the connection and the lock."""
+        # Both stay None until connect() is called.
+        self.conn = None
+        self.lock = None
+
+    def connect(self, path: Union[AnyStr, os.PathLike]) -> None:
+        """Open the SQLite database at ``path`` and make sure its schema is current.
+
+        Also creates the lock that the other methods acquire around database access.
+        """
+        # NOTE(review): "gnu0" is presumably the sys.platform value on GNU/Hurd — confirm.
+        if platform.startswith("gnu0"):
+            # isolation_level=None puts the sqlite3 connection in autocommit mode.
+            self.conn = sqlite3.connect(path, check_same_thread=False, isolation_level=None)
+        else:
+            # check_same_thread=False lets threads other than the creator use this connection.
+            self.conn = sqlite3.connect(path, check_same_thread=False)
+        self.lock = Lock()
+        self._check_upgrade()
+
+    def _check_upgrade(self) -> None:
+        """Make sure the cache schema matches ``schema_version``, resetting the ``files`` table if not.
+
+        The highest version recorded in the ``schema_version`` table is compared with the class
+        attribute. On a mismatch (including a missing or empty ``schema_version`` table) the
+        cached rows are dropped and the current version is recorded.
+        """
+        with self.lock, self.conn as conn:
+            has_schema = conn.execute(
+                "SELECT NAME FROM sqlite_master WHERE type='table' AND name='schema_version'"
+            ).fetchall()
+            version = None
+            if has_schema:
+                row = conn.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()
+                # fetchone() returns None when the table exists but holds no rows; treat that
+                # like a missing version instead of failing on ``None[0]``.
+                if row is not None:
+                    version = row[0]
+            else:
+                conn.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)")
+            if version != self.schema_version:
+                # Entries written under another schema are discarded, not migrated.
+                conn.execute(self.drop_table_query)
+                conn.execute(
+                    "INSERT OR REPLACE INTO schema_version VALUES (:version, :description)",
+                    {"version": self.schema_version, "description": self.schema_version_description},
+                )
+            # IF NOT EXISTS makes this a no-op when the table was kept.
+            conn.execute(self.create_table_query)
+
+    def clear(self) -> None:
+        """Discard every cached row by dropping and recreating the ``files`` table."""
+        with self.lock, self.conn as conn:
+            conn.execute(self.drop_table_query)
+            conn.execute(self.create_table_query)
+
+    def get(self, path: Path, key: str) -> Union[bytes, None]:
+        """Return the cached ``key`` column for ``path``, or ``None`` when nothing usable is stored.
+
+        A row is matched on path and size, plus ``mtime_ns`` unless ``ignore_mtime`` is set.
+        Exceptions raised while querying are logged as warnings and reported as ``None``;
+        errors from ``path.stat()`` propagate to the caller.
+        """
+        # NOTE(review): unlike put()/clear(), this read does not take self.lock — confirm intentional.
+        file_stat = path.stat()
+        size, mtime_ns = file_stat.st_size, file_stat.st_mtime_ns
+        try:
+            with self.conn as conn:
+                params = {"path": str(path), "size": size}
+                if self.ignore_mtime:
+                    template = self.select_query_ignore_mtime
+                else:
+                    template = self.select_query
+                    params["mtime_ns"] = mtime_ns
+                cursor = conn.execute(template.format(key=key), params)
+                row = cursor.fetchone()
+                cursor.close()
+
+            if row:
+                return row[0]
+        except Exception as ex:
+            logging.warning(f"Couldn't get {key} for {path} w/{size}, {mtime_ns}: {ex}")
+
+        return None
+
+    def put(self, path: Path, key: str, value: Any) -> None:
+        """Store ``value`` in column ``key`` for ``path``, stamped with the file's current size and mtime.
+
+        Exceptions raised while writing are logged as warnings instead of being raised;
+        errors from ``path.stat()`` propagate to the caller.
+        """
+        file_stat = path.stat()
+        size, mtime_ns = file_stat.st_size, file_stat.st_mtime_ns
+        try:
+            with self.lock, self.conn as conn:
+                statement = self.insert_query.format(key=key)
+                row = {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value}
+                conn.execute(statement, row)
+        except Exception as ex:
+            logging.warning(f"Couldn't put {key} for {path} w/{size}, {mtime_ns}: {ex}")
+
+    def commit(self) -> None:
+        """Commit the connection's pending transaction while holding the lock."""
+        with self.lock:
+            self.conn.commit()
+
+    def close(self) -> None:
+        """Close the database connection while holding the lock."""
+        with self.lock:
+            self.conn.close()
+
+
+filesdb = FilesDB()  # Singleton
+
+
+class File:
+    """Represents a file and holds metadata to be used for scanning."""
+
+    INITIAL_INFO = {"size": 0, "mtime": 0, "digest": b"", "digest_partial": b"", "digest_samples": b""}
    # Slots for File make us save quite a bit of memory. In a memory test I've made with a lot of
    # files, I saved 35% memory usage with "unread" files (no _read_info() call) and gains become
    # even greater when we take into account read attributes (70%!). Yeah, it's worth it.
-    __slots__ = ("path", "is_ref", "words") + tuple(INITIAL_INFO.keys())
+    __slots__ = ("path", "unicode_path", "is_ref", "words") + tuple(INITIAL_INFO.keys())

    def __init__(self, path):
-        self.path = path
        for attrname in self.INITIAL_INFO:
            setattr(self, attrname, NOT_SET)
+        if type(path) is os.DirEntry:
+            self.path = Path(path.path)
+            self.size = nonone(path.stat().st_size, 0)
+            self.mtime = nonone(path.stat().st_mtime, 0)
+        else:
+            self.path = path
+        if self.path:
+            self.unicode_path = str(self.path)

    def __repr__(self):
-        return "<{} {}>".format(self.__class__.__name__, str(self.path))
+        return f"<{self.__class__.__name__} {str(self.path)}>"

    def __getattribute__(self, attrname):
        result = object.__getattribute__(self, attrname)
@ -98,51 +229,83 @@ class File:
            try:
                self._read_info(attrname)
            except Exception as e:
-                logging.warning(
-                    "An error '%s' was raised while decoding '%s'", e, repr(self.path)
-                )
+                logging.warning("An error '%s' was raised while decoding '%s'", e, repr(self.path))
            result = object.__getattribute__(self, attrname)
            if result is NOT_SET:
                result = self.INITIAL_INFO[attrname]
        return result

-    # This offset is where we should start reading the file to get a partial md5
-    # For audio file, it should be where audio data starts
-    def _get_md5partial_offset_and_size(self):
-        return (0x4000, 0x4000)  # 16Kb
+    def _calc_digest(self) -> bytes:
+        """Return the ``hasher`` digest of the whole file.
+
+        The file is read in ``CHUNK_SIZE`` pieces (the module-level constant) so that very
+        large files do not have to be held in memory at once.
+        """
+        file_hash = hasher()
+        with self.path.open("rb") as fp:
+            # iter() with a sentinel keeps reading until the first empty read, i.e. EOF.
+            for filedata in iter(lambda: fp.read(CHUNK_SIZE), b""):
+                file_hash.update(filedata)
+        return file_hash.digest()
+
+    def _calc_digest_partial(self):
+        # type: () -> bytes
+        """Hash only the slice of the file described by ``PARTIAL_OFFSET_SIZE`` (offset, size)."""
+        offset, length = PARTIAL_OFFSET_SIZE
+        with self.path.open("rb") as fp:
+            fp.seek(offset)
+            window = fp.read(length)
+        return hasher(window).digest()
+
+    def _calc_digest_samples(self) -> bytes:
+        """Hash three ``CHUNK_SIZE`` samples of the file: at 25%, at 60%, and its final chunk."""
+        size = self.size
+        with self.path.open("rb") as fp:
+            # The sample a quarter of the way in seeds the hash object.
+            fp.seek(floor(size * 25 / 100), os.SEEK_SET)
+            file_hash = hasher(fp.read(CHUNK_SIZE))
+
+            # Then the sample at 60% from the start, and the last CHUNK_SIZE bytes
+            # addressed relative to the end of the file.
+            for offset, whence in ((floor(size * 60 / 100), os.SEEK_SET), (-CHUNK_SIZE, os.SEEK_END)):
+                fp.seek(offset, whence)
+                file_hash.update(fp.read(CHUNK_SIZE))
+        return file_hash.digest()

    def _read_info(self, field):
+        # print(f"_read_info({field}) for {self}")
        if field in ("size", "mtime"):
            stats = self.path.stat()
            self.size = nonone(stats.st_size, 0)
            self.mtime = nonone(stats.st_mtime, 0)
-        elif field == "md5partial":
-            try:
-                fp = self.path.open("rb")
-                offset, size = self._get_md5partial_offset_and_size()
-                fp.seek(offset)
-                partialdata = fp.read(size)
-                md5 = hashlib.md5(partialdata)
-                self.md5partial = md5.digest()
-                fp.close()
-            except Exception:
-                pass
-        elif field == "md5":
-            try:
-                fp = self.path.open("rb")
-                md5 = hashlib.md5()
-                # The goal here is to not run out of memory on really big files. However, the chunk
-                # size has to be large enough so that the python loop isn't too costly in terms of
-                # CPU.
-                CHUNK_SIZE = 1024 * 1024  # 1 mb
-                filedata = fp.read(CHUNK_SIZE)
-                while filedata:
-                    md5.update(filedata)
-                    filedata = fp.read(CHUNK_SIZE)
-                self.md5 = md5.digest()
-                fp.close()
-            except Exception:
-                pass
+        elif field == "digest_partial":
+            self.digest_partial = filesdb.get(self.path, "digest_partial")
+            if self.digest_partial is None:
+                # If file is smaller than partial requirements just use the full digest
+                if self.size < PARTIAL_OFFSET_SIZE[0] + PARTIAL_OFFSET_SIZE[1]:
+                    self.digest_partial = self.digest
+                else:
+                    self.digest_partial = self._calc_digest_partial()
+                filesdb.put(self.path, "digest_partial", self.digest_partial)
+        elif field == "digest":
+            self.digest = filesdb.get(self.path, "digest")
+            if self.digest is None:
+                self.digest = self._calc_digest()
+                filesdb.put(self.path, "digest", self.digest)
+        elif field == "digest_samples":
+            size = self.size
+            # Might as well hash such small files entirely.
+            if size <= MIN_FILE_SIZE:
+                self.digest_samples = self.digest
+                return
+            self.digest_samples = filesdb.get(self.path, "digest_samples")
+            if self.digest_samples is None:
+                self.digest_samples = self._calc_digest_samples()
+                filesdb.put(self.path, "digest_samples", self.digest_samples)

    def _read_all_info(self, attrnames=None):
        """Cache all possible info.
@ -157,27 +320,33 @@ class File:
    # --- Public
    @classmethod
    def can_handle(cls, path):
-        """Returns whether this file wrapper class can handle ``path``.
-        """
-        return not path.islink() and path.isfile()
+        """Returns whether this file wrapper class can handle ``path``."""
+        return not path.is_symlink() and path.is_file()
+
+    def exists(self) -> bool:
+        """Report whether the wrapped path exists; an ``OSError`` during the check counts as absent."""
+        try:
+            found = self.path.exists()
+        except OSError as ex:
+            logging.warning(f"Checking {self.path} raised: {ex}")
+            found = False
+        return found

    def rename(self, newname):
        if newname == self.name:
            return
-        destpath = self.path.parent()[newname]
+        destpath = self.path.parent.joinpath(newname)
        if destpath.exists():
-            raise AlreadyExistsError(newname, self.path.parent())
+            raise AlreadyExistsError(newname, self.path.parent)
        try:
            self.path.rename(destpath)
-        except EnvironmentError:
+        except OSError:
            raise OperationError(self)
        if not destpath.exists():
            raise OperationError(self)
        self.path = destpath

    def get_display_info(self, group, delta):
-        """Returns a display-ready dict of dupe's data.
-        """
+        """Returns a display-ready dict of dupe's data."""
        raise NotImplementedError()

    # --- Properties
@ -191,19 +360,20 @@ class File:

    @property
    def folder_path(self):
-        return self.path.parent()
+        return self.path.parent


 class Folder(File):
    """A wrapper around a folder path.

-    It has the size/md5 info of a File, but it's value are the sum of its subitems.
+    It has the size/digest info of a File, but its value is the sum of its subitems.
    """

    __slots__ = File.__slots__ + ("_subfolders",)

    def __init__(self, path):
        File.__init__(self, path)
+        self.size = NOT_SET
        self._subfolders = None

    def _all_items(self):
@ -212,37 +382,37 @@ class Folder(File):
        return folders + files

    def _read_info(self, field):
+        # print(f"_read_info({field}) for Folder {self}")
        if field in {"size", "mtime"}:
            size = sum((f.size for f in self._all_items()), 0)
            self.size = size
            stats = self.path.stat()
            self.mtime = nonone(stats.st_mtime, 0)
-        elif field in {"md5", "md5partial"}:
+        elif field in {"digest", "digest_partial", "digest_samples"}:
            # What's sensitive here is that we must make sure that subfiles'
-            # md5 are always added up in the same order, but we also want a
-            # different md5 if a file gets moved in a different subdirectory.
-            def get_dir_md5_concat():
+            # digest are always added up in the same order, but we also want a
+            # different digest if a file gets moved in a different subdirectory.
+
+            def get_dir_digest_concat():
                items = self._all_items()
                items.sort(key=lambda f: f.path)
-                md5s = [getattr(f, field) for f in items]
-                return b"".join(md5s)
+                digests = [getattr(f, field) for f in items]
+                return b"".join(digests)

-            md5 = hashlib.md5(get_dir_md5_concat())
-            digest = md5.digest()
+            digest = hasher(get_dir_digest_concat()).digest()
            setattr(self, field, digest)

    @property
    def subfolders(self):
        if self._subfolders is None:
-            subfolders = [
-                p for p in self.path.listdir() if not p.islink() and p.isdir()
-            ]
+            with os.scandir(self.path) as iter:
+                subfolders = [p for p in iter if not p.is_symlink() and p.is_dir()]
            self._subfolders = [self.__class__(p) for p in subfolders]
        return self._subfolders

    @classmethod
    def can_handle(cls, path):
-        return not path.islink() and path.isdir()
+        return not path.is_symlink() and path.is_dir()


 def get_file(path, fileclasses=[File]):
@ -267,10 +437,11 @@ def get_files(path, fileclasses=[File]):
    assert all(issubclass(fileclass, File) for fileclass in fileclasses)
    try:
        result = []
-        for path in path.listdir():
-            file = get_file(path, fileclasses=fileclasses)
-            if file is not None:
-                result.append(file)
+        with os.scandir(path) as iter:
+            for item in iter:
+                file = get_file(item, fileclasses=fileclasses)
+                if file is not None:
+                    result.append(file)
        return result
-    except EnvironmentError:
+    except OSError:
        raise InvalidPath(path)
--- a/core/gui/base.py
+++ b/core/gui/base.py
@ -15,16 +15,21 @@ class DupeGuruGUIObject(Listener):
        self.app = app

    def directories_changed(self):
+        # Implemented in child classes
        pass

    def dupes_selected(self):
+        # Implemented in child classes
        pass

    def marking_changed(self):
+        # Implemented in child classes
        pass

    def results_changed(self):
+        # Implemented in child classes
        pass

    def results_changed_but_keep_selection(self):
+        # Implemented in child classes
        pass
--- a/core/gui/deletion_options.py
+++ b/core/gui/deletion_options.py
@ -29,8 +29,7 @@ class DeletionOptionsView:
    """

    def update_msg(self, msg: str):
-        """Update the dialog's prompt with ``str``.
-        """
+        """Update the dialog's prompt with ``str``."""

    def show(self):
        """Show the dialog in a modal fashion.
@ -39,8 +38,7 @@ class DeletionOptionsView:
        """

    def set_hardlink_option_enabled(self, is_enabled: bool):
-        """Enable or disable the widget controlling :attr:`DeletionOptions.use_hardlinks`.
-        """
+        """Enable or disable the widget controlling :attr:`DeletionOptions.use_hardlinks`."""


 class DeletionOptions(GUIObject):
@ -75,8 +73,7 @@ class DeletionOptions(GUIObject):
        return self.view.show()

    def supports_links(self):
-        """Returns whether our platform supports symlinks.
-        """
+        """Returns whether our platform supports symlinks."""
        # When on a platform that doesn't implement it, calling os.symlink() (with the wrong number
        # of arguments) raises NotImplementedError, which allows us to gracefully check for the
        # feature.
--- a/core/gui/details_panel.py
+++ b/core/gui/details_panel.py
@ -7,7 +7,7 @@
 # http://www.gnu.org/licenses/gpl-3.0.html

 from hscommon.gui.base import GUIObject
-from .base import DupeGuruGUIObject
+from core.gui.base import DupeGuruGUIObject


 class DetailsPanel(GUIObject, DupeGuruGUIObject):
@ -32,9 +32,7 @@ class DetailsPanel(GUIObject, DupeGuruGUIObject):
        # we don't want the two sides of the table to display the stats for the same file
        ref = group.ref if group is not None and group.ref is not dupe else None
        data2 = self.app.get_display_info(ref, group, False)
-        columns = self.app.result_table.COLUMNS[
-            1:
-        ]  # first column is the 'marked' column
+        columns = self.app.result_table.COLUMNS[1:]  # first column is the 'marked' column
        self._table = [(c.display, data1[c.name], data2[c.name]) for c in columns]

    # --- Public
@ -46,5 +44,4 @@ class DetailsPanel(GUIObject, DupeGuruGUIObject):

    # --- Event Handlers
    def dupes_selected(self):
-        self._refresh()
-        self.view.refresh()
+        self._view_updated()
--- a/core/gui/directory_tree.py
+++ b/core/gui/directory_tree.py
@ -8,10 +8,10 @@

 from hscommon.gui.tree import Tree, Node

-from ..directories import DirectoryState
-from .base import DupeGuruGUIObject
+from core.directories import DirectoryState
+from core.gui.base import DupeGuruGUIObject

-STATE_ORDER = [DirectoryState.Normal, DirectoryState.Reference, DirectoryState.Excluded]
+STATE_ORDER = [DirectoryState.NORMAL, DirectoryState.REFERENCE, DirectoryState.EXCLUDED]


 # Lazily loads children
@ -36,9 +36,7 @@ class DirectoryNode(Node):
        self._loaded = True

    def update_all_states(self):
-        self._state = STATE_ORDER.index(
-            self._tree.app.directories.get_state(self._directory_path)
-        )
+        self._state = STATE_ORDER.index(self._tree.app.directories.get_state(self._directory_path))
        for node in self:
            node.update_all_states()

@ -88,9 +86,9 @@ class DirectoryTree(Tree, DupeGuruGUIObject):
        else:
            # All selected nodes or on second-or-more level, exclude them.
            nodes = self.selected_nodes
-            newstate = DirectoryState.Excluded
-            if all(node.state == DirectoryState.Excluded for node in nodes):
-                newstate = DirectoryState.Normal
+            newstate = DirectoryState.EXCLUDED
+            if all(node.state == DirectoryState.EXCLUDED for node in nodes):
+                newstate = DirectoryState.NORMAL
            for node in nodes:
                node.state = newstate

@ -105,5 +103,4 @@ class DirectoryTree(Tree, DupeGuruGUIObject):

    # --- Event Handlers
    def directories_changed(self):
-        self._refresh()
-        self.view.refresh()
+        self._view_updated()
--- a/core/gui/exclude_list_dialog.py
+++ b/core/gui/exclude_list_dialog.py
@ -5,8 +5,9 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-# from hscommon.trans import tr
-from .exclude_list_table import ExcludeListTable
+from core.gui.exclude_list_table import ExcludeListTable
+from core.exclude import has_sep
+from os import sep
 import logging


@ -30,9 +31,10 @@ class ExcludeListDialogCore:
        self.refresh()

    def rename_selected(self, newregex):
-        """Renames the selected regex to ``newregex``.
-        If there's more than one selected row, the first one is used.
+        """Rename the selected regex to ``newregex``.
+        If there is more than one selected row, the first one is used.
        :param str newregex: The regex to rename the row's regex to.
+        :return bool: true if success, false if error.
        """
        try:
            r = self.exclude_list_table.selected_rows[0]
@ -44,25 +46,42 @@ class ExcludeListDialogCore:
        return False

    def add(self, regex):
-        try:
-            self.exclude_list.add(regex)
-        except Exception as e:
-            raise(e)
+        self.exclude_list.add(regex)
        self.exclude_list.mark(regex)
        self.exclude_list_table.add(regex)

    def test_string(self, test_string):
-        """Sets property on row to highlight if its regex matches test_string supplied."""
+        """Set the highlight property on each row when its regex matches the
+        test_string supplied. Return True if any row matched."""
        matched = False
        for row in self.exclude_list_table.rows:
            compiled_regex = self.exclude_list.get_compiled(row.regex)
-            if compiled_regex and compiled_regex.match(test_string):
-                matched = True
+
+            if self.is_match(test_string, compiled_regex):
                row.highlight = True
+                matched = True
            else:
                row.highlight = False
        return matched

+    def is_match(self, test_string, compiled_regex):
+        """Return True when ``compiled_regex`` fully matches ``test_string``.
+
+        A pattern without a path separator is tried against the filename portion only
+        (the text after the last ``sep``) whenever ``test_string`` contains a separator;
+        otherwise the whole string is tested. A falsy ``compiled_regex`` never matches.
+        """
+        # This method is like an inverted version of ExcludeList.is_excluded()
+        if not compiled_regex:
+            return False
+
+        candidate = test_string
+        if not has_sep(compiled_regex.pattern) and sep in test_string:
+            # Test only the filename portion of the path
+            candidate = test_string.rsplit(sep, 1)[1]
+        return compiled_regex.fullmatch(candidate) is not None
+
    def reset_rows_highlight(self):
        for row in self.exclude_list_table.rows:
            row.highlight = False
--- a/core/gui/exclude_list_table.py
+++ b/core/gui/exclude_list_table.py
@ -2,23 +2,21 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-from .base import DupeGuruGUIObject
+from core.gui.base import DupeGuruGUIObject
 from hscommon.gui.table import GUITable, Row
 from hscommon.gui.column import Column, Columns
 from hscommon.trans import trget
+
 tr = trget("ui")


 class ExcludeListTable(GUITable, DupeGuruGUIObject):
-    COLUMNS = [
-        Column("marked", ""),
-        Column("regex", tr("Regular Expressions"))
-    ]
+    COLUMNS = [Column("marked", ""), Column("regex", tr("Regular Expressions"))]

    def __init__(self, exclude_list_dialog, app):
        GUITable.__init__(self)
        DupeGuruGUIObject.__init__(self, app)
-        self.columns = Columns(self)
+        self._columns = Columns(self)
        self.dialog = exclude_list_dialog

    def rename_selected(self, newname):
@ -36,7 +34,7 @@ class ExcludeListTable(GUITable, DupeGuruGUIObject):
        return ExcludeListRow(self, self.dialog.exclude_list.is_marked(regex), regex), 0

    def _do_delete(self):
-        self.dalog.exclude_list.remove(self.selected_row.regex)
+        self.dialog.exclude_list.remove(self.selected_row.regex)

    # --- Override
    def add(self, regex):
--- a/core/gui/ignore_list_dialog.py
+++ b/core/gui/ignore_list_dialog.py
@ -6,7 +6,7 @@
 # http://www.gnu.org/licenses/gpl-3.0.html

 from hscommon.trans import tr
-from .ignore_list_table import IgnoreListTable
+from core.gui.ignore_list_table import IgnoreListTable


 class IgnoreListDialog:
@ -22,11 +22,9 @@ class IgnoreListDialog:
    def clear(self):
        if not self.ignore_list:
            return
-        msg = tr(
-            "Do you really want to remove all %d items from the ignore list?"
-        ) % len(self.ignore_list)
+        msg = tr("Do you really want to remove all %d items from the ignore list?") % len(self.ignore_list)
        if self.app.view.ask_yes_no(msg):
-            self.ignore_list.Clear()
+            self.ignore_list.clear()
            self.refresh()

    def refresh(self):
--- a/core/gui/ignore_list_table.py
+++ b/core/gui/ignore_list_table.py
@ -22,7 +22,7 @@ class IgnoreListTable(GUITable):

    def __init__(self, ignore_list_dialog):
        GUITable.__init__(self)
-        self.columns = Columns(self)
+        self._columns = Columns(self)
        self.view = None
        self.dialog = ignore_list_dialog

--- a/core/gui/problem_dialog.py
+++ b/core/gui/problem_dialog.py
@ -8,7 +8,7 @@

 from hscommon import desktop

-from .problem_table import ProblemTable
+from core.gui.problem_table import ProblemTable


 class ProblemDialog:
--- a/core/gui/problem_table.py
+++ b/core/gui/problem_table.py
@ -21,7 +21,7 @@ class ProblemTable(GUITable):

    def __init__(self, problem_dialog):
        GUITable.__init__(self)
-        self.columns = Columns(self)
+        self._columns = Columns(self)
        self.dialog = problem_dialog

    # --- Override
--- a/core/gui/result_table.py
+++ b/core/gui/result_table.py
@ -11,7 +11,7 @@ from operator import attrgetter
 from hscommon.gui.table import GUITable, Row
 from hscommon.gui.column import Columns

-from .base import DupeGuruGUIObject
+from core.gui.base import DupeGuruGUIObject


 class DupeRow(Row):
@ -41,11 +41,11 @@ class DupeRow(Row):
            # table.DELTA_COLUMNS are always "delta"
            self._delta_columns = self.table.DELTA_COLUMNS.copy()
            dupe_info = self.data
+            if self._group.ref is None:
+                return False
            ref_info = self._group.ref.get_display_info(group=self._group, delta=False)
            for key, value in dupe_info.items():
-                if (key not in self._delta_columns) and (
-                    ref_info[key].lower() != value.lower()
-                ):
+                if (key not in self._delta_columns) and (ref_info[key].lower() != value.lower()):
                    self._delta_columns.add(key)
        return column_name in self._delta_columns

@ -82,7 +82,7 @@ class ResultTable(GUITable, DupeGuruGUIObject):
    def __init__(self, app):
        GUITable.__init__(self)
        DupeGuruGUIObject.__init__(self, app)
-        self.columns = Columns(self, prefaccess=app, savename="ResultTable")
+        self._columns = Columns(self, prefaccess=app, savename="ResultTable")
        self._power_marker = False
        self._delta_values = False
        self._sort_descriptors = ("name", True)
@ -190,4 +190,4 @@ class ResultTable(GUITable, DupeGuruGUIObject):
        self.view.refresh()

    def save_session(self):
-        self.columns.save_columns()
+        self._columns.save_columns()
--- a/core/gui/stats_label.py
+++ b/core/gui/stats_label.py
@ -6,7 +6,7 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-from .base import DupeGuruGUIObject
+from core.gui.base import DupeGuruGUIObject


 class StatsLabel(DupeGuruGUIObject):
--- a/core/ignore.py
+++ b/core/ignore.py
@ -20,8 +20,7 @@ class IgnoreList:

    # ---Override
    def __init__(self):
-        self._ignored = {}
-        self._count = 0
+        self.clear()

    def __iter__(self):
        for first, seconds in self._ignored.items():
@ -32,7 +31,7 @@ class IgnoreList:
        return self._count

    # ---Public
-    def AreIgnored(self, first, second):
+    def are_ignored(self, first, second):
        def do_check(first, second):
            try:
                matches = self._ignored[first]
@ -42,23 +41,23 @@ class IgnoreList:

        return do_check(first, second) or do_check(second, first)

-    def Clear(self):
+    def clear(self):
        self._ignored = {}
        self._count = 0

-    def Filter(self, func):
+    def filter(self, func):
        """Applies a filter on all ignored items, and remove all matches where func(first,second)
        doesn't return True.
        """
        filtered = IgnoreList()
        for first, second in self:
            if func(first, second):
-                filtered.Ignore(first, second)
+                filtered.ignore(first, second)
        self._ignored = filtered._ignored
        self._count = filtered._count

-    def Ignore(self, first, second):
-        if self.AreIgnored(first, second):
+    def ignore(self, first, second):
+        if self.are_ignored(first, second):
            return
        try:
            matches = self._ignored[first]
@ -88,9 +87,8 @@ class IgnoreList:
            except KeyError:
                return False

-        if not inner(first, second):
-            if not inner(second, first):
-                raise ValueError()
+        if not inner(first, second) and not inner(second, first):
+            raise ValueError()

    def load_from_xml(self, infile):
        """Loads the ignore list from a XML created with save_to_xml.
@ -110,7 +108,7 @@ class IgnoreList:
            for sfn in subfile_elems:
                subfile_path = sfn.get("path")
                if subfile_path:
-                    self.Ignore(file_path, subfile_path)
+                    self.ignore(file_path, subfile_path)

    def save_to_xml(self, outfile):
        """Create a XML file that can be used by load_from_xml.
--- a/core/markable.py
+++ b/core/markable.py
@ -17,9 +17,11 @@ class Markable:
    # in self.__marked, and is not affected by __inverted. Thus, self.mark while __inverted
    # is True will launch _DidUnmark.
    def _did_mark(self, o):
+        # Implemented in child classes
        pass

    def _did_unmark(self, o):
+        # Implemented in child classes
        pass

    def _get_markable_count(self):
--- a/core/me/init.py
+++ b/core/me/init.py
@ -1 +1 @@
-from . import fs, prioritize, result_table, scanner  # noqa
+from core.me import fs, prioritize, result_table, scanner  # noqa
--- a/core/me/fs.py
+++ b/core/me/fs.py
@ -6,7 +6,7 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-from hsaudiotag import auto
+import mutagen
 from hscommon.util import get_file_ext, format_size, format_time

 from core.util import format_timestamp, format_perc, format_words, format_dupe_count
@ -26,6 +26,9 @@ TAG_FIELDS = {
    "comment",
 }

+# This is a temporary workaround for migration from hsaudiotag for the can_handle method
+SUPPORTED_EXTS = {"mp3", "wma", "m4a", "m4p", "ogg", "flac", "aif", "aiff", "aifc"}
+

 class MusicFile(fs.File):
    INITIAL_INFO = fs.File.INITIAL_INFO.copy()
@ -50,7 +53,7 @@ class MusicFile(fs.File):
    def can_handle(cls, path):
        if not fs.File.can_handle(path):
            return False
-        return get_file_ext(path.name) in auto.EXT2CLASS
+        return get_file_ext(path.name) in SUPPORTED_EXTS

    def get_display_info(self, group, delta):
        size = self.size
@ -94,22 +97,19 @@ class MusicFile(fs.File):
            "dupe_count": format_dupe_count(dupe_count),
        }

-    def _get_md5partial_offset_and_size(self):
-        f = auto.File(str(self.path))
-        return (f.audio_offset, f.audio_size)
-
    def _read_info(self, field):
+        """Read ``field`` through the base class, then fill in the audio attributes with mutagen.
+
+        The audio attributes are only read when ``field`` is one of ``TAG_FIELDS``. Files that
+        mutagen cannot identify, or that carry no tags, get ``0`` / ``""`` instead of raising.
+        """
        fs.File._read_info(self, field)
        if field in TAG_FIELDS:
-            f = auto.File(str(self.path))
-            self.audiosize = f.audio_size
-            self.bitrate = f.bitrate
-            self.duration = f.duration
-            self.samplerate = f.sample_rate
-            self.artist = f.artist
-            self.album = f.album
-            self.title = f.title
-            self.genre = f.genre
-            self.comment = f.comment
-            self.year = f.year
-            self.track = f.track
+            # The various conversions here are to make this look like the previous implementation
+            file = mutagen.File(str(self.path), easy=True)
+            self.audiosize = self.path.stat().st_size
+            # mutagen.File() returns None when the file type cannot be determined, and a
+            # recognized file may have no tags at all; getattr() covers both cases.
+            info = getattr(file, "info", None)
+            tags = getattr(file, "tags", None) or {}
+            # mutagen reports bits per second; dividing by 1000 keeps the previous unit.
+            self.bitrate = info.bitrate / 1000 if info is not None else 0
+            self.duration = info.length if info is not None else 0
+            self.samplerate = info.sample_rate if info is not None else 0
+            self.artist = ", ".join(tags.get("artist") or [])
+            self.album = ", ".join(tags.get("album") or [])
+            self.title = ", ".join(tags.get("title") or [])
+            self.genre = ", ".join(tags.get("genre") or [])
+            self.comment = ", ".join(tags.get("comment") or [""])
+            self.year = ", ".join(tags.get("date") or [])
+            self.track = (tags.get("tracknumber") or [""])[0]
--- a/core/me/scanner.py
+++ b/core/me/scanner.py
@ -17,9 +17,9 @@ class ScannerME(ScannerBase):
    @staticmethod
    def get_scan_options():
        return [
-            ScanOption(ScanType.Filename, tr("Filename")),
-            ScanOption(ScanType.Fields, tr("Filename - Fields")),
-            ScanOption(ScanType.FieldsNoOrder, tr("Filename - Fields (No Order)")),
-            ScanOption(ScanType.Tag, tr("Tags")),
-            ScanOption(ScanType.Contents, tr("Contents")),
+            ScanOption(ScanType.FILENAME, tr("Filename")),
+            ScanOption(ScanType.FIELDS, tr("Filename - Fields")),
+            ScanOption(ScanType.FIELDSNOORDER, tr("Filename - Fields (No Order)")),
+            ScanOption(ScanType.TAG, tr("Tags")),
+            ScanOption(ScanType.CONTENTS, tr("Contents")),
        ]
--- a/core/pe/init.py
+++ b/core/pe/init.py
@ -1,8 +1,7 @@
-from . import (  # noqa
+from core.pe import (  # noqa
    block,
    cache,
    exif,
-    iphoto_plist,
    matchblock,
    matchexif,
    photo,
--- a/core/pe/block.py
+++ b/core/pe/block.py
@ -6,7 +6,7 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-from ._block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2  # NOQA
+from core.pe._block import NoBlocksError, DifferentBlockCountError, avgdiff, getblocks2  # NOQA

 # Converted to C
 # def getblock(image):
--- a/core/pe/block.pyi
+++ b/core/pe/block.pyi
@ -0,0 +1,13 @@
+# Type stubs for core/pe/block.py, which re-exports these names from core.pe._block.
+from typing import Tuple, List, Union, Sequence
+
+# A single block value: a 3-tuple of ints.
+_block = Tuple[int, int, int]
+
+class NoBlocksError(Exception): ...  # noqa: E302, E701
+class DifferentBlockCountError(Exception): ...  # noqa: E701
+
+# NOTE(review): block.py only imports avgdiff and getblocks2 from _block; confirm that getblock
+# and diff are still exposed at runtime before relying on these two signatures.
+def getblock(image: object) -> Union[_block, None]: ...  # noqa: E302
+def getblocks2(image: object, block_count_per_side: int) -> Union[List[_block], None]: ...
+def diff(first: _block, second: _block) -> int: ...
+def avgdiff(  # noqa: E302
+    first: Sequence[_block], second: Sequence[_block], limit: int = 768, min_iterations: int = 1
+) -> Union[int, None]: ...
--- a/core/pe/cache.py
+++ b/core/pe/cache.py
@ -4,24 +4,13 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-from ._cache import string_to_colors  # noqa
+from core.pe._cache import bytes_to_colors  # noqa


-def colors_to_string(colors):
-    """Transform the 3 sized tuples 'colors' into a hex string.
+def colors_to_bytes(colors):
+    """Transform the 3 sized tuples 'colors' into a bytes string.

-    [(0,100,255)] --> 0064ff
-    [(1,2,3),(4,5,6)] --> 010203040506
+    [(0,100,255)] --> b'\x00d\xff'
+    [(1,2,3),(4,5,6)] --> b'\x01\x02\x03\x04\x05\x06'
+
+    Each tuple is passed to ``bytes()``, so every component must be an int in ``range(256)``;
+    the per-tuple results are concatenated in order. An empty ``colors`` gives ``b""``.
    """
-    return "".join("%02x%02x%02x" % (r, g, b) for r, g, b in colors)
-
-
-# This function is an important bottleneck of dupeGuru PE. It has been converted to C.
-# def string_to_colors(s):
-#     """Transform the string 's' in a list of 3 sized tuples.
-#     """
-#     result = []
-#     for i in xrange(0, len(s), 6):
-#         number = int(s[i:i+6], 16)
-#         result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
-#     return result
+    return b"".join(map(bytes, colors))
--- a/core/pe/cache.pyi
+++ b/core/pe/cache.pyi
@ -0,0 +1,6 @@
+# Type stubs for core/pe/cache.py.
+from typing import Union, Tuple, List
+
+# One color: a 3-tuple of ints.
+_block = Tuple[int, int, int]
+
+# colors_to_bytes is implemented in cache.py; bytes_to_colors is imported there from
+# core.pe._cache.
+def colors_to_bytes(colors: List[_block]) -> bytes: ...  # noqa: E302
+def bytes_to_colors(s: bytes) -> Union[List[_block], None]: ...
--- a/core/pe/cache_shelve.py
+++ b/core/pe/cache_shelve.py
@ -1,144 +0,0 @@
-# Copyright 2016 Virgil Dupras
-#
-# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
-# which should be included with this package. The terms are also available at
-# http://www.gnu.org/licenses/gpl-3.0.html
-
-import os
-import os.path as op
-import shelve
-import tempfile
-from collections import namedtuple
-
-from .cache import string_to_colors, colors_to_string
-
-
-def wrap_path(path):
-    return "path:{}".format(path)
-
-
-def unwrap_path(key):
-    return key[5:]
-
-
-def wrap_id(path):
-    return "id:{}".format(path)
-
-
-def unwrap_id(key):
-    return int(key[3:])
-
-
-CacheRow = namedtuple("CacheRow", "id path blocks mtime")
-
-
-class ShelveCache:
-    """A class to cache picture blocks in a shelve backend.
-    """
-
-    def __init__(self, db=None, readonly=False):
-        self.istmp = db is None
-        if self.istmp:
-            self.dtmp = tempfile.mkdtemp()
-            self.ftmp = db = op.join(self.dtmp, "tmpdb")
-        flag = "r" if readonly else "c"
-        self.shelve = shelve.open(db, flag)
-        self.maxid = self._compute_maxid()
-
-    def __contains__(self, key):
-        return wrap_path(key) in self.shelve
-
-    def __delitem__(self, key):
-        row = self.shelve[wrap_path(key)]
-        del self.shelve[wrap_path(key)]
-        del self.shelve[wrap_id(row.id)]
-
-    def __getitem__(self, key):
-        if isinstance(key, int):
-            skey = self.shelve[wrap_id(key)]
-        else:
-            skey = wrap_path(key)
-        return string_to_colors(self.shelve[skey].blocks)
-
-    def __iter__(self):
-        return (unwrap_path(k) for k in self.shelve if k.startswith("path:"))
-
-    def __len__(self):
-        return sum(1 for k in self.shelve if k.startswith("path:"))
-
-    def __setitem__(self, path_str, blocks):
-        blocks = colors_to_string(blocks)
-        if op.exists(path_str):
-            mtime = int(os.stat(path_str).st_mtime)
-        else:
-            mtime = 0
-        if path_str in self:
-            rowid = self.shelve[wrap_path(path_str)].id
-        else:
-            rowid = self._get_new_id()
-        row = CacheRow(rowid, path_str, blocks, mtime)
-        self.shelve[wrap_path(path_str)] = row
-        self.shelve[wrap_id(rowid)] = wrap_path(path_str)
-
-    def _compute_maxid(self):
-        return max(
-            (unwrap_id(k) for k in self.shelve if k.startswith("id:")), default=1
-        )
-
-    def _get_new_id(self):
-        self.maxid += 1
-        return self.maxid
-
-    def clear(self):
-        self.shelve.clear()
-
-    def close(self):
-        if self.shelve is not None:
-            self.shelve.close()
-            if self.istmp:
-                os.remove(self.ftmp)
-                os.rmdir(self.dtmp)
-        self.shelve = None
-
-    def filter(self, func):
-        to_delete = [key for key in self if not func(key)]
-        for key in to_delete:
-            del self[key]
-
-    def get_id(self, path):
-        if path in self:
-            return self.shelve[wrap_path(path)].id
-        else:
-            raise ValueError(path)
-
-    def get_multiple(self, rowids):
-        for rowid in rowids:
-            try:
-                skey = self.shelve[wrap_id(rowid)]
-            except KeyError:
-                continue
-            yield (rowid, string_to_colors(self.shelve[skey].blocks))
-
-    def purge_outdated(self):
-        """Go through the cache and purge outdated records.
-
-        A record is outdated if the picture doesn't exist or if its mtime is greater than the one in
-        the db.
-        """
-        todelete = []
-        for path in self:
-            row = self.shelve[wrap_path(path)]
-            if row.mtime and op.exists(path):
-                picture_mtime = os.stat(path).st_mtime
-                if int(picture_mtime) <= row.mtime:
-                    # not outdated
-                    continue
-            todelete.append(path)
-        for path in todelete:
-            try:
-                del self[path]
-            except KeyError:
-                # I have no idea why a KeyError sometimes happen, but it does, as we can see in
-                # #402 and #439. I don't think it hurts to silently ignore the error, so that's
-                # what we do
-                pass
--- a/core/pe/cache_sqlite.py
+++ b/core/pe/cache_sqlite.py
@ -9,12 +9,23 @@ import os.path as op
 import logging
 import sqlite3 as sqlite

-from .cache import string_to_colors, colors_to_string
+from core.pe.cache import bytes_to_colors, colors_to_bytes


 class SqliteCache:
-    """A class to cache picture blocks in a sqlite backend.
-    """
+    """A class to cache picture blocks in a sqlite backend."""
+
+    schema_version = 2
+    schema_version_description = "Added blocks for all 8 orientations."
+
+    create_table_query = (
+        "CREATE TABLE IF NOT EXISTS "
+        "pictures(path TEXT, mtime_ns INTEGER, blocks BLOB, blocks2 BLOB, blocks3 BLOB, "
+        "blocks4 BLOB, blocks5 BLOB, blocks6 BLOB, blocks7 BLOB, blocks8 BLOB)"
+    )
+    create_index_query = "CREATE INDEX IF NOT EXISTS idx_path on pictures (path)"
+    drop_table_query = "DROP TABLE IF EXISTS pictures"
+    drop_index_query = "DROP INDEX IF EXISTS idx_path"

    def __init__(self, db=":memory:", readonly=False):
        # readonly is not used in the sqlite version of the cache
@ -36,12 +47,20 @@ class SqliteCache:
    # Optimized
    def __getitem__(self, key):
+        """Return the cached blocks of one picture as a list of 8 decoded block lists.
+
+        An ``int`` key is looked up by ``rowid``; any other key is looked up by ``path``.
+        The list follows the column order ``blocks, blocks2, ..., blocks8``, each column
+        decoded with ``bytes_to_colors``.
+
+        Raises ``KeyError`` when no row matches ``key``.
+        """
        if isinstance(key, int):
-            sql = "select blocks from pictures where rowid = ?"
+            sql = (
+                "select blocks, blocks2, blocks3, blocks4, blocks5, blocks6, blocks7, blocks8 "
+                "from pictures "
+                "where rowid = ?"
+            )
        else:
-            sql = "select blocks from pictures where path = ?"
-        result = self.con.execute(sql, [key]).fetchone()
-        if result:
-            result = string_to_colors(result[0])
+            sql = (
+                "select blocks, blocks2, blocks3, blocks4, blocks5, blocks6, blocks7, blocks8 "
+                "from pictures "
+                "where path = ?"
+            )
+        blocks = self.con.execute(sql, [key]).fetchone()
+        if blocks:
+            result = [bytes_to_colors(block) for block in blocks]
            return result
        else:
            raise KeyError(key)
@ -57,49 +76,60 @@ class SqliteCache:
        return result[0][0]

    def __setitem__(self, path_str, blocks):
-        blocks = colors_to_string(blocks)
+        """Insert or update the cached blocks for ``path_str``.
+
+        ``blocks`` must hold one color list per ``blocks``..``blocks8`` column (8 entries); each
+        is encoded with ``colors_to_bytes``. Database errors are logged as warnings and are not
+        re-raised, so a failed write leaves the cache unchanged.
+        """
+        blocks = [colors_to_bytes(block) for block in blocks]
        if op.exists(path_str):
+            # Stored as whole seconds (st_mtime), even though the column is named mtime_ns.
            mtime = int(os.stat(path_str).st_mtime)
        else:
            mtime = 0
        if path_str in self:
-            sql = "update pictures set blocks = ?, mtime = ? where path = ?"
+            sql = (
+                "update pictures set blocks = ?, blocks2 = ?, blocks3 = ?, blocks4 = ?, blocks5 = ?, blocks6 = ?, "
+                "blocks7 = ?, blocks8 = ?, mtime_ns = ? "
+                "where path = ?"
+            )
        else:
-            sql = "insert into pictures(blocks,mtime,path) values(?,?,?)"
+            sql = (
+                "insert into pictures(blocks,blocks2,blocks3,blocks4,blocks5,blocks6,blocks7,blocks8,mtime_ns,path) "
+                "values(?,?,?,?,?,?,?,?,?,?)"
+            )
        try:
-            self.con.execute(sql, [blocks, mtime, path_str])
+            self.con.execute(sql, blocks + [mtime, path_str])
        except sqlite.OperationalError:
            logging.warning("Picture cache could not set value for key %r", path_str)
        except sqlite.DatabaseError as e:
-            logging.warning(
-                "DatabaseError while setting value for key %r: %s", path_str, str(e)
-            )
+            logging.warning("DatabaseError while setting value for key %r: %s", path_str, str(e))

    def _create_con(self, second_try=False):
-        def create_tables():
-            logging.debug("Creating picture cache tables.")
-            self.con.execute("drop table if exists pictures")
-            self.con.execute("drop index if exists idx_path")
-            self.con.execute(
-                "create table pictures(path TEXT, mtime INTEGER, blocks TEXT)"
-            )
-            self.con.execute("create index idx_path on pictures (path)")
-
-        self.con = sqlite.connect(self.dbname, isolation_level=None)
+        """Open the database at ``self.dbname`` and make sure its schema is current.
+
+        On a ``sqlite.DatabaseError`` the database file is deleted and the connection is
+        attempted once more; a failure on that second attempt is re-raised to the caller.
+        """
+        # Reset first so the error path can tell whether connect() itself failed.
+        self.con = None
        try:
-            self.con.execute("select path, mtime, blocks from pictures where 1=2")
-        except sqlite.OperationalError:  # new db
-            create_tables()
+            self.con = sqlite.connect(self.dbname, isolation_level=None)
+            self._check_upgrade()
        except sqlite.DatabaseError as e:  # corrupted db
            if second_try:
                raise  # Something really strange is happening
-            logging.warning(
-                "Could not create picture cache because of an error: %s", str(e)
-            )
+            logging.warning("Could not create picture cache because of an error: %s", str(e))
-            self.con.close()
-            os.remove(self.dbname)
+            if self.con is not None:
+                self.con.close()
+            # Nothing to delete for an in-memory database or when the file was never created.
+            if self.dbname != ":memory:" and op.exists(self.dbname):
+                os.remove(self.dbname)
            self._create_con(second_try=True)

+    def _check_upgrade(self) -> None:
+        """Create the cache tables, dropping cached pictures when the schema version changed.
+
+        The highest version recorded in the ``schema_version`` table is compared with
+        ``self.schema_version``. When they differ (or nothing is recorded yet) the ``pictures``
+        table is dropped and the current version is recorded. The ``pictures`` table and its
+        path index are then (re)created if missing.
+        """
+        with self.con as conn:
+            has_schema = conn.execute(
+                "SELECT NAME FROM sqlite_master WHERE type='table' AND name='schema_version'"
+            ).fetchall()
+            version = None
+            if has_schema:
+                # The table can exist without any row (e.g. a previous run stopped before the
+                # insert below); treat that as "unknown version" instead of failing on None[0].
+                row = conn.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()
+                if row is not None:
+                    version = row[0]
+            else:
+                conn.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)")
+            if version != self.schema_version:
+                conn.execute(self.drop_table_query)
+                conn.execute(
+                    "INSERT OR REPLACE INTO schema_version VALUES (:version, :description)",
+                    {"version": self.schema_version, "description": self.schema_version_description},
+                )
+            conn.execute(self.create_table_query)
+            conn.execute(self.create_index_query)
+
    def clear(self):
        self.close()
        if self.dbname != ":memory:":
@ -125,11 +155,28 @@ class SqliteCache:
            raise ValueError(path)

    def get_multiple(self, rowids):
+        """Return a generator of ``(rowid, [blocks, blocks2, ..., blocks8])`` for ``rowids``.
+
+        Each blocks column is decoded with ``bytes_to_colors``. Ids with no matching row simply
+        produce no item. The ids are interpolated into the SQL text through ``str()``.
+        """
-        sql = "select rowid, blocks from pictures where rowid in (%s)" % ",".join(
-            map(str, rowids)
+        ids = ",".join(map(str, rowids))
+        sql = (
+            "select rowid, blocks, blocks2, blocks3, blocks4, blocks5, blocks6, blocks7, blocks8 "
+            f"from pictures where rowid in ({ids})"
        )
        cur = self.con.execute(sql)
-        return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
+        return (
+            (
+                rowid,
+                [
+                    bytes_to_colors(blocks),
+                    bytes_to_colors(blocks2),
+                    bytes_to_colors(blocks3),
+                    bytes_to_colors(blocks4),
+                    bytes_to_colors(blocks5),
+                    bytes_to_colors(blocks6),
+                    bytes_to_colors(blocks7),
+                    bytes_to_colors(blocks8),
+                ],
+            )
+            for rowid, blocks, blocks2, blocks3, blocks4, blocks5, blocks6, blocks7, blocks8 in cur
+        )

    def purge_outdated(self):
        """Go through the cache and purge outdated records.
@ -138,17 +185,15 @@ class SqliteCache:
        the db.
        """
        todelete = []
-        sql = "select rowid, path, mtime from pictures"
+        sql = "select rowid, path, mtime_ns from pictures"
        cur = self.con.execute(sql)
-        for rowid, path_str, mtime in cur:
-            if mtime and op.exists(path_str):
+        for rowid, path_str, mtime_ns in cur:
+            if mtime_ns and op.exists(path_str):
                picture_mtime = os.stat(path_str).st_mtime
-                if int(picture_mtime) <= mtime:
+                if int(picture_mtime) <= mtime_ns:
                    # not outdated
                    continue
            todelete.append(rowid)
        if todelete:
-            sql = "delete from pictures where rowid in (%s)" % ",".join(
-                map(str, todelete)
-            )
+            sql = "delete from pictures where rowid in (%s)" % ",".join(map(str, todelete))
            self.con.execute(sql)
--- a/core/pe/exif.py
+++ b/core/pe/exif.py
@ -193,8 +193,8 @@ class TIFF_file:
        self.s2nfunc = s2n_intel if self.endian == INTEL_ENDIAN else s2n_motorola

    def s2n(self, offset, length, signed=0, debug=False):
-        slice = self.data[offset : offset + length]
-        val = self.s2nfunc(slice)
+        data_slice = self.data[offset : offset + length]
+        val = self.s2nfunc(data_slice)
        # Sign extension ?
        if signed:
            msb = 1 << (8 * length - 1)
@ -206,7 +206,7 @@ class TIFF_file:
                "Slice for offset %d length %d: %r and value: %d",
                offset,
                length,
-                slice,
+                data_slice,
                val,
            )
        return val
@ -236,10 +236,10 @@ class TIFF_file:
        for i in range(entries):
            entry = ifd + 2 + 12 * i
            tag = self.s2n(entry, 2)
-            type = self.s2n(entry + 2, 2)
-            if not 1 <= type <= 10:
+            entry_type = self.s2n(entry + 2, 2)
+            if not 1 <= entry_type <= 10:
                continue  # not handled
-            typelen = [1, 1, 2, 4, 8, 1, 1, 2, 4, 8][type - 1]
+            typelen = [1, 1, 2, 4, 8, 1, 1, 2, 4, 8][entry_type - 1]
            count = self.s2n(entry + 4, 4)
            if count > MAX_COUNT:
                logging.debug("Probably corrupt. Aborting.")
@ -247,25 +247,23 @@ class TIFF_file:
            offset = entry + 8
            if count * typelen > 4:
                offset = self.s2n(offset, 4)
-            if type == 2:
+            if entry_type == 2:
                # Special case: nul-terminated ASCII string
                values = str(self.data[offset : offset + count - 1], encoding="latin-1")
            else:
                values = []
-                signed = type == 6 or type >= 8
-                for j in range(count):
-                    if type in {5, 10}:
+                signed = entry_type == 6 or entry_type >= 8
+                for _ in range(count):
+                    if entry_type in {5, 10}:
                        # The type is either 5 or 10
-                        value_j = Fraction(
-                            self.s2n(offset, 4, signed), self.s2n(offset + 4, 4, signed)
-                        )
+                        value_j = Fraction(self.s2n(offset, 4, signed), self.s2n(offset + 4, 4, signed))
                    else:
                        # Not a fraction
                        value_j = self.s2n(offset, typelen, signed)
                    values.append(value_j)
                    offset = offset + typelen
            # Now "values" is either a string or an array
-            a.append((tag, type, values))
+            a.append((tag, entry_type, values))
        return a


@ -296,13 +294,11 @@ def get_fields(fp):
    logging.debug("Exif header length: %d bytes", length)
    data = fp.read(length - 8)
    data_format = data[0]
-    logging.debug(
-        "%s format", {INTEL_ENDIAN: "Intel", MOTOROLA_ENDIAN: "Motorola"}[data_format]
-    )
+    logging.debug("%s format", {INTEL_ENDIAN: "Intel", MOTOROLA_ENDIAN: "Motorola"}[data_format])
    T = TIFF_file(data)
    # There may be more than one IFD per file, but we only read the first one because others are
    # most likely thumbnails.
-    main_IFD_offset = T.first_IFD()
+    main_ifd_offset = T.first_IFD()
    result = {}

    def add_tag_to_result(tag, values):
@ -314,8 +310,8 @@ def get_fields(fp):
            return  # don't overwrite data
        result[stag] = values

-    logging.debug("IFD at offset %d", main_IFD_offset)
-    IFD = T.dump_IFD(main_IFD_offset)
+    logging.debug("IFD at offset %d", main_ifd_offset)
+    IFD = T.dump_IFD(main_ifd_offset)
    exif_off = gps_off = 0
    for tag, type, values in IFD:
        if tag == 0x8769:
--- a/core/pe/iphoto_plist.py
+++ b/core/pe/iphoto_plist.py
@ -1,33 +0,0 @@
-# Created By: Virgil Dupras
-# Created On: 2014-03-15
-# Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
-#
-# This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
-# which should be included with this package. The terms are also available at
-# http://www.gnu.org/licenses/gpl-3.0.html
-
-import plistlib
-
-
-class IPhotoPlistParser(plistlib._PlistParser):
-    """A parser for iPhoto plists.
-
-    iPhoto plists tend to be malformed, so we have to subclass the built-in parser to be a bit more
-    lenient.
-    """
-
-    def __init__(self):
-        plistlib._PlistParser.__init__(self, use_builtin_types=True, dict_type=dict)
-        # For debugging purposes, we remember the last bit of data to be analyzed so that we can
-        # log it in case of an exception
-        self.lastdata = ""
-
-    def get_data(self):
-        self.lastdata = plistlib._PlistParser.get_data(self)
-        return self.lastdata
-
-    def end_integer(self):
-        try:
-            self.add_object(int(self.get_data()))
-        except ValueError:
-            self.add_object(0)
--- a/core/pe/matchblock.py
+++ b/core/pe/matchblock.py
@ -15,7 +15,8 @@ from hscommon.trans import tr
 from hscommon.jobprogress import job

 from core.engine import Match
-from .block import avgdiff, DifferentBlockCountError, NoBlocksError
+from core.pe.block import avgdiff, DifferentBlockCountError, NoBlocksError
+from core.pe.cache_sqlite import SqliteCache

 # OPTIMIZATION NOTES:
 # The bottleneck of the matching phase is CPU, which is why we use multiprocessing. However, another
@ -27,7 +28,7 @@ from .block import avgdiff, DifferentBlockCountError, NoBlocksError
 # to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage
 # is that instead of reading blocks from disk number_of_files**2 times, we read it
 # number_of_files*number_of_chunks times.
-# Determining the right chunk size is tricky, bceause if it's too big, too many blocks will be in
+# Determining the right chunk size is tricky, because if it's too big, too many blocks will be in
 # memory at the same time and we might end up with memory trashing, which is awfully slow. So,
 # because our *real* bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't
 # starved by Disk IOs.
@ -50,17 +51,10 @@ except Exception:


 def get_cache(cache_path, readonly=False):
-    if cache_path.endswith("shelve"):
-        from .cache_shelve import ShelveCache
-
-        return ShelveCache(cache_path, readonly=readonly)
-    else:
-        from .cache_sqlite import SqliteCache
-
-        return SqliteCache(cache_path, readonly=readonly)
+    """Return the picture block cache stored at ``cache_path``.
+
+    The sqlite backend is always used; ``readonly`` is passed through to ``SqliteCache``.
+    """
+    return SqliteCache(cache_path, readonly=readonly)


-def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
+def prepare_pictures(pictures, cache_path, with_dimensions, match_rotated, j=job.nulljob):
    # The MemoryError handlers in there use logging without first caring about whether or not
    # there is enough memory left to carry on the operation because it is assumed that the
    # MemoryError happens when trying to read an image file, which is freed from memory by the
@ -78,16 +72,21 @@ def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
                # entry in iPhoto library.
                logging.warning("We have a picture with a null path here")
                continue
-            picture.unicode_path = str(picture.path)
            logging.debug("Analyzing picture at %s", picture.unicode_path)
            if with_dimensions:
                picture.dimensions  # pre-read dimensions
            try:
-                if picture.unicode_path not in cache:
-                    blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
+                if picture.unicode_path not in cache or (
+                    match_rotated and any(block == [] for block in cache[picture.unicode_path])
+                ):
+                    if match_rotated:
+                        blocks = [picture.get_blocks(BLOCK_COUNT_PER_SIDE, orientation) for orientation in range(1, 9)]
+                    else:
+                        blocks = [[]] * 8
+                        blocks[max(picture.get_orientation() - 1, 0)] = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
                    cache[picture.unicode_path] = blocks
                prepared.append(picture)
-            except (IOError, ValueError) as e:
+            except (OSError, ValueError) as e:
                logging.warning(str(e))
            except MemoryError:
                logging.warning(
@ -95,9 +94,7 @@ def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):
                    picture.unicode_path,
                    picture.size,
                )
-                if (
-                    picture.size < 10 * 1024 * 1024
-                ):  # We're really running out of memory
+                if picture.size < 10 * 1024 * 1024:  # We're really running out of memory
                    raise
    except MemoryError:
        logging.warning("Ran out of memory while preparing pictures")
@ -106,9 +103,7 @@ def prepare_pictures(pictures, cache_path, with_dimensions, j=job.nulljob):


 def get_chunks(pictures):
-    min_chunk_count = (
-        multiprocessing.cpu_count() * 2
-    )  # have enough chunks to feed all subprocesses
+    min_chunk_count = multiprocessing.cpu_count() * 2  # have enough chunks to feed all subprocesses
    chunk_count = len(pictures) // DEFAULT_CHUNK_SIZE
    chunk_count = max(min_chunk_count, chunk_count)
    chunk_size = (len(pictures) // chunk_count) + 1
@ -129,13 +124,13 @@ def get_match(first, second, percentage):
    return Match(first, second, percentage)


-def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
+def async_compare(ref_ids, other_ids, dbname, threshold, picinfo, match_rotated=False):
    # The list of ids in ref_ids have to be compared to the list of ids in other_ids. other_ids
    # can be None. In this case, ref_ids has to be compared with itself
    # picinfo is a dictionary {pic_id: (dimensions, is_ref)}
    cache = get_cache(dbname, readonly=True)
    limit = 100 - threshold
-    ref_pairs = list(cache.get_multiple(ref_ids))
+    ref_pairs = list(cache.get_multiple(ref_ids))  # (rowid, [b, b2, ..., b8])
    if other_ids is not None:
        other_pairs = list(cache.get_multiple(other_ids))
        comparisons_to_do = [(r, o) for r in ref_pairs for o in other_pairs]
@ -148,22 +143,35 @@ def async_compare(ref_ids, other_ids, dbname, threshold, picinfo):
        if ref_is_ref and other_is_ref:
            continue
        if ref_dimensions != other_dimensions:
-            continue
-        try:
-            diff = avgdiff(ref_blocks, other_blocks, limit, MIN_ITERATIONS)
-            percentage = 100 - diff
-        except (DifferentBlockCountError, NoBlocksError):
-            percentage = 0
-        if percentage >= threshold:
-            results.append((ref_id, other_id, percentage))
+            if match_rotated:
+                rotated_ref_dimensions = (ref_dimensions[1], ref_dimensions[0])
+                if rotated_ref_dimensions != other_dimensions:
+                    continue
+            else:
+                continue
+
+        orientation_range = 1
+        if match_rotated:
+            orientation_range = 8
+
+        for orientation_ref in range(orientation_range):
+            try:
+                diff = avgdiff(ref_blocks[orientation_ref], other_blocks[0], limit, MIN_ITERATIONS)
+                percentage = 100 - diff
+            except (DifferentBlockCountError, NoBlocksError):
+                percentage = 0
+            if percentage >= threshold:
+                results.append((ref_id, other_id, percentage))
+                break
+
    cache.close()
    return results


-def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljob):
+def getmatches(pictures, cache_path, threshold, match_scaled=False, match_rotated=False, j=job.nulljob):
    def get_picinfo(p):
        if match_scaled:
-            return (None, p.is_ref)
+            return ((None, None), p.is_ref)
        else:
            return (p.dimensions, p.is_ref)

@ -185,9 +193,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
        j.set_progress(comparison_count, progress_msg)

    j = j.start_subjob([3, 7])
-    pictures = prepare_pictures(
-        pictures, cache_path, with_dimensions=not match_scaled, j=j
-    )
+    pictures = prepare_pictures(pictures, cache_path, not match_scaled, match_rotated, j=j)
    j = j.start_subjob([9, 1], tr("Preparing for matching"))
    cache = get_cache(cache_path)
    id2picture = {}
@ -217,7 +223,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
                picinfo.update({p.cache_id: get_picinfo(p) for p in other_chunk})
            else:
                other_ids = None
-            args = (ref_ids, other_ids, cache_path, threshold, picinfo)
+            args = (ref_ids, other_ids, cache_path, threshold, picinfo, match_rotated)
            async_results.append(pool.apply_async(async_compare, args))
            collect_results()
        collect_results(collect_all=True)
@ -231,12 +237,8 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
            chunks,
            pictures,
        )  # some wiggle room for the next statements
-        logging.warning(
-            "Ran out of memory when scanning! We had %d matches.", len(matches)
-        )
-        del matches[
-            -len(matches) // 3 :
-        ]  # some wiggle room to ensure we don't run out of memory again.
+        logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches))
+        del matches[-len(matches) // 3 :]  # some wiggle room to ensure we don't run out of memory again.
    pool.close()
    result = []
    myiter = j.iter_with_progress(
@ -248,7 +250,7 @@ def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljo
    for ref_id, other_id, percentage in myiter:
        ref = id2picture[ref_id]
        other = id2picture[other_id]
-        if percentage == 100 and ref.md5 != other.md5:
+        if percentage == 100 and ref.digest != other.digest:
            percentage = 99
        if percentage >= threshold:
            ref.dimensions  # pre-read dimensions for display in results
--- a/core/pe/modules/block.c
+++ b/core/pe/modules/block.c
@ -2,9 +2,9 @@
 * Created On: 2010-01-30
 * Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
 *
- * This software is licensed under the "BSD" License as described in the "LICENSE" file, 
- * which should be included with this package. The terms are also available at 
- * http://www.hardcoded.net/licenses/bsd_license
+ * This software is licensed under the "BSD" License as described in the
+ * "LICENSE" file, which should be included with this package. The terms are
+ * also available at http://www.hardcoded.net/licenses/bsd_license
 */

 #include "common.h"
@ -14,86 +14,84 @@ static PyObject *NoBlocksError;
 /* avgdiff/maxdiff has been called with 2 block lists of different size. */
 static PyObject *DifferentBlockCountError;

-/* Returns a 3 sized tuple containing the mean color of 'image'.    
+/* Returns a 3 sized tuple containing the mean color of 'image'.
 * image: a PIL image or crop.
 */
-static PyObject* getblock(PyObject *image)
-{
-    int i, totr, totg, totb;
-    Py_ssize_t pixel_count;
-    PyObject *ppixels;
-    
-    totr = totg = totb = 0;
-    ppixels = PyObject_CallMethod(image, "getdata", NULL);
-    if (ppixels == NULL) {
-        return NULL;
-    }
-    
-    pixel_count = PySequence_Length(ppixels);
-    for (i=0; i<pixel_count; i++) {
-        PyObject *ppixel, *pr, *pg, *pb;
-        int r, g, b;
-        
-        ppixel = PySequence_ITEM(ppixels, i);
-        pr = PySequence_ITEM(ppixel, 0);
-        pg = PySequence_ITEM(ppixel, 1);
-        pb = PySequence_ITEM(ppixel, 2);
-        Py_DECREF(ppixel);
-        r = PyLong_AsLong(pr);
-        g = PyLong_AsLong(pg);
-        b = PyLong_AsLong(pb);
-        Py_DECREF(pr);
-        Py_DECREF(pg);
-        Py_DECREF(pb);
-        
-        totr += r;
-        totg += g;
-        totb += b;
-    }
-    
-    Py_DECREF(ppixels);
-    
-    if (pixel_count) {
-        totr /= pixel_count;
-        totg /= pixel_count;
-        totb /= pixel_count;
-    }
-    
-    return inttuple(3, totr, totg, totb);
+static PyObject *getblock(PyObject *image) {
+  int i, totr, totg, totb;
+  Py_ssize_t pixel_count;
+  PyObject *ppixels;
+
+  totr = totg = totb = 0;
+  ppixels = PyObject_CallMethod(image, "getdata", NULL);
+  if (ppixels == NULL) {
+    return NULL;
+  }
+
+  pixel_count = PySequence_Length(ppixels);
+  for (i = 0; i < pixel_count; i++) {
+    PyObject *ppixel, *pr, *pg, *pb;
+    int r, g, b;
+
+    ppixel = PySequence_ITEM(ppixels, i);
+    pr = PySequence_ITEM(ppixel, 0);
+    pg = PySequence_ITEM(ppixel, 1);
+    pb = PySequence_ITEM(ppixel, 2);
+    Py_DECREF(ppixel);
+    r = PyLong_AsLong(pr);
+    g = PyLong_AsLong(pg);
+    b = PyLong_AsLong(pb);
+    Py_DECREF(pr);
+    Py_DECREF(pg);
+    Py_DECREF(pb);
+
+    totr += r;
+    totg += g;
+    totb += b;
+  }
+
+  Py_DECREF(ppixels);
+
+  if (pixel_count) {
+    totr /= pixel_count;
+    totg /= pixel_count;
+    totb /= pixel_count;
+  }
+
+  return inttuple(3, totr, totg, totb);
 }

 /* Returns the difference between the first block and the second.
 * It returns an absolute sum of the 3 differences (RGB).
 */
-static int diff(PyObject *first, PyObject *second)
-{
-    int r1, g1, b1, r2, b2, g2;
-    PyObject *pr, *pg, *pb;
-    pr = PySequence_ITEM(first, 0);
-    pg = PySequence_ITEM(first, 1);
-    pb = PySequence_ITEM(first, 2);
-    r1 = PyLong_AsLong(pr);
-    g1 = PyLong_AsLong(pg);
-    b1 = PyLong_AsLong(pb);
-    Py_DECREF(pr);
-    Py_DECREF(pg);
-    Py_DECREF(pb);
-    
-    pr = PySequence_ITEM(second, 0);
-    pg = PySequence_ITEM(second, 1);
-    pb = PySequence_ITEM(second, 2);
-    r2 = PyLong_AsLong(pr);
-    g2 = PyLong_AsLong(pg);
-    b2 = PyLong_AsLong(pb);
-    Py_DECREF(pr);
-    Py_DECREF(pg);
-    Py_DECREF(pb);
-    
-    return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2);
+static int diff(PyObject *first, PyObject *second) {
+  int r1, g1, b1, r2, b2, g2;
+  PyObject *pr, *pg, *pb;
+  pr = PySequence_ITEM(first, 0);
+  pg = PySequence_ITEM(first, 1);
+  pb = PySequence_ITEM(first, 2);
+  r1 = PyLong_AsLong(pr);
+  g1 = PyLong_AsLong(pg);
+  b1 = PyLong_AsLong(pb);
+  Py_DECREF(pr);
+  Py_DECREF(pg);
+  Py_DECREF(pb);
+
+  pr = PySequence_ITEM(second, 0);
+  pg = PySequence_ITEM(second, 1);
+  pb = PySequence_ITEM(second, 2);
+  r2 = PyLong_AsLong(pr);
+  g2 = PyLong_AsLong(pg);
+  b2 = PyLong_AsLong(pb);
+  Py_DECREF(pr);
+  Py_DECREF(pg);
+  Py_DECREF(pb);
+
+  return abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2);
 }

 PyDoc_STRVAR(block_getblocks2_doc,
-"Returns a list of blocks (3 sized tuples).\n\
+             "Returns a list of blocks (3 sized tuples).\n\
 \n\
 image: A PIL image to base the blocks on.\n\
 block_count_per_side: This integer determine the number of blocks the function will return.\n\
@ -101,153 +99,150 @@ If it is 10, for example, 100 blocks will be returns (10 width, 10 height). The
 necessarely cover square areas. The area covered by each block will be proportional to the image\n\
 itself.\n");

-static PyObject* block_getblocks2(PyObject *self, PyObject *args)
-{
-    int block_count_per_side, width, height, block_width, block_height, ih;
-    PyObject *image;
-    PyObject *pimage_size, *pwidth, *pheight;
-    PyObject *result;
-    
-    if (!PyArg_ParseTuple(args, "Oi", &image, &block_count_per_side)) {
+static PyObject *block_getblocks2(PyObject *self, PyObject *args) {
+  int block_count_per_side, width, height, block_width, block_height, ih;
+  PyObject *image;
+  PyObject *pimage_size, *pwidth, *pheight;
+  PyObject *result;
+
+  if (!PyArg_ParseTuple(args, "Oi", &image, &block_count_per_side)) {
+    return NULL;
+  }
+
+  pimage_size = PyObject_GetAttrString(image, "size");
+  pwidth = PySequence_ITEM(pimage_size, 0);
+  pheight = PySequence_ITEM(pimage_size, 1);
+  width = PyLong_AsLong(pwidth);
+  height = PyLong_AsLong(pheight);
+  Py_DECREF(pimage_size);
+  Py_DECREF(pwidth);
+  Py_DECREF(pheight);
+
+  if (!(width && height)) {
+    return PyList_New(0);
+  }
+
+  block_width = max(width / block_count_per_side, 1);
+  block_height = max(height / block_count_per_side, 1);
+
+  result = PyList_New((Py_ssize_t)block_count_per_side * block_count_per_side);
+  if (result == NULL) {
+    return NULL;
+  }
+
+  for (ih = 0; ih < block_count_per_side; ih++) {
+    int top, bottom, iw;
+    top = min(ih * block_height, height - block_height);
+    bottom = top + block_height;
+    for (iw = 0; iw < block_count_per_side; iw++) {
+      int left, right;
+      PyObject *pbox;
+      PyObject *pmethodname;
+      PyObject *pcrop;
+      PyObject *pblock;
+
+      left = min(iw * block_width, width - block_width);
+      right = left + block_width;
+      pbox = inttuple(4, left, top, right, bottom);
+      pmethodname = PyUnicode_FromString("crop");
+      pcrop = PyObject_CallMethodObjArgs(image, pmethodname, pbox, NULL);
+      Py_DECREF(pmethodname);
+      Py_DECREF(pbox);
+      if (pcrop == NULL) {
+        Py_DECREF(result);
        return NULL;
-    }
-    
-    pimage_size = PyObject_GetAttrString(image, "size");
-    pwidth = PySequence_ITEM(pimage_size, 0);
-    pheight = PySequence_ITEM(pimage_size, 1);
-    width = PyLong_AsLong(pwidth);
-    height = PyLong_AsLong(pheight);
-    Py_DECREF(pimage_size);
-    Py_DECREF(pwidth);
-    Py_DECREF(pheight);
-    
-    if (!(width && height)) {
-        return PyList_New(0);
-    }
-    
-    block_width = max(width / block_count_per_side, 1);
-    block_height = max(height / block_count_per_side, 1);
-    
-    result = PyList_New(block_count_per_side * block_count_per_side);
-    if (result == NULL) {
+      }
+      pblock = getblock(pcrop);
+      Py_DECREF(pcrop);
+      if (pblock == NULL) {
+        Py_DECREF(result);
        return NULL;
+      }
+      PyList_SET_ITEM(result, ih * block_count_per_side + iw, pblock);
    }
-    
-    for (ih=0; ih<block_count_per_side; ih++) {
-        int top, bottom, iw;
-        top = min(ih*block_height, height-block_height);
-        bottom = top + block_height;
-        for (iw=0; iw<block_count_per_side; iw++) {
-            int left, right;
-            PyObject *pbox;
-            PyObject *pmethodname;
-            PyObject *pcrop;
-            PyObject *pblock;
-            
-            left = min(iw*block_width, width-block_width);
-            right = left + block_width;
-            pbox = inttuple(4, left, top, right, bottom);
-            pmethodname = PyUnicode_FromString("crop");
-            pcrop = PyObject_CallMethodObjArgs(image, pmethodname, pbox, NULL);
-            Py_DECREF(pmethodname);
-            Py_DECREF(pbox);
-            if (pcrop == NULL) {
-                Py_DECREF(result);
-                return NULL;
-            }
-            pblock = getblock(pcrop);
-            Py_DECREF(pcrop);
-            if (pblock == NULL) {
-                Py_DECREF(result);
-                return NULL;
-            }
-            PyList_SET_ITEM(result, ih*block_count_per_side+iw, pblock);
-        }
-    }
-    
-    return result;
+  }
+
+  return result;
 }

 PyDoc_STRVAR(block_avgdiff_doc,
-"Returns the average diff between first blocks and seconds.\n\
+             "Returns the average diff between first blocks and seconds.\n\
 \n\
 If the result surpasses limit, limit + 1 is returned, except if less than min_iterations\n\
 iterations have been made in the blocks.\n");

-static PyObject* block_avgdiff(PyObject *self, PyObject *args)
-{
-    PyObject *first, *second;
-    int limit, min_iterations;
-    Py_ssize_t count;
-    int sum, i, result;
-    
-    if (!PyArg_ParseTuple(args, "OOii", &first, &second, &limit, &min_iterations)) {
-        return NULL;
+static PyObject *block_avgdiff(PyObject *self, PyObject *args) {
+  PyObject *first, *second;
+  int limit, min_iterations;
+  Py_ssize_t count;
+  int sum, i, result;
+
+  if (!PyArg_ParseTuple(args, "OOii", &first, &second, &limit,
+                        &min_iterations)) {
+    return NULL;
+  }
+
+  count = PySequence_Length(first);
+  if (count != PySequence_Length(second)) {
+    PyErr_SetString(DifferentBlockCountError, "");
+    return NULL;
+  }
+  if (!count) {
+    PyErr_SetString(NoBlocksError, "");
+    return NULL;
+  }
+
+  sum = 0;
+  for (i = 0; i < count; i++) {
+    int iteration_count;
+    PyObject *item1, *item2;
+
+    iteration_count = i + 1;
+    item1 = PySequence_ITEM(first, i);
+    item2 = PySequence_ITEM(second, i);
+    sum += diff(item1, item2);
+    Py_DECREF(item1);
+    Py_DECREF(item2);
+    if ((sum > limit * iteration_count) &&
+        (iteration_count >= min_iterations)) {
+      return PyLong_FromLong(limit + 1);
    }
-    
-    count = PySequence_Length(first);
-    if (count != PySequence_Length(second)) {
-        PyErr_SetString(DifferentBlockCountError, "");
-        return NULL;
-    }
-    if (!count) {
-        PyErr_SetString(NoBlocksError, "");
-        return NULL;
-    }
-    
-    sum = 0;
-    for (i=0; i<count; i++) {
-        int iteration_count;
-        PyObject *item1, *item2;
-        
-        iteration_count = i + 1;
-        item1 = PySequence_ITEM(first, i);
-        item2 = PySequence_ITEM(second, i);
-        sum += diff(item1, item2);
-        Py_DECREF(item1);
-        Py_DECREF(item2);
-        if ((sum > limit*iteration_count) && (iteration_count >= min_iterations)) {
-            return PyLong_FromLong(limit + 1);
-        }
-    }
-    
-    result = sum / count;
-    if (!result && sum) {
-        result = 1;
-    }
-    return PyLong_FromLong(result);
+  }
+
+  result = sum / count;
+  if (!result && sum) {
+    result = 1;
+  }
+  return PyLong_FromLong(result);
 }

 static PyMethodDef BlockMethods[] = {
-    {"getblocks2",  block_getblocks2, METH_VARARGS, block_getblocks2_doc},
-    {"avgdiff",  block_avgdiff, METH_VARARGS, block_avgdiff_doc},
+    {"getblocks2", block_getblocks2, METH_VARARGS, block_getblocks2_doc},
+    {"avgdiff", block_avgdiff, METH_VARARGS, block_avgdiff_doc},
    {NULL, NULL, 0, NULL} /* Sentinel */
 };

-static struct PyModuleDef BlockDef = {
-    PyModuleDef_HEAD_INIT,
-    "_block",
-    NULL,
-    -1,
-    BlockMethods,
-    NULL,
-    NULL,
-    NULL,
-    NULL
-};
+static struct PyModuleDef BlockDef = {PyModuleDef_HEAD_INIT,
+                                      "_block",
+                                      NULL,
+                                      -1,
+                                      BlockMethods,
+                                      NULL,
+                                      NULL,
+                                      NULL,
+                                      NULL};

-PyObject *
-PyInit__block(void)
-{
-    PyObject *m = PyModule_Create(&BlockDef);
-    if (m == NULL) {
-        return NULL;
-    }
-    
-    NoBlocksError = PyErr_NewException("_block.NoBlocksError", NULL, NULL);
-    PyModule_AddObject(m, "NoBlocksError", NoBlocksError);
-    DifferentBlockCountError = PyErr_NewException("_block.DifferentBlockCountError", NULL, NULL);
-    PyModule_AddObject(m, "DifferentBlockCountError", DifferentBlockCountError);
+PyObject *PyInit__block(void) {
+  PyObject *m = PyModule_Create(&BlockDef);
+  if (m == NULL) {
+    return NULL;
+  }

-    return m;
-}
+  NoBlocksError = PyErr_NewException("_block.NoBlocksError", NULL, NULL);
+  PyModule_AddObject(m, "NoBlocksError", NoBlocksError);
+  DifferentBlockCountError =
+      PyErr_NewException("_block.DifferentBlockCountError", NULL, NULL);
+  PyModule_AddObject(m, "DifferentBlockCountError", DifferentBlockCountError);
+
+  return m;
+}
--- a/core/pe/modules/block_osx.m
+++ b/core/pe/modules/block_osx.m
@ -2,14 +2,16 @@
 * Created On: 2010-02-04
 * Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
 *
- * This software is licensed under the "GPLv3" License as described in the "LICENSE" file, 
- * which should be included with this package. The terms are also available at 
+ * This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
+ * which should be included with this package. The terms are also available at
 * http://www.gnu.org/licenses/gpl-3.0.html
 **/

 #include "common.h"

 #import <Foundation/Foundation.h>
+#import <CoreGraphics/CoreGraphics.h>
+#import <ImageIO/ImageIO.h>

 #define RADIANS( degrees ) ( degrees * M_PI / 180 )

@ -20,7 +22,7 @@ pystring2cfstring(PyObject *pystring)
    UInt8 *s;
    CFIndex size;
    CFStringRef result;
-    
+
    if (PyUnicode_Check(pystring)) {
        encoded = PyUnicode_AsUTF8String(pystring);
        if (encoded == NULL) {
@ -30,7 +32,7 @@ pystring2cfstring(PyObject *pystring)
        encoded = pystring;
        Py_INCREF(encoded);
    }
-    
+
    s = (UInt8*)PyBytes_AS_STRING(encoded);
    size = PyBytes_GET_SIZE(encoded);
    result = CFStringCreateWithBytes(NULL, s, size, kCFStringEncodingUTF8, FALSE);
@ -48,20 +50,20 @@ static PyObject* block_osx_get_image_size(PyObject *self, PyObject *args)
    long width, height;
    PyObject *pwidth, *pheight;
    PyObject *result;
-    
+
    width = 0;
    height = 0;
    if (!PyArg_ParseTuple(args, "O", &path)) {
        return NULL;
    }
-    
+
    image_path = pystring2cfstring(path);
    if (image_path == NULL) {
        return PyErr_NoMemory();
    }
    image_url = CFURLCreateWithFileSystemPath(NULL, image_path, kCFURLPOSIXPathStyle, FALSE);
    CFRelease(image_path);
-    
+
    source = CGImageSourceCreateWithURL(image_url, NULL);
    CFRelease(image_url);
    if (source != NULL) {
@ -73,7 +75,7 @@ static PyObject* block_osx_get_image_size(PyObject *self, PyObject *args)
        }
        CFRelease(source);
    }
-    
+
    pwidth = PyLong_FromLong(width);
    if (pwidth == NULL) {
        return NULL;
@ -89,19 +91,19 @@ static PyObject* block_osx_get_image_size(PyObject *self, PyObject *args)
 }

 static CGContextRef
-MyCreateBitmapContext(int width, int height) 
+MyCreateBitmapContext(int width, int height)
 {
    CGContextRef context = NULL;
    CGColorSpaceRef colorSpace;
    void *bitmapData;
    int bitmapByteCount;
    int bitmapBytesPerRow;
-    
+
    bitmapBytesPerRow = (width * 4);
    bitmapByteCount = (bitmapBytesPerRow * height);
-    
+
    colorSpace = CGColorSpaceCreateWithName(kCGColorSpaceGenericRGB);
-    
+
    // calloc() must be used to allocate bitmapData here because the buffer has to be zeroed.
    // If it's not zeroes, when images with transparency are drawn in the context, this buffer
    // will stay with undefined pixels, which means that two pictures with the same pixels will
@ -111,7 +113,7 @@ MyCreateBitmapContext(int width, int height)
        fprintf(stderr, "Memory not allocated!");
        return NULL;
    }
-    
+
    context = CGBitmapContextCreate(bitmapData, width, height, 8, bitmapBytesPerRow, colorSpace,
        (CGBitmapInfo)kCGImageAlphaNoneSkipLast);
    if (context== NULL) {
@ -126,7 +128,7 @@ MyCreateBitmapContext(int width, int height)
 static PyObject* getblock(unsigned char *imageData, int imageWidth, int imageHeight, int boxX, int boxY, int boxW, int boxH)
 {
    int i,j, totalR, totalG, totalB;
-    
+
    totalR = totalG = totalB = 0;
    for(i=boxY; i<boxY+boxH; i++) {
        for(j=boxX; j<boxX+boxW; j++) {
@ -140,7 +142,7 @@ static PyObject* getblock(unsigned char *imageData, int imageWidth, int imageHei
    totalR /= pixelCount;
    totalG /= pixelCount;
    totalB /= pixelCount;
-    
+
    return inttuple(3, totalR, totalG, totalB);
 }

@ -153,27 +155,27 @@ static PyObject* block_osx_getblocks(PyObject *self, PyObject *args)
    CGImageRef image;
    size_t width, height, image_width, image_height;
    int block_count, block_width, block_height, orientation, i;
-    
+
    if (!PyArg_ParseTuple(args, "Oii", &path, &block_count, &orientation)) {
        return NULL;
    }
-    
+
    if (PySequence_Length(path) == 0) {
        PyErr_SetString(PyExc_ValueError, "empty path");
        return NULL;
    }
-    
+
    if ((orientation > 8) || (orientation < 0)) {
        orientation = 0; // simplifies checks later since we can only have values in 0-8
    }
-    
+
    image_path = pystring2cfstring(path);
    if (image_path == NULL) {
        return PyErr_NoMemory();
    }
    image_url = CFURLCreateWithFileSystemPath(NULL, image_path, kCFURLPOSIXPathStyle, FALSE);
    CFRelease(image_path);
-    
+
    source = CGImageSourceCreateWithURL(image_url, NULL);
    CFRelease(image_url);
    if (source == NULL) {
@ -185,8 +187,8 @@ static PyObject* block_osx_getblocks(PyObject *self, PyObject *args)
        CFRelease(source);
        return PyErr_NoMemory();
    }
-    
-    
+
+
    width = image_width = CGImageGetWidth(image);
    height = image_height = CGImageGetHeight(image);
    if (orientation >= 5) {
@ -194,9 +196,9 @@ static PyObject* block_osx_getblocks(PyObject *self, PyObject *args)
        width = image_height;
        height = image_width;
    }
-    
+
    CGContextRef context = MyCreateBitmapContext(width, height);
-    
+
    if (orientation == 2) {
        // Flip X
        CGContextTranslateCTM(context, width, 0);
@ -205,7 +207,7 @@ static PyObject* block_osx_getblocks(PyObject *self, PyObject *args)
    else if (orientation == 3) {
        // Rot 180
        CGContextTranslateCTM(context, width, height);
-        CGContextRotateCTM(context, RADIANS(180)); 
+        CGContextRotateCTM(context, RADIANS(180));
    }
    else if (orientation == 4) {
        // Flip Y
@ -240,21 +242,21 @@ static PyObject* block_osx_getblocks(PyObject *self, PyObject *args)
    CGContextDrawImage(context, myBoundingBox, image);
    unsigned char *bitmapData = CGBitmapContextGetData(context);
    CGContextRelease(context);
-    
+
    CGImageRelease(image);
    CFRelease(source);
    if (bitmapData == NULL) {
        return PyErr_NoMemory();
    }
-    
+
    block_width = max(width/block_count, 1);
    block_height = max(height/block_count, 1);
-    
+
    result = PyList_New(block_count * block_count);
    if (result == NULL) {
        return NULL;
    }
-    
+
    for(i=0; i<block_count; i++) {
        int j, top;
        top = min(i*block_height, height-block_height);
@ -269,8 +271,8 @@ static PyObject* block_osx_getblocks(PyObject *self, PyObject *args)
            PyList_SET_ITEM(result, i*block_count+j, block);
        }
    }
-    
-    free(bitmapData); 
+
+    free(bitmapData);
    return result;
 }

@ -300,4 +302,4 @@ PyInit__block_osx(void)
        return NULL;
    }
    return m;
-}
+}
--- a/core/pe/modules/cache.c
+++ b/core/pe/modules/cache.c
@ -2,94 +2,68 @@
 * Created On: 2010-01-30
 * Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
 *
- * This software is licensed under the "BSD" License as described in the "LICENSE" file, 
- * which should be included with this package. The terms are also available at 
- * http://www.hardcoded.net/licenses/bsd_license
+ * This software is licensed under the "BSD" License as described in the
+ * "LICENSE" file, which should be included with this package. The terms are
+ * also available at http://www.hardcoded.net/licenses/bsd_license
 */

 #include "common.h"

-/* I know that there strtol out there, but it requires a pointer to
- * a char, which would in turn require me to buffer my chars around,
- * making the whole process slower.
- */
-static long
-xchar_to_long(char c)
-{
-    if ((c >= 48) && (c <= 57)) { /* 0-9 */
-        return c - 48;
-    }
-    else if ((c >= 65) && (c <= 70)) { /* A-F */
-        return c - 55;
-    }
-    else if ((c >= 97) && (c <= 102)) { /* a-f */
-        return c - 87;
-    }
-    return 0;
-}
+static PyObject *cache_bytes_to_colors(PyObject *self, PyObject *args) {
+  char *y;
+  Py_ssize_t char_count, i, color_count;
+  PyObject *result;
+  unsigned long r, g, b;
+  Py_ssize_t ci;
+  PyObject *color_tuple;

-static PyObject*
-cache_string_to_colors(PyObject *self, PyObject *args)
-{
-    char *s;
-    Py_ssize_t char_count, color_count, i;
-    PyObject *result;
-    
-    if (!PyArg_ParseTuple(args, "s#", &s, &char_count)) {
-        return NULL;
+  if (!PyArg_ParseTuple(args, "y#", &y, &char_count)) {
+    return NULL;
+  }
+
+  color_count = char_count / 3;
+  result = PyList_New(color_count);
+  if (result == NULL) {
+    return NULL;
+  }
+
+  for (i = 0; i < color_count; i++) {
+    ci = i * 3;
+    r = (unsigned char)y[ci];
+    g = (unsigned char)y[ci + 1];
+    b = (unsigned char)y[ci + 2];
+
+    color_tuple = inttuple(3, r, g, b);
+    if (color_tuple == NULL) {
+      Py_DECREF(result);
+      return NULL;
    }
-    
-    color_count = (char_count / 6);
-    result = PyList_New(color_count);
-    if (result == NULL) {
-        return NULL;
-    }
-    
-    for (i=0; i<color_count; i++) {
-        long r, g, b;
-        Py_ssize_t ci;
-        PyObject *color_tuple;
-        
-        ci = i * 6;
-        r = (xchar_to_long(s[ci]) << 4) + xchar_to_long(s[ci+1]);
-        g = (xchar_to_long(s[ci+2]) << 4) + xchar_to_long(s[ci+3]);
-        b = (xchar_to_long(s[ci+4]) << 4) + xchar_to_long(s[ci+5]);
-        
-        color_tuple = inttuple(3, r, g, b);
-        if (color_tuple == NULL) {
-            Py_DECREF(result);
-            return NULL;
-        }
-        PyList_SET_ITEM(result, i, color_tuple);
-    }
-    
-    return result;
+    PyList_SET_ITEM(result, i, color_tuple);
+  }
+
+  return result;
 }

 static PyMethodDef CacheMethods[] = {
-    {"string_to_colors",  cache_string_to_colors, METH_VARARGS,
-     "Transform the string 's' in a list of 3 sized tuples."},
-    {NULL, NULL, 0, NULL}        /* Sentinel */
+    {"bytes_to_colors", cache_bytes_to_colors, METH_VARARGS,
+     "Transform the bytes 's' into a list of 3 sized tuples."},
+    {NULL, NULL, 0, NULL} /* Sentinel */
 };

-static struct PyModuleDef CacheDef = {
-    PyModuleDef_HEAD_INIT,
-    "_cache",
-    NULL,
-    -1,
-    CacheMethods,
-    NULL,
-    NULL,
-    NULL,
-    NULL
-};
+static struct PyModuleDef CacheDef = {PyModuleDef_HEAD_INIT,
+                                      "_cache",
+                                      NULL,
+                                      -1,
+                                      CacheMethods,
+                                      NULL,
+                                      NULL,
+                                      NULL,
+                                      NULL};

-PyObject *
-PyInit__cache(void)
-{
-    PyObject *m = PyModule_Create(&CacheDef);
-    if (m == NULL) {
-        return NULL;
-    }
-    return m;
-}
+PyObject *PyInit__cache(void) {
+  PyObject *m = PyModule_Create(&CacheDef);
+  if (m == NULL) {
+    return NULL;
+  }
+  return m;
+}
--- a/core/pe/modules/common.c
+++ b/core/pe/modules/common.c
@ -2,8 +2,8 @@
 * Created On: 2010-02-04
 * Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
 *
- * This software is licensed under the "BSD" License as described in the "LICENSE" file, 
- * which should be included with this package. The terms are also available at 
+ * This software is licensed under the "BSD" License as described in the "LICENSE" file,
+ * which should be included with this package. The terms are also available at
 * http://www.hardcoded.net/licenses/bsd_license
 */

@ -27,19 +27,19 @@ PyObject* inttuple(int n, ...)
    PyObject *pnumber;
    PyObject *result;
    va_list numbers;
-    
+
    va_start(numbers, n);
    result = PyTuple_New(n);
-    
+
    for (i=0; i<n; i++) {
-        pnumber = PyLong_FromLong(va_arg(numbers, long));
+        pnumber = PyLong_FromUnsignedLong(va_arg(numbers, long));
        if (pnumber == NULL) {
            Py_DECREF(result);
            return NULL;
        }
        PyTuple_SET_ITEM(result, i, pnumber);
    }
-    
+
    va_end(numbers);
    return result;
 }
--- a/core/pe/modules/common.h
+++ b/core/pe/modules/common.h
@ -2,8 +2,8 @@
 * Created On: 2010-02-04
 * Copyright 2015 Hardcoded Software (http://www.hardcoded.net)
 *
- * This software is licensed under the "GPLv3" License as described in the "LICENSE" file, 
- * which should be included with this package. The terms are also available at 
+ * This software is licensed under the "GPLv3" License as described in the "LICENSE" file,
+ * which should be included with this package. The terms are also available at
 * http://www.gnu.org/licenses/gpl-3.0.html
 */

@ -17,4 +17,4 @@ int min(int a, int b);
 #endif

 /* Create a tuple out of an array of integers. */
-PyObject* inttuple(int n, ...);
+PyObject* inttuple(int n, ...);
--- a/core/pe/photo.py
+++ b/core/pe/photo.py
@ -9,7 +9,7 @@ from hscommon.util import get_file_ext, format_size

 from core.util import format_timestamp, format_perc, format_dupe_count
 from core import fs
-from . import exif
+from core.pe import exif

 # This global value is set by the platform-specific subclasser of the Photo base class
 PLAT_SPECIFIC_PHOTO_CLASS = None
@ -29,7 +29,7 @@ class Photo(fs.File):
    __slots__ = fs.File.__slots__ + tuple(INITIAL_INFO.keys())

    # These extensions are supported on all platforms
-    HANDLED_EXTS = {"png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif"}
+    HANDLED_EXTS = {"png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif", "webp"}

    def _plat_get_dimensions(self):
        raise NotImplementedError()
@ -37,7 +37,7 @@ class Photo(fs.File):
    def _plat_get_blocks(self, block_count_per_side, orientation):
        raise NotImplementedError()

-    def _get_orientation(self):
+    def get_orientation(self):
        if not hasattr(self, "_cached_orientation"):
            try:
                with self.path.open("rb") as fp:
@ -95,10 +95,13 @@ class Photo(fs.File):
        fs.File._read_info(self, field)
        if field == "dimensions":
            self.dimensions = self._plat_get_dimensions()
-            if self._get_orientation() in {5, 6, 7, 8}:
+            if self.get_orientation() in {5, 6, 7, 8}:
                self.dimensions = (self.dimensions[1], self.dimensions[0])
        elif field == "exif_timestamp":
            self.exif_timestamp = self._get_exif_timestamp()

-    def get_blocks(self, block_count_per_side):
-        return self._plat_get_blocks(block_count_per_side, self._get_orientation())
+    def get_blocks(self, block_count_per_side, orientation: int = None):
+        """Return ``_plat_get_blocks()`` output, using ``get_orientation()`` when no orientation is given."""
+        if orientation is None:
+            orientation = self.get_orientation()
+        return self._plat_get_blocks(block_count_per_side, orientation)
--- a/core/pe/scanner.py
+++ b/core/pe/scanner.py
@ -8,30 +8,32 @@ from hscommon.trans import tr

 from core.scanner import Scanner, ScanType, ScanOption

-from . import matchblock, matchexif
+from core.pe import matchblock, matchexif


 class ScannerPE(Scanner):
    cache_path = None
    match_scaled = False
+    match_rotated = False

    @staticmethod
    def get_scan_options():
        return [
-            ScanOption(ScanType.FuzzyBlock, tr("Contents")),
-            ScanOption(ScanType.ExifTimestamp, tr("EXIF Timestamp")),
+            ScanOption(ScanType.FUZZYBLOCK, tr("Contents")),
+            ScanOption(ScanType.EXIFTIMESTAMP, tr("EXIF Timestamp")),
        ]

    def _getmatches(self, files, j):
-        if self.scan_type == ScanType.FuzzyBlock:
+        if self.scan_type == ScanType.FUZZYBLOCK:
            return matchblock.getmatches(
                files,
                cache_path=self.cache_path,
                threshold=self.min_match_percentage,
                match_scaled=self.match_scaled,
+                match_rotated=self.match_rotated,
                j=j,
            )
-        elif self.scan_type == ScanType.ExifTimestamp:
+        elif self.scan_type == ScanType.EXIFTIMESTAMP:
            return matchexif.getmatches(files, self.match_scaled, j)
        else:
-            raise Exception("Invalid scan type")
+            raise ValueError("Invalid scan type")
--- a/core/prioritize.py
+++ b/core/prioritize.py
@ -43,7 +43,7 @@ class Criterion:

    @property
    def display(self):
-        return "{} ({})".format(self.category.NAME, self.display_value)
+        return f"{self.category.NAME} ({self.display_value})"


 class ValueListCategory(CriterionCategory):
@ -82,10 +82,12 @@ class FolderCategory(ValueListCategory):

    def sort_key(self, dupe, crit_value):
        value = self.extract_value(dupe)
-        if value[: len(crit_value)] == crit_value:
-            return 0
-        else:
+        # This is instead of using is_relative_to() which was added in py 3.9
+        try:
+            value.relative_to(crit_value)
+        except ValueError:
            return 1
+        return 0


 class FilenameCategory(CriterionCategory):
--- a/core/results.py
+++ b/core/results.py
@ -10,6 +10,7 @@ import logging
 import re
 import os
 import os.path as op
+from errno import EISDIR, EACCES
 from xml.etree import ElementTree as ET

 from hscommon.jobprogress.job import nulljob
@ -17,8 +18,8 @@ from hscommon.conflict import get_conflicted_name
 from hscommon.util import flatten, nonone, FileOrPath, format_size
 from hscommon.trans import tr

-from . import engine
-from .markable import Markable
+from core import engine
+from core.markable import Markable


 class Results(Markable):
@ -52,6 +53,7 @@ class Results(Markable):
        self.app = app
        self.problems = []  # (dupe, error_msg)
        self.is_modified = False
+        self.refresh_required = False

    def _did_mark(self, dupe):
        self.__marked_size += dupe.size
@ -94,8 +96,9 @@ class Results(Markable):

    # ---Private
    def __get_dupe_list(self):
-        if self.__dupes is None:
+        if self.__dupes is None or self.refresh_required:
            self.__dupes = flatten(group.dupes for group in self.groups)
+            self.refresh_required = False
            if None in self.__dupes:
                # This is debug logging to try to figure out #44
                logging.warning(
@ -104,9 +107,7 @@ class Results(Markable):
                    self.groups,
                )
            if self.__filtered_dupes:
-                self.__dupes = [
-                    dupe for dupe in self.__dupes if dupe in self.__filtered_dupes
-                ]
+                self.__dupes = [dupe for dupe in self.__dupes if dupe in self.__filtered_dupes]
            sd = self.__dupes_sort_descriptor
            if sd:
                self.sort_dupes(sd[0], sd[1], sd[2])
@ -125,18 +126,10 @@ class Results(Markable):
            total_count = self.__total_count
            total_size = self.__total_size
        else:
-            mark_count = len(
-                [dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)]
-            )
-            marked_size = sum(
-                dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe)
-            )
-            total_count = len(
-                [dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)]
-            )
-            total_size = sum(
-                dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe)
-            )
+            mark_count = len([dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)])
+            marked_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe))
+            total_count = len([dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)])
+            total_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe))
        if self.mark_inverted:
            marked_size = self.__total_size - marked_size
        result = tr("%d / %d (%s / %s) duplicates marked.") % (
@ -199,11 +192,7 @@ class Results(Markable):
            self.__filters.append(filter_str)
            if self.__filtered_dupes is None:
                self.__filtered_dupes = flatten(g[:] for g in self.groups)
-            self.__filtered_dupes = set(
-                dupe
-                for dupe in self.__filtered_dupes
-                if filter_re.search(str(dupe.path))
-            )
+            self.__filtered_dupes = {dupe for dupe in self.__filtered_dupes if filter_re.search(str(dupe.path))}
            filtered_groups = set()
            for dupe in self.__filtered_dupes:
                filtered_groups.add(self.get_group_of_duplicate(dupe))
@ -215,8 +204,7 @@ class Results(Markable):
        self.__dupes = None

    def get_group_of_duplicate(self, dupe):
-        """Returns :class:`~core.engine.Group` in which ``dupe`` belongs.
-        """
+        """Returns :class:`~core.engine.Group` in which ``dupe`` belongs."""
        try:
            return self.__group_of_duplicate[dupe]
        except (TypeError, KeyError):
@ -282,8 +270,7 @@ class Results(Markable):
        self.is_modified = False

    def make_ref(self, dupe):
-        """Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group.
-        """
+        """Make ``dupe`` take the :attr:`~core.engine.Group.ref` position of its group."""
        g = self.get_group_of_duplicate(dupe)
        r = g.ref
        if not g.switch_ref(dupe):
@ -315,7 +302,7 @@ class Results(Markable):
            try:
                func(dupe)
                to_remove.append(dupe)
-            except (EnvironmentError, UnicodeEncodeError) as e:
+            except (OSError, UnicodeEncodeError) as e:
                self.problems.append((dupe, str(e)))
        if remove_from_results:
            self.remove_duplicates(to_remove)
@ -388,10 +375,10 @@ class Results(Markable):

        try:
            do_write(outfile)
-        except IOError as e:
-            # If our IOError is because dest is already a directory, we want to handle that. 21 is
-            # the code we get on OS X and Linux, 13 is what we get on Windows.
-            if e.errno in {21, 13}:
+        except OSError as e:
+            # If our OSError is because dest is already a directory, we want to handle that. 21 is
+            # the code we get on OS X and Linux (EISDIR), 13 is what we get on Windows (EACCES).
+            if e.errno in (EISDIR, EACCES):
                p = str(outfile)
                dirname, basename = op.split(p)
                otherfiles = os.listdir(dirname)
@ -410,10 +397,10 @@ class Results(Markable):
        """
        if not self.__dupes:
            self.__get_dupe_list()
-        keyfunc = lambda d: self.app._get_dupe_sort_key(
-            d, lambda: self.get_group_of_duplicate(d), key, delta
+        self.__dupes.sort(
+            key=lambda d: self.app._get_dupe_sort_key(d, lambda: self.get_group_of_duplicate(d), key, delta),
+            reverse=not asc,
        )
-        self.__dupes.sort(key=keyfunc, reverse=not asc)
        self.__dupes_sort_descriptor = (key, asc, delta)

    def sort_groups(self, key, asc=True):
@ -424,8 +411,7 @@ class Results(Markable):
        :param str key: key attribute name to sort with.
        :param bool asc: If false, sorting is reversed.
        """
-        keyfunc = lambda g: self.app._get_group_sort_key(g, key)
-        self.groups.sort(key=keyfunc, reverse=not asc)
+        self.groups.sort(key=lambda g: self.app._get_group_sort_key(g, key), reverse=not asc)
        self.__groups_sort_descriptor = (key, asc)

    # ---Properties
--- a/core/scanner.py
+++ b/core/scanner.py
@ -13,7 +13,7 @@ from hscommon.jobprogress import job
 from hscommon.util import dedupe, rem_file_ext, get_file_ext
 from hscommon.trans import tr

-from . import engine
+from core import engine

 # It's quite ugly to have scan types from all editions all put in the same class, but because there's
 # there will be some nasty bugs popping up (ScanType is used in core when it should exclusively be
@ -21,16 +21,16 @@ from . import engine


 class ScanType:
-    Filename = 0
-    Fields = 1
-    FieldsNoOrder = 2
-    Tag = 3
-    Folders = 4
-    Contents = 5
+    FILENAME = 0
+    FIELDS = 1
+    FIELDSNOORDER = 2
+    TAG = 3
+    FOLDERS = 4
+    CONTENTS = 5

    # PE
-    FuzzyBlock = 10
-    ExifTimestamp = 11
+    FUZZYBLOCK = 10
+    EXIFTIMESTAMP = 11


 ScanOption = namedtuple("ScanOption", "scan_type label")
@ -77,30 +77,35 @@ class Scanner:
        self.discarded_file_count = 0

    def _getmatches(self, files, j):
-        if self.size_threshold or self.scan_type in {
-            ScanType.Contents,
-            ScanType.Folders,
-        }:
+        if (
+            self.size_threshold
+            or self.large_size_threshold
+            or self.scan_type
+            in {
+                ScanType.CONTENTS,
+                ScanType.FOLDERS,
+            }
+        ):
            j = j.start_subjob([2, 8])
-            for f in j.iter_with_progress(files, tr("Read size of %d/%d files")):
-                f.size  # pre-read, makes a smoother progress if read here (especially for bundles)
            if self.size_threshold:
                files = [f for f in files if f.size >= self.size_threshold]
-        if self.scan_type in {ScanType.Contents, ScanType.Folders}:
-            return engine.getmatches_by_contents(files, j=j)
+            if self.large_size_threshold:
+                files = [f for f in files if f.size <= self.large_size_threshold]
+        if self.scan_type in {ScanType.CONTENTS, ScanType.FOLDERS}:
+            return engine.getmatches_by_contents(files, bigsize=self.big_file_size_threshold, j=j)
        else:
            j = j.start_subjob([2, 8])
            kw = {}
            kw["match_similar_words"] = self.match_similar_words
            kw["weight_words"] = self.word_weighting
            kw["min_match_percentage"] = self.min_match_percentage
-            if self.scan_type == ScanType.FieldsNoOrder:
-                self.scan_type = ScanType.Fields
+            if self.scan_type == ScanType.FIELDSNOORDER:
+                self.scan_type = ScanType.FIELDS
                kw["no_field_order"] = True
            func = {
-                ScanType.Filename: lambda f: engine.getwords(rem_file_ext(f.name)),
-                ScanType.Fields: lambda f: engine.getfields(rem_file_ext(f.name)),
-                ScanType.Tag: lambda f: [
+                ScanType.FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
+                ScanType.FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
+                ScanType.TAG: lambda f: [
                    engine.getwords(str(getattr(f, attrname)))
                    for attrname in SCANNABLE_TAGS
                    if attrname in self.scanned_tags
@ -127,7 +132,7 @@ class Scanner:
            return False
        if is_same_with_digit(refname, dupename):
            return True
-        return len(dupe.path) > len(ref.path)
+        return len(dupe.path.parts) > len(ref.path.parts)

    @staticmethod
    def get_scan_options():
@ -150,49 +155,36 @@ class Scanner:
        # "duplicated duplicates if you will). Then, we also don't want mixed file kinds if the
        # option isn't enabled, we want matches for which both files exist and, lastly, we don't
        # want matches with both files as ref.
-        if self.scan_type == ScanType.Folders and matches:
+        if self.scan_type == ScanType.FOLDERS and matches:
            allpath = {m.first.path for m in matches}
            allpath |= {m.second.path for m in matches}
            sortedpaths = sorted(allpath)
            toremove = set()
            last_parent_path = sortedpaths[0]
            for p in sortedpaths[1:]:
-                if p in last_parent_path:
+                if last_parent_path in p.parents:
                    toremove.add(p)
                else:
                    last_parent_path = p
-            matches = [
-                m
-                for m in matches
-                if m.first.path not in toremove or m.second.path not in toremove
-            ]
+            matches = [m for m in matches if m.first.path not in toremove or m.second.path not in toremove]
        if not self.mix_file_kind:
-            matches = [
-                m
-                for m in matches
-                if get_file_ext(m.first.name) == get_file_ext(m.second.name)
-            ]
-        matches = [
-            m for m in matches if m.first.path.exists() and m.second.path.exists()
-        ]
-        matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
+            matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
+        if self.include_exists_check:
+            matches = [m for m in matches if m.first.exists() and m.second.exists()]
+        # Contents already handles ref checks, other scan types might not catch during scan
+        if self.scan_type != ScanType.CONTENTS:
+            matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
        if ignore_list:
-            matches = [
-                m
-                for m in matches
-                if not ignore_list.AreIgnored(str(m.first.path), str(m.second.path))
-            ]
+            matches = [m for m in matches if not ignore_list.are_ignored(str(m.first.path), str(m.second.path))]
        logging.info("Grouping matches")
        groups = engine.get_groups(matches)
        if self.scan_type in {
-            ScanType.Filename,
-            ScanType.Fields,
-            ScanType.FieldsNoOrder,
-            ScanType.Tag,
+            ScanType.FILENAME,
+            ScanType.FIELDS,
+            ScanType.FIELDSNOORDER,
+            ScanType.TAG,
        }:
-            matched_files = dedupe(
-                [m.first for m in matches] + [m.second for m in matches]
-            )
+            matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
            self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
        else:
            # Ticket #195
@ -215,7 +207,10 @@ class Scanner:
    match_similar_words = False
    min_match_percentage = 80
    mix_file_kind = True
-    scan_type = ScanType.Filename
+    scan_type = ScanType.FILENAME
    scanned_tags = {"artist", "title"}
    size_threshold = 0
+    large_size_threshold = 0
+    big_file_size_threshold = 0
    word_weighting = False
+    include_exists_check = True
--- a/core/se/init.py
+++ b/core/se/init.py
@ -1 +1 @@
-from . import fs, result_table, scanner  # noqa
+from core.se import fs, result_table, scanner  # noqa
--- a/core/se/scanner.py
+++ b/core/se/scanner.py
@ -13,7 +13,7 @@ class ScannerSE(ScannerBase):
    @staticmethod
    def get_scan_options():
        return [
-            ScanOption(ScanType.Filename, tr("Filename")),
-            ScanOption(ScanType.Contents, tr("Contents")),
-            ScanOption(ScanType.Folders, tr("Folders")),
+            ScanOption(ScanType.FILENAME, tr("Filename")),
+            ScanOption(ScanType.CONTENTS, tr("Contents")),
+            ScanOption(ScanType.FOLDERS, tr("Folders")),
        ]
--- a/core/tests/app_test.py
+++ b/core/tests/app_test.py
@ -7,31 +7,30 @@
 import os
 import os.path as op
 import logging
+import tempfile

 import pytest
-from hscommon.path import Path
+from pathlib import Path
 import hscommon.conflict
 import hscommon.util
 from hscommon.testutil import eq_, log_calls
 from hscommon.jobprogress.job import Job

-from .base import TestApp
-from .results_test import GetTestGroups
-from .. import app, fs, engine
-from ..scanner import ScanType
+from core.tests.base import TestApp
+from core.tests.results_test import GetTestGroups
+from core import app, fs, engine
+from core.scanner import ScanType


 def add_fake_files_to_directories(directories, files):
    directories.get_files = lambda j=None: iter(files)
-    directories._dirs.append("this is just so Scan() doesnt return 3")
+    directories._dirs.append("this is just so Scan() doesn't return 3")


 class TestCaseDupeGuru:
    def test_apply_filter_calls_results_apply_filter(self, monkeypatch):
        dgapp = TestApp().app
-        monkeypatch.setattr(
-            dgapp.results, "apply_filter", log_calls(dgapp.results.apply_filter)
-        )
+        monkeypatch.setattr(dgapp.results, "apply_filter", log_calls(dgapp.results.apply_filter))
        dgapp.apply_filter("foo")
        eq_(2, len(dgapp.results.apply_filter.calls))
        call = dgapp.results.apply_filter.calls[0]
@ -41,15 +40,11 @@ class TestCaseDupeGuru:

    def test_apply_filter_escapes_regexp(self, monkeypatch):
        dgapp = TestApp().app
-        monkeypatch.setattr(
-            dgapp.results, "apply_filter", log_calls(dgapp.results.apply_filter)
-        )
+        monkeypatch.setattr(dgapp.results, "apply_filter", log_calls(dgapp.results.apply_filter))
        dgapp.apply_filter("()[]\\.|+?^abc")
        call = dgapp.results.apply_filter.calls[1]
        eq_("\\(\\)\\[\\]\\\\\\.\\|\\+\\?\\^abc", call["filter_str"])
-        dgapp.apply_filter(
-            "(*)"
-        )  # In "simple mode", we want the * to behave as a wilcard
+        dgapp.apply_filter("(*)")  # In "simple mode", we want the * to behave as a wildcard
        call = dgapp.results.apply_filter.calls[3]
        eq_(r"\(.*\)", call["filter_str"])
        dgapp.options["escape_filter_regexp"] = False
@ -62,7 +57,7 @@ class TestCaseDupeGuru:
        # for this unit is pathetic. What's done is done. My approach now is to add tests for
        # every change I want to make. The blowup was caused by a missing import.
        p = Path(str(tmpdir))
-        p["foo"].open("w").close()
+        p.joinpath("foo").touch()
        monkeypatch.setattr(
            hscommon.conflict,
            "smart_copy",
@ -70,40 +65,39 @@ class TestCaseDupeGuru:
        )
        # XXX This monkeypatch is temporary. will be fixed in a better monkeypatcher.
        monkeypatch.setattr(app, "smart_copy", hscommon.conflict.smart_copy)
-        monkeypatch.setattr(
-            os, "makedirs", lambda path: None
-        )  # We don't want the test to create that fake directory
+        monkeypatch.setattr(os, "makedirs", lambda path: None)  # We don't want the test to create that fake directory
        dgapp = TestApp().app
        dgapp.directories.add_path(p)
        [f] = dgapp.directories.get_files()
-        dgapp.copy_or_move(f, True, "some_destination", 0)
-        eq_(1, len(hscommon.conflict.smart_copy.calls))
-        call = hscommon.conflict.smart_copy.calls[0]
-        eq_(call["dest_path"], op.join("some_destination", "foo"))
-        eq_(call["source_path"], f.path)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            dgapp.copy_or_move(f, True, tmp_dir, 0)
+            eq_(1, len(hscommon.conflict.smart_copy.calls))
+            call = hscommon.conflict.smart_copy.calls[0]
+            eq_(call["dest_path"], Path(tmp_dir, "foo"))
+            eq_(call["source_path"], f.path)

    def test_copy_or_move_clean_empty_dirs(self, tmpdir, monkeypatch):
        tmppath = Path(str(tmpdir))
-        sourcepath = tmppath["source"]
+        sourcepath = tmppath.joinpath("source")
        sourcepath.mkdir()
-        sourcepath["myfile"].open("w")
+        sourcepath.joinpath("myfile").touch()
        app = TestApp().app
        app.directories.add_path(tmppath)
        [myfile] = app.directories.get_files()
        monkeypatch.setattr(app, "clean_empty_dirs", log_calls(lambda path: None))
-        app.copy_or_move(myfile, False, tmppath["dest"], 0)
+        app.copy_or_move(myfile, False, tmppath.joinpath("dest"), 0)
        calls = app.clean_empty_dirs.calls
        eq_(1, len(calls))
        eq_(sourcepath, calls[0]["path"])

-    def test_Scan_with_objects_evaluating_to_false(self):
+    def test_scan_with_objects_evaluating_to_false(self):
        class FakeFile(fs.File):
            def __bool__(self):
                return False

        # At some point, any() was used in a wrong way that made Scan() wrongly return 1
        app = TestApp().app
-        f1, f2 = [FakeFile("foo") for i in range(2)]
+        f1, f2 = (FakeFile("foo") for _ in range(2))
        f1.is_ref, f2.is_ref = (False, False)
        assert not (bool(f1) and bool(f2))
        add_fake_files_to_directories(app.directories, [f1, f2])
@ -114,11 +108,11 @@ class TestCaseDupeGuru:
        # If the ignore_hardlink_matches option is set, don't match files hardlinking to the same
        # inode.
        tmppath = Path(str(tmpdir))
-        tmppath["myfile"].open("w").write("foo")
-        os.link(str(tmppath["myfile"]), str(tmppath["hardlink"]))
+        tmppath.joinpath("myfile").open("wt").write("foo")
+        os.link(str(tmppath.joinpath("myfile")), str(tmppath.joinpath("hardlink")))
        app = TestApp().app
        app.directories.add_path(tmppath)
-        app.options["scan_type"] = ScanType.Contents
+        app.options["scan_type"] = ScanType.CONTENTS
        app.options["ignore_hardlink_matches"] = True
        app.start_scanning()
        eq_(len(app.results.groups), 0)
@ -132,7 +126,7 @@ class TestCaseDupeGuru:
        assert not dgapp.result_table.rename_selected("foo")  # no crash


-class TestCaseDupeGuru_clean_empty_dirs:
+class TestCaseDupeGuruCleanEmptyDirs:
    @pytest.fixture
    def do_setup(self, request):
        monkeypatch = request.getfixturevalue("monkeypatch")
@ -161,7 +155,7 @@ class TestCaseDupeGuru_clean_empty_dirs:
        # delete_if_empty must be recursively called up in the path until it returns False
        @log_calls
        def mock_delete_if_empty(path, files_to_delete=[]):
-            return len(path) > 1
+            return len(path.parts) > 1

        monkeypatch.setattr(hscommon.util, "delete_if_empty", mock_delete_if_empty)
        # XXX This monkeypatch is temporary. will be fixed in a better monkeypatcher.
@ -188,11 +182,11 @@ class TestCaseDupeGuruWithResults:
        self.rtable.refresh()
        tmpdir = request.getfixturevalue("tmpdir")
        tmppath = Path(str(tmpdir))
-        tmppath["foo"].mkdir()
-        tmppath["bar"].mkdir()
+        tmppath.joinpath("foo").mkdir()
+        tmppath.joinpath("bar").mkdir()
        self.app.directories.add_path(tmppath)

-    def test_GetObjects(self, do_setup):
+    def test_get_objects(self, do_setup):
        objects = self.objects
        groups = self.groups
        r = self.rtable[0]
@ -205,7 +199,7 @@ class TestCaseDupeGuruWithResults:
        assert r._group is groups[1]
        assert r._dupe is objects[4]

-    def test_GetObjects_after_sort(self, do_setup):
+    def test_get_objects_after_sort(self, do_setup):
        objects = self.objects
        groups = self.groups[:]  # we need an un-sorted reference
        self.rtable.sort("name", False)
@ -220,7 +214,7 @@ class TestCaseDupeGuruWithResults:
        # The first 2 dupes have been removed. The 3rd one is a ref. it stays there, in first pos.
        eq_(self.rtable.selected_indexes, [1])  # no exception

-    def test_selectResultNodePaths(self, do_setup):
+    def test_select_result_node_paths(self, do_setup):
        app = self.app
        objects = self.objects
        self.rtable.select([1, 2])
@ -228,7 +222,7 @@ class TestCaseDupeGuruWithResults:
        assert app.selected_dupes[0] is objects[1]
        assert app.selected_dupes[1] is objects[2]

-    def test_selectResultNodePaths_with_ref(self, do_setup):
+    def test_select_result_node_paths_with_ref(self, do_setup):
        app = self.app
        objects = self.objects
        self.rtable.select([1, 2, 3])
@ -237,7 +231,7 @@ class TestCaseDupeGuruWithResults:
        assert app.selected_dupes[1] is objects[2]
        assert app.selected_dupes[2] is self.groups[1].ref

-    def test_selectResultNodePaths_after_sort(self, do_setup):
+    def test_select_result_node_paths_after_sort(self, do_setup):
        app = self.app
        objects = self.objects
        groups = self.groups[:]  # To keep the old order in memory
@ -264,7 +258,7 @@ class TestCaseDupeGuruWithResults:
        app.remove_selected()
        eq_(self.rtable.selected_indexes, [])  # no exception

-    def test_selectPowerMarkerRows_after_sort(self, do_setup):
+    def test_select_powermarker_rows_after_sort(self, do_setup):
        app = self.app
        objects = self.objects
        self.rtable.power_marker = True
@ -303,7 +297,7 @@ class TestCaseDupeGuruWithResults:
        app.toggle_selected_mark_state()
        eq_(app.results.mark_count, 0)

-    def test_refreshDetailsWithSelected(self, do_setup):
+    def test_refresh_details_with_selected(self, do_setup):
        self.rtable.select([1, 4])
        eq_(self.dpanel.row(0), ("Filename", "bar bleh", "foo bar"))
        self.dpanel.view.check_gui_calls(["refresh"])
@ -311,7 +305,7 @@ class TestCaseDupeGuruWithResults:
        eq_(self.dpanel.row(0), ("Filename", "---", "---"))
        self.dpanel.view.check_gui_calls(["refresh"])

-    def test_makeSelectedReference(self, do_setup):
+    def test_make_selected_reference(self, do_setup):
        app = self.app
        objects = self.objects
        groups = self.groups
@ -320,9 +314,7 @@ class TestCaseDupeGuruWithResults:
        assert groups[0].ref is objects[1]
        assert groups[1].ref is objects[4]

-    def test_makeSelectedReference_by_selecting_two_dupes_in_the_same_group(
-        self, do_setup
-    ):
+    def test_make_selected_reference_by_selecting_two_dupes_in_the_same_group(self, do_setup):
        app = self.app
        objects = self.objects
        groups = self.groups
@ -332,7 +324,7 @@ class TestCaseDupeGuruWithResults:
        assert groups[0].ref is objects[1]
        assert groups[1].ref is objects[4]

-    def test_removeSelected(self, do_setup):
+    def test_remove_selected(self, do_setup):
        app = self.app
        self.rtable.select([1, 4])
        app.remove_selected()
@ -340,7 +332,7 @@ class TestCaseDupeGuruWithResults:
        app.remove_selected()
        eq_(len(app.results.dupes), 0)

-    def test_addDirectory_simple(self, do_setup):
+    def test_add_directory_simple(self, do_setup):
        # There's already a directory in self.app, so adding another once makes 2 of em
        app = self.app
        # any other path that isn't a parent or child of the already added path
@ -348,7 +340,7 @@ class TestCaseDupeGuruWithResults:
        app.add_directory(otherpath)
        eq_(len(app.directories), 2)

-    def test_addDirectory_already_there(self, do_setup):
+    def test_add_directory_already_there(self, do_setup):
        app = self.app
        otherpath = Path(op.dirname(__file__))
        app.add_directory(otherpath)
@ -356,7 +348,7 @@ class TestCaseDupeGuruWithResults:
        eq_(len(app.view.messages), 1)
        assert "already" in app.view.messages[0]

-    def test_addDirectory_does_not_exist(self, do_setup):
+    def test_add_directory_does_not_exist(self, do_setup):
        app = self.app
        app.add_directory("/does_not_exist")
        eq_(len(app.view.messages), 1)
@ -372,30 +364,30 @@ class TestCaseDupeGuruWithResults:
        # BOTH the ref and the other dupe should have been added
        eq_(len(app.ignore_list), 3)

-    def test_purgeIgnoreList(self, do_setup, tmpdir):
+    def test_purge_ignorelist(self, do_setup, tmpdir):
        app = self.app
        p1 = str(tmpdir.join("file1"))
        p2 = str(tmpdir.join("file2"))
        open(p1, "w").close()
        open(p2, "w").close()
        dne = "/does_not_exist"
-        app.ignore_list.Ignore(dne, p1)
-        app.ignore_list.Ignore(p2, dne)
-        app.ignore_list.Ignore(p1, p2)
+        app.ignore_list.ignore(dne, p1)
+        app.ignore_list.ignore(p2, dne)
+        app.ignore_list.ignore(p1, p2)
        app.purge_ignore_list()
        eq_(1, len(app.ignore_list))
-        assert app.ignore_list.AreIgnored(p1, p2)
-        assert not app.ignore_list.AreIgnored(dne, p1)
+        assert app.ignore_list.are_ignored(p1, p2)
+        assert not app.ignore_list.are_ignored(dne, p1)

    def test_only_unicode_is_added_to_ignore_list(self, do_setup):
-        def FakeIgnore(first, second):
+        def fake_ignore(first, second):
            if not isinstance(first, str):
                self.fail()
            if not isinstance(second, str):
                self.fail()

        app = self.app
-        app.ignore_list.Ignore = FakeIgnore
+        app.ignore_list.ignore = fake_ignore
        self.rtable.select([4])
        app.add_selected_to_ignore_list()

@ -404,9 +396,7 @@ class TestCaseDupeGuruWithResults:
        # results table.
        app = self.app
        app.JOB = Job(1, lambda *args, **kw: False)  # Cancels the task
-        add_fake_files_to_directories(
-            app.directories, self.objects
-        )  # We want the scan to at least start
+        add_fake_files_to_directories(app.directories, self.objects)  # We want the scan to at least start
        app.start_scanning()  # will be cancelled immediately
        eq_(len(app.result_table), 0)

@ -431,17 +421,14 @@ class TestCaseDupeGuruWithResults:
        # don't crash


-class TestCaseDupeGuru_renameSelected:
+class TestCaseDupeGuruRenameSelected:
    @pytest.fixture
    def do_setup(self, request):
        tmpdir = request.getfixturevalue("tmpdir")
        p = Path(str(tmpdir))
-        fp = open(str(p["foo bar 1"]), mode="w")
-        fp.close()
-        fp = open(str(p["foo bar 2"]), mode="w")
-        fp.close()
-        fp = open(str(p["foo bar 3"]), mode="w")
-        fp.close()
+        p.joinpath("foo bar 1").touch()
+        p.joinpath("foo bar 2").touch()
+        p.joinpath("foo bar 3").touch()
        files = fs.get_files(p)
        for f in files:
            f.is_ref = False
@ -463,7 +450,7 @@ class TestCaseDupeGuru_renameSelected:
        g = self.groups[0]
        self.rtable.select([1])
        assert app.rename_selected("renamed")
-        names = [p.name for p in self.p.listdir()]
+        names = [p.name for p in self.p.glob("*")]
        assert "renamed" in names
        assert "foo bar 2" not in names
        eq_(g.dupes[0].name, "renamed")
@ -476,7 +463,7 @@ class TestCaseDupeGuru_renameSelected:
        assert not app.rename_selected("renamed")
        msg = logging.warning.calls[0]["msg"]
        eq_("dupeGuru Warning: list index out of range", msg)
-        names = [p.name for p in self.p.listdir()]
+        names = [p.name for p in self.p.glob("*")]
        assert "renamed" not in names
        assert "foo bar 2" in names
        eq_(g.dupes[0].name, "foo bar 2")
@ -489,7 +476,7 @@ class TestCaseDupeGuru_renameSelected:
        assert not app.rename_selected("foo bar 1")
        msg = logging.warning.calls[0]["msg"]
        assert msg.startswith("dupeGuru Warning: 'foo bar 1' already exists in")
-        names = [p.name for p in self.p.listdir()]
+        names = [p.name for p in self.p.glob("*")]
        assert "foo bar 1" in names
        assert "foo bar 2" in names
        eq_(g.dupes[0].name, "foo bar 2")
@ -500,9 +487,9 @@ class TestAppWithDirectoriesInTree:
    def do_setup(self, request):
        tmpdir = request.getfixturevalue("tmpdir")
        p = Path(str(tmpdir))
-        p["sub1"].mkdir()
-        p["sub2"].mkdir()
-        p["sub3"].mkdir()
+        p.joinpath("sub1").mkdir()
+        p.joinpath("sub2").mkdir()
+        p.joinpath("sub3").mkdir()
        app = TestApp()
        self.app = app.app
        self.dtree = app.dtree
@ -514,7 +501,6 @@ class TestAppWithDirectoriesInTree:
        # refreshed.
        node = self.dtree[0]
        eq_(len(node), 3)  # a len() call is required for subnodes to be loaded
-        subnode = node[0]
        node.state = 1  # the state property is a state index
        node = self.dtree[0]
        eq_(len(node), 3)
--- a/core/tests/base.py
+++ b/core/tests/base.py
@ -5,17 +5,16 @@
 # http://www.gnu.org/licenses/gpl-3.0.html

 from hscommon.testutil import TestApp as TestAppBase, CallLogger, eq_, with_app  # noqa
-from hscommon.path import Path
+from pathlib import Path
 from hscommon.util import get_file_ext, format_size
 from hscommon.gui.column import Column
 from hscommon.jobprogress.job import nulljob, JobCancelled

-from .. import engine
-from .. import prioritize
-from ..engine import getwords
-from ..app import DupeGuru as DupeGuruBase
-from ..gui.result_table import ResultTable as ResultTableBase
-from ..gui.prioritize_dialog import PrioritizeDialog
+from core import engine, prioritize
+from core.engine import getwords
+from core.app import DupeGuru as DupeGuruBase
+from core.gui.result_table import ResultTable as ResultTableBase
+from core.gui.prioritize_dialog import PrioritizeDialog


 class DupeGuruView:
@ -86,8 +85,9 @@ class NamedObject:
            folder = "basepath"
        self._folder = Path(folder)
        self.size = size
-        self.md5partial = name
-        self.md5 = name
+        self.digest_partial = name
+        self.digest = name
+        self.digest_samples = name
        if with_words:
            self.words = getwords(name)
        self.is_ref = False
@ -110,11 +110,11 @@ class NamedObject:

    @property
    def path(self):
-        return self._folder[self.name]
+        return self._folder.joinpath(self.name)

    @property
    def folder_path(self):
-        return self.path.parent()
+        return self.path.parent

    @property
    def extension(self):
@ -139,9 +139,7 @@ def GetTestGroups():
    matches = engine.getmatches(objects)  # we should have 5 matches
    groups = engine.get_groups(matches)  # We should have 2 groups
    for g in groups:
-        g.prioritize(
-            lambda x: objects.index(x)
-        )  # We want the dupes to be in the same order as the list is
+        g.prioritize(lambda x: objects.index(x))  # We want the dupes to be in the same order as the list is
    groups.sort(key=len, reverse=True)  # We want the group with 3 members to be first.
    return (objects, matches, groups)

@ -152,8 +150,8 @@ class TestApp(TestAppBase):
    def __init__(self):
        def link_gui(gui):
            gui.view = self.make_logger()
-            if hasattr(gui, "columns"):  # tables
-                gui.columns.view = self.make_logger()
+            if hasattr(gui, "_columns"):  # tables
+                gui._columns.view = self.make_logger()
            return gui

        TestAppBase.__init__(self)
--- a/core/tests/block_test.py
+++ b/core/tests/block_test.py
@ -9,14 +9,12 @@ from pytest import raises, skip
 from hscommon.testutil import eq_

 try:
-    from ..pe.block import avgdiff, getblocks2, NoBlocksError, DifferentBlockCountError
+    from core.pe.block import avgdiff, getblocks2, NoBlocksError, DifferentBlockCountError
 except ImportError:
    skip("Can't import the block module, probably hasn't been compiled.")


-def my_avgdiff(
-    first, second, limit=768, min_iter=3
-):  # this is so I don't have to re-write every call
+def my_avgdiff(first, second, limit=768, min_iter=3):  # this is so I don't have to re-write every call
    return avgdiff(first, second, limit, min_iter)


@ -75,99 +73,6 @@ class TestCasegetblock:
        eq_((meanred, meangreen, meanblue), b)


-# class TCdiff(unittest.TestCase):
-#     def test_diff(self):
-#         b1 = (10, 20, 30)
-#         b2 = (1, 2, 3)
-#         eq_(9 + 18 + 27, diff(b1, b2))
-#
-#     def test_diff_negative(self):
-#         b1 = (10, 20, 30)
-#         b2 = (1, 2, 3)
-#         eq_(9 + 18 + 27, diff(b2, b1))
-#
-#     def test_diff_mixed_positive_and_negative(self):
-#         b1 = (1, 5, 10)
-#         b2 = (10, 1, 15)
-#         eq_(9 + 4 + 5, diff(b1, b2))
-#
-
-# class TCgetblocks(unittest.TestCase):
-#     def test_empty_image(self):
-#         im = empty()
-#         blocks = getblocks(im, 1)
-#         eq_(0, len(blocks))
-#
-#     def test_one_block_image(self):
-#         im = four_pixels()
-#         blocks = getblocks2(im, 1)
-#         eq_(1, len(blocks))
-#         block = blocks[0]
-#         meanred = (0xff + 0x80) // 4
-#         meangreen = (0x80 + 0x40) // 4
-#         meanblue = (0xff + 0x80) // 4
-#         eq_((meanred, meangreen, meanblue), block)
-#
-#     def test_not_enough_height_to_fit_a_block(self):
-#         im = FakeImage((2, 1), [BLACK, BLACK])
-#         blocks = getblocks(im, 2)
-#         eq_(0, len(blocks))
-#
-#     def xtest_dont_include_leftovers(self):
-#         # this test is disabled because getblocks is not used and getblock in cdeffed
-#         pixels = [
-#             RED,(0, 0x80, 0xff), BLACK,
-#             (0x80, 0, 0),(0, 0x40, 0x80), BLACK,
-#             BLACK, BLACK, BLACK
-#         ]
-#         im = FakeImage((3, 3), pixels)
-#         blocks = getblocks(im, 2)
-#         block = blocks[0]
-#         #Because the block is smaller than the image, only blocksize must be considered.
-#         meanred = (0xff + 0x80) // 4
-#         meangreen = (0x80 + 0x40) // 4
-#         meanblue = (0xff + 0x80) // 4
-#         eq_((meanred, meangreen, meanblue), block)
-#
-#     def xtest_two_blocks(self):
-#         # this test is disabled because getblocks is not used and getblock in cdeffed
-#         pixels = [BLACK for i in xrange(4 * 2)]
-#         pixels[0] = RED
-#         pixels[1] = (0, 0x80, 0xff)
-#         pixels[4] = (0x80, 0, 0)
-#         pixels[5] = (0, 0x40, 0x80)
-#         im = FakeImage((4, 2), pixels)
-#         blocks = getblocks(im, 2)
-#         eq_(2, len(blocks))
-#         block = blocks[0]
-#         #Because the block is smaller than the image, only blocksize must be considered.
-#         meanred = (0xff + 0x80) // 4
-#         meangreen = (0x80 + 0x40) // 4
-#         meanblue = (0xff + 0x80) // 4
-#         eq_((meanred, meangreen, meanblue), block)
-#         eq_(BLACK, blocks[1])
-#
-#     def test_four_blocks(self):
-#         pixels = [BLACK for i in xrange(4 * 4)]
-#         pixels[0] = RED
-#         pixels[1] = (0, 0x80, 0xff)
-#         pixels[4] = (0x80, 0, 0)
-#         pixels[5] = (0, 0x40, 0x80)
-#         im = FakeImage((4, 4), pixels)
-#         blocks = getblocks2(im, 2)
-#         eq_(4, len(blocks))
-#         block = blocks[0]
-#         #Because the block is smaller than the image, only blocksize must be considered.
-#         meanred = (0xff + 0x80) // 4
-#         meangreen = (0x80 + 0x40) // 4
-#         meanblue = (0xff + 0x80) // 4
-#         eq_((meanred, meangreen, meanblue), block)
-#         eq_(BLACK, blocks[1])
-#         eq_(BLACK, blocks[2])
-#         eq_(BLACK, blocks[3])
-#
-
-
 class TestCasegetblocks2:
    def test_empty_image(self):
        im = empty()
@ -272,8 +177,8 @@ class TestCaseavgdiff:
    def test_return_at_least_1_at_the_slightest_difference(self):
        ref = (0, 0, 0)
        b1 = (1, 0, 0)
-        blocks1 = [ref for i in range(250)]
-        blocks2 = [ref for i in range(250)]
+        blocks1 = [ref for _ in range(250)]
+        blocks2 = [ref for _ in range(250)]
        blocks2[0] = b1
        eq_(1, my_avgdiff(blocks1, blocks2))

@ -282,41 +187,3 @@ class TestCaseavgdiff:
        blocks1 = [ref, ref]
        blocks2 = [ref, ref]
        eq_(0, my_avgdiff(blocks1, blocks2))
-
-
-# class TCmaxdiff(unittest.TestCase):
-#     def test_empty(self):
-#         self.assertRaises(NoBlocksError, maxdiff,[],[])
-#
-#     def test_two_blocks(self):
-#         b1 = (5, 10, 15)
-#         b2 = (255, 250, 245)
-#         b3 = (0, 0, 0)
-#         b4 = (255, 0, 255)
-#         blocks1 = [b1, b2]
-#         blocks2 = [b3, b4]
-#         expected1 = 5 + 10 + 15
-#         expected2 = 0 + 250 + 10
-#         expected = max(expected1, expected2)
-#         eq_(expected, maxdiff(blocks1, blocks2))
-#
-#     def test_blocks_not_the_same_size(self):
-#         b = (0, 0, 0)
-#         self.assertRaises(DifferentBlockCountError, maxdiff,[b, b],[b])
-#
-#     def test_first_arg_is_empty_but_not_second(self):
-#         #Don't return 0 (as when the 2 lists are empty), raise!
-#         b = (0, 0, 0)
-#         self.assertRaises(DifferentBlockCountError, maxdiff,[],[b])
-#
-#     def test_limit(self):
-#         b1 = (5, 10, 15)
-#         b2 = (255, 250, 245)
-#         b3 = (0, 0, 0)
-#         b4 = (255, 0, 255)
-#         blocks1 = [b1, b2]
-#         blocks2 = [b3, b4]
-#         expected1 = 5 + 10 + 15
-#         expected2 = 0 + 250 + 10
-#         eq_(expected1, maxdiff(blocks1, blocks2, expected1 - 1))
-#
--- a/core/tests/cache_test.py
+++ b/core/tests/cache_test.py
@ -10,41 +10,41 @@ from pytest import raises, skip
 from hscommon.testutil import eq_

 try:
-    from ..pe.cache import colors_to_string, string_to_colors
-    from ..pe.cache_sqlite import SqliteCache
-    from ..pe.cache_shelve import ShelveCache
+    from core.pe.cache import colors_to_bytes, bytes_to_colors
+    from core.pe.cache_sqlite import SqliteCache
 except ImportError:
    skip("Can't import the cache module, probably hasn't been compiled.")


-class TestCasecolors_to_string:
+class TestCaseColorsToString:
    def test_no_color(self):
-        eq_("", colors_to_string([]))
+        eq_(b"", colors_to_bytes([]))

    def test_single_color(self):
-        eq_("000000", colors_to_string([(0, 0, 0)]))
-        eq_("010101", colors_to_string([(1, 1, 1)]))
-        eq_("0a141e", colors_to_string([(10, 20, 30)]))
+        eq_(b"\x00\x00\x00", colors_to_bytes([(0, 0, 0)]))
+        eq_(b"\x01\x01\x01", colors_to_bytes([(1, 1, 1)]))
+        eq_(b"\x0a\x14\x1e", colors_to_bytes([(10, 20, 30)]))

    def test_two_colors(self):
-        eq_("000102030405", colors_to_string([(0, 1, 2), (3, 4, 5)]))
+        eq_(b"\x00\x01\x02\x03\x04\x05", colors_to_bytes([(0, 1, 2), (3, 4, 5)]))


-class TestCasestring_to_colors:
+class TestCaseStringToColors:
    def test_empty(self):
-        eq_([], string_to_colors(""))
+        eq_([], bytes_to_colors(b""))

    def test_single_color(self):
-        eq_([(0, 0, 0)], string_to_colors("000000"))
-        eq_([(2, 3, 4)], string_to_colors("020304"))
-        eq_([(10, 20, 30)], string_to_colors("0a141e"))
+        eq_([(0, 0, 0)], bytes_to_colors(b"\x00\x00\x00"))
+        eq_([(2, 3, 4)], bytes_to_colors(b"\x02\x03\x04"))
+        eq_([(10, 20, 30)], bytes_to_colors(b"\x0a\x14\x1e"))

    def test_two_colors(self):
-        eq_([(10, 20, 30), (40, 50, 60)], string_to_colors("0a141e28323c"))
+        eq_([(10, 20, 30), (40, 50, 60)], bytes_to_colors(b"\x0a\x14\x1e\x28\x32\x3c"))

    def test_incomplete_color(self):
        # don't return anything if it's not a complete color
-        eq_([], string_to_colors("102"))
+        eq_([], bytes_to_colors(b"\x01"))
+        eq_([(1, 2, 3)], bytes_to_colors(b"\x01\x02\x03\x04"))


 class BaseTestCaseCache:
@ -59,13 +59,13 @@ class BaseTestCaseCache:

    def test_set_then_retrieve_blocks(self):
        c = self.get_cache()
-        b = [(0, 0, 0), (1, 2, 3)]
+        b = [[(0, 0, 0), (1, 2, 3)]] * 8
        c["foo"] = b
        eq_(b, c["foo"])

    def test_delitem(self):
        c = self.get_cache()
-        c["foo"] = ""
+        c["foo"] = [[]] * 8
        del c["foo"]
        assert "foo" not in c
        with raises(KeyError):
@ -74,16 +74,16 @@ class BaseTestCaseCache:
    def test_persistance(self, tmpdir):
        DBNAME = tmpdir.join("hstest.db")
        c = self.get_cache(str(DBNAME))
-        c["foo"] = [(1, 2, 3)]
+        c["foo"] = [[(1, 2, 3)]] * 8
        del c
        c = self.get_cache(str(DBNAME))
-        eq_([(1, 2, 3)], c["foo"])
+        eq_([[(1, 2, 3)]] * 8, c["foo"])

    def test_filter(self):
        c = self.get_cache()
-        c["foo"] = ""
-        c["bar"] = ""
-        c["baz"] = ""
+        c["foo"] = [[]] * 8
+        c["bar"] = [[]] * 8
+        c["baz"] = [[]] * 8
        c.filter(lambda p: p != "bar")  # only 'bar' is removed
        eq_(2, len(c))
        assert "foo" in c
@ -92,9 +92,9 @@ class BaseTestCaseCache:

    def test_clear(self):
        c = self.get_cache()
-        c["foo"] = ""
-        c["bar"] = ""
-        c["baz"] = ""
+        c["foo"] = [[]] * 8
+        c["bar"] = [[]] * 8
+        c["baz"] = [[]] * 8
        c.clear()
        eq_(0, len(c))
        assert "foo" not in c
@ -104,7 +104,7 @@ class BaseTestCaseCache:
    def test_by_id(self):
        # it's possible to use the cache by referring to the files by their row_id
        c = self.get_cache()
-        b = [(0, 0, 0), (1, 2, 3)]
+        b = [[(0, 0, 0), (1, 2, 3)]] * 8
        c["foo"] = b
        foo_id = c.get_id("foo")
        eq_(c[foo_id], b)
@ -127,15 +127,10 @@ class TestCaseSqliteCache(BaseTestCaseCache):
        fp.write("invalid sqlite content")
        fp.close()
        c = self.get_cache(dbname)  # should not raise a DatabaseError
-        c["foo"] = [(1, 2, 3)]
+        c["foo"] = [[(1, 2, 3)]] * 8
        del c
        c = self.get_cache(dbname)
-        eq_(c["foo"], [(1, 2, 3)])
-
-
-class TestCaseShelveCache(BaseTestCaseCache):
-    def get_cache(self, dbname=None):
-        return ShelveCache(dbname)
+        eq_(c["foo"], [[(1, 2, 3)]] * 8)


 class TestCaseCacheSQLEscape:
@ -157,7 +152,7 @@ class TestCaseCacheSQLEscape:

    def test_delitem(self):
        c = self.get_cache()
-        c["foo'bar"] = []
+        c["foo'bar"] = [[]] * 8
        try:
            del c["foo'bar"]
        except KeyError:
--- a/core/tests/directories_test.py
+++ b/core/tests/directories_test.py
@ -10,45 +10,39 @@ import tempfile
 import shutil

 from pytest import raises
-from hscommon.path import Path
+from pathlib import Path
 from hscommon.testutil import eq_
 from hscommon.plat import ISWINDOWS

-from ..fs import File
-from ..directories import (
+from core.fs import File
+from core.directories import (
    Directories,
    DirectoryState,
    AlreadyThereError,
    InvalidPathError,
 )
-from ..exclude import ExcludeList, ExcludeDict
+from core.exclude import ExcludeList, ExcludeDict


 def create_fake_fs(rootpath):
    # We have it as a separate function because other units are using it.
-    rootpath = rootpath["fs"]
+    rootpath = rootpath.joinpath("fs")
    rootpath.mkdir()
-    rootpath["dir1"].mkdir()
-    rootpath["dir2"].mkdir()
-    rootpath["dir3"].mkdir()
-    fp = rootpath["file1.test"].open("w")
-    fp.write("1")
-    fp.close()
-    fp = rootpath["file2.test"].open("w")
-    fp.write("12")
-    fp.close()
-    fp = rootpath["file3.test"].open("w")
-    fp.write("123")
-    fp.close()
-    fp = rootpath["dir1"]["file1.test"].open("w")
-    fp.write("1")
-    fp.close()
-    fp = rootpath["dir2"]["file2.test"].open("w")
-    fp.write("12")
-    fp.close()
-    fp = rootpath["dir3"]["file3.test"].open("w")
-    fp.write("123")
-    fp.close()
+    rootpath.joinpath("dir1").mkdir()
+    rootpath.joinpath("dir2").mkdir()
+    rootpath.joinpath("dir3").mkdir()
+    with rootpath.joinpath("file1.test").open("wt") as fp:
+        fp.write("1")
+    with rootpath.joinpath("file2.test").open("wt") as fp:
+        fp.write("12")
+    with rootpath.joinpath("file3.test").open("wt") as fp:
+        fp.write("123")
+    with rootpath.joinpath("dir1", "file1.test").open("wt") as fp:
+        fp.write("1")
+    with rootpath.joinpath("dir2", "file2.test").open("wt") as fp:
+        fp.write("12")
+    with rootpath.joinpath("dir3", "file3.test").open("wt") as fp:
+        fp.write("123")
    return rootpath


@ -60,11 +54,10 @@ def setup_module(module):
    # and another with a more complex structure.
    testpath = Path(tempfile.mkdtemp())
    module.testpath = testpath
-    rootpath = testpath["onefile"]
+    rootpath = testpath.joinpath("onefile")
    rootpath.mkdir()
-    fp = rootpath["test.txt"].open("w")
-    fp.write("test_data")
-    fp.close()
+    with rootpath.joinpath("test.txt").open("wt") as fp:
+        fp.write("test_data")
    create_fake_fs(testpath)


@ -80,41 +73,41 @@ def test_empty():

 def test_add_path():
    d = Directories()
-    p = testpath["onefile"]
+    p = testpath.joinpath("onefile")
    d.add_path(p)
    eq_(1, len(d))
    assert p in d
-    assert (p["foobar"]) in d
-    assert p.parent() not in d
-    p = testpath["fs"]
+    assert (p.joinpath("foobar")) in d
+    assert p.parent not in d
+    p = testpath.joinpath("fs")
    d.add_path(p)
    eq_(2, len(d))
    assert p in d


-def test_AddPath_when_path_is_already_there():
+def test_add_path_when_path_is_already_there():
    d = Directories()
-    p = testpath["onefile"]
+    p = testpath.joinpath("onefile")
    d.add_path(p)
    with raises(AlreadyThereError):
        d.add_path(p)
    with raises(AlreadyThereError):
-        d.add_path(p["foobar"])
+        d.add_path(p.joinpath("foobar"))
    eq_(1, len(d))


 def test_add_path_containing_paths_already_there():
    d = Directories()
-    d.add_path(testpath["onefile"])
+    d.add_path(testpath.joinpath("onefile"))
    eq_(1, len(d))
    d.add_path(testpath)
    eq_(len(d), 1)
    eq_(d[0], testpath)


-def test_AddPath_non_latin(tmpdir):
+def test_add_path_non_latin(tmpdir):
    p = Path(str(tmpdir))
-    to_add = p["unicode\u201a"]
+    to_add = p.joinpath("unicode\u201a")
    os.mkdir(str(to_add))
    d = Directories()
    try:
@ -125,61 +118,61 @@ def test_AddPath_non_latin(tmpdir):

 def test_del():
    d = Directories()
-    d.add_path(testpath["onefile"])
+    d.add_path(testpath.joinpath("onefile"))
    try:
        del d[1]
        assert False
    except IndexError:
        pass
-    d.add_path(testpath["fs"])
+    d.add_path(testpath.joinpath("fs"))
    del d[1]
    eq_(1, len(d))


 def test_states():
    d = Directories()
-    p = testpath["onefile"]
+    p = testpath.joinpath("onefile")
    d.add_path(p)
-    eq_(DirectoryState.Normal, d.get_state(p))
-    d.set_state(p, DirectoryState.Reference)
-    eq_(DirectoryState.Reference, d.get_state(p))
-    eq_(DirectoryState.Reference, d.get_state(p["dir1"]))
+    eq_(DirectoryState.NORMAL, d.get_state(p))
+    d.set_state(p, DirectoryState.REFERENCE)
+    eq_(DirectoryState.REFERENCE, d.get_state(p))
+    eq_(DirectoryState.REFERENCE, d.get_state(p.joinpath("dir1")))
    eq_(1, len(d.states))
    eq_(p, list(d.states.keys())[0])
-    eq_(DirectoryState.Reference, d.states[p])
+    eq_(DirectoryState.REFERENCE, d.states[p])


 def test_get_state_with_path_not_there():
    # When the path's not there, just return DirectoryState.Normal
    d = Directories()
-    d.add_path(testpath["onefile"])
-    eq_(d.get_state(testpath), DirectoryState.Normal)
+    d.add_path(testpath.joinpath("onefile"))
+    eq_(d.get_state(testpath), DirectoryState.NORMAL)


 def test_states_overwritten_when_larger_directory_eat_smaller_ones():
    # ref #248
    # When setting the state of a folder, we overwrite previously set states for subfolders.
    d = Directories()
-    p = testpath["onefile"]
+    p = testpath.joinpath("onefile")
    d.add_path(p)
-    d.set_state(p, DirectoryState.Excluded)
+    d.set_state(p, DirectoryState.EXCLUDED)
    d.add_path(testpath)
-    d.set_state(testpath, DirectoryState.Reference)
-    eq_(d.get_state(p), DirectoryState.Reference)
-    eq_(d.get_state(p["dir1"]), DirectoryState.Reference)
-    eq_(d.get_state(testpath), DirectoryState.Reference)
+    d.set_state(testpath, DirectoryState.REFERENCE)
+    eq_(d.get_state(p), DirectoryState.REFERENCE)
+    eq_(d.get_state(p.joinpath("dir1")), DirectoryState.REFERENCE)
+    eq_(d.get_state(testpath), DirectoryState.REFERENCE)


 def test_get_files():
    d = Directories()
-    p = testpath["fs"]
+    p = testpath.joinpath("fs")
    d.add_path(p)
-    d.set_state(p["dir1"], DirectoryState.Reference)
-    d.set_state(p["dir2"], DirectoryState.Excluded)
+    d.set_state(p.joinpath("dir1"), DirectoryState.REFERENCE)
+    d.set_state(p.joinpath("dir2"), DirectoryState.EXCLUDED)
    files = list(d.get_files())
    eq_(5, len(files))
    for f in files:
-        if f.path.parent() == p["dir1"]:
+        if f.path.parent == p.joinpath("dir1"):
            assert f.is_ref
        else:
            assert not f.is_ref
@ -193,7 +186,7 @@ def test_get_files_with_folders():
            return True

    d = Directories()
-    p = testpath["fs"]
+    p = testpath.joinpath("fs")
    d.add_path(p)
    files = list(d.get_files(fileclasses=[FakeFile]))
    # We have the 3 root files and the 3 root dirs
@ -202,25 +195,25 @@ def test_get_files_with_folders():

 def test_get_folders():
    d = Directories()
-    p = testpath["fs"]
+    p = testpath.joinpath("fs")
    d.add_path(p)
-    d.set_state(p["dir1"], DirectoryState.Reference)
-    d.set_state(p["dir2"], DirectoryState.Excluded)
+    d.set_state(p.joinpath("dir1"), DirectoryState.REFERENCE)
+    d.set_state(p.joinpath("dir2"), DirectoryState.EXCLUDED)
    folders = list(d.get_folders())
    eq_(len(folders), 3)
    ref = [f for f in folders if f.is_ref]
    not_ref = [f for f in folders if not f.is_ref]
    eq_(len(ref), 1)
-    eq_(ref[0].path, p["dir1"])
+    eq_(ref[0].path, p.joinpath("dir1"))
    eq_(len(not_ref), 2)
    eq_(ref[0].size, 1)


 def test_get_files_with_inherited_exclusion():
    d = Directories()
-    p = testpath["onefile"]
+    p = testpath.joinpath("onefile")
    d.add_path(p)
-    d.set_state(p, DirectoryState.Excluded)
+    d.set_state(p, DirectoryState.EXCLUDED)
    eq_([], list(d.get_files()))


@ -233,14 +226,14 @@ def test_save_and_load(tmpdir):
    p2.mkdir()
    d1.add_path(p1)
    d1.add_path(p2)
-    d1.set_state(p1, DirectoryState.Reference)
-    d1.set_state(p1["dir1"], DirectoryState.Excluded)
+    d1.set_state(p1, DirectoryState.REFERENCE)
+    d1.set_state(p1.joinpath("dir1"), DirectoryState.EXCLUDED)
    tmpxml = str(tmpdir.join("directories_testunit.xml"))
    d1.save_to_file(tmpxml)
    d2.load_from_file(tmpxml)
    eq_(2, len(d2))
-    eq_(DirectoryState.Reference, d2.get_state(p1))
-    eq_(DirectoryState.Excluded, d2.get_state(p1["dir1"]))
+    eq_(DirectoryState.REFERENCE, d2.get_state(p1))
+    eq_(DirectoryState.EXCLUDED, d2.get_state(p1.joinpath("dir1")))


 def test_invalid_path():
@ -254,7 +247,12 @@ def test_invalid_path():
 def test_set_state_on_invalid_path():
    d = Directories()
    try:
-        d.set_state(Path("foobar",), DirectoryState.Normal)
+        d.set_state(
+            Path(
+                "foobar",
+            ),
+            DirectoryState.NORMAL,
+        )
    except LookupError:
        assert False

@ -263,7 +261,7 @@ def test_load_from_file_with_invalid_path(tmpdir):
    # This test simulates a load from file resulting in a
    # InvalidPath raise. Other directories must be loaded.
    d1 = Directories()
-    d1.add_path(testpath["onefile"])
+    d1.add_path(testpath.joinpath("onefile"))
    # Will raise InvalidPath upon loading
    p = Path(str(tmpdir.join("toremove")))
    p.mkdir()
@ -278,11 +276,11 @@ def test_load_from_file_with_invalid_path(tmpdir):

 def test_unicode_save(tmpdir):
    d = Directories()
-    p1 = Path(str(tmpdir))["hello\xe9"]
+    p1 = Path(str(tmpdir), "hello\xe9")
    p1.mkdir()
-    p1["foo\xe9"].mkdir()
+    p1.joinpath("foo\xe9").mkdir()
    d.add_path(p1)
-    d.set_state(p1["foo\xe9"], DirectoryState.Excluded)
+    d.set_state(p1.joinpath("foo\xe9"), DirectoryState.EXCLUDED)
    tmpxml = str(tmpdir.join("directories_testunit.xml"))
    try:
        d.save_to_file(tmpxml)
@ -292,12 +290,12 @@ def test_unicode_save(tmpdir):

 def test_get_files_refreshes_its_directories():
    d = Directories()
-    p = testpath["fs"]
+    p = testpath.joinpath("fs")
    d.add_path(p)
    files = d.get_files()
    eq_(6, len(list(files)))
    time.sleep(1)
-    os.remove(str(p["dir1"]["file1.test"]))
+    os.remove(str(p.joinpath("dir1", "file1.test")))
    files = d.get_files()
    eq_(5, len(list(files)))

@ -306,54 +304,57 @@ def test_get_files_does_not_choke_on_non_existing_directories(tmpdir):
    d = Directories()
    p = Path(str(tmpdir))
    d.add_path(p)
-    p.rmtree()
+    shutil.rmtree(str(p))
    eq_([], list(d.get_files()))


 def test_get_state_returns_excluded_by_default_for_hidden_directories(tmpdir):
    d = Directories()
    p = Path(str(tmpdir))
-    hidden_dir_path = p[".foo"]
-    p[".foo"].mkdir()
+    hidden_dir_path = p.joinpath(".foo")
+    p.joinpath(".foo").mkdir()
    d.add_path(p)
-    eq_(d.get_state(hidden_dir_path), DirectoryState.Excluded)
+    eq_(d.get_state(hidden_dir_path), DirectoryState.EXCLUDED)
    # But it can be overriden
-    d.set_state(hidden_dir_path, DirectoryState.Normal)
-    eq_(d.get_state(hidden_dir_path), DirectoryState.Normal)
+    d.set_state(hidden_dir_path, DirectoryState.NORMAL)
+    eq_(d.get_state(hidden_dir_path), DirectoryState.NORMAL)


 def test_default_path_state_override(tmpdir):
    # It's possible for a subclass to override the default state of a path
    class MyDirectories(Directories):
        def _default_state_for_path(self, path):
-            if "foobar" in path:
-                return DirectoryState.Excluded
+            if "foobar" in path.parts:
+                return DirectoryState.EXCLUDED
+            return DirectoryState.NORMAL

    d = MyDirectories()
    p1 = Path(str(tmpdir))
-    p1["foobar"].mkdir()
-    p1["foobar/somefile"].open("w").close()
-    p1["foobaz"].mkdir()
-    p1["foobaz/somefile"].open("w").close()
+    p1.joinpath("foobar").mkdir()
+    p1.joinpath("foobar/somefile").touch()
+    p1.joinpath("foobaz").mkdir()
+    p1.joinpath("foobaz/somefile").touch()
    d.add_path(p1)
-    eq_(d.get_state(p1["foobaz"]), DirectoryState.Normal)
-    eq_(d.get_state(p1["foobar"]), DirectoryState.Excluded)
+    eq_(d.get_state(p1.joinpath("foobaz")), DirectoryState.NORMAL)
+    eq_(d.get_state(p1.joinpath("foobar")), DirectoryState.EXCLUDED)
    eq_(len(list(d.get_files())), 1)  # only the 'foobaz' file is there
    # However, the default state can be changed
-    d.set_state(p1["foobar"], DirectoryState.Normal)
-    eq_(d.get_state(p1["foobar"]), DirectoryState.Normal)
+    d.set_state(p1.joinpath("foobar"), DirectoryState.NORMAL)
+    eq_(d.get_state(p1.joinpath("foobar")), DirectoryState.NORMAL)
    eq_(len(list(d.get_files())), 2)


-class TestExcludeList():
+class TestExcludeList:
    def setup_method(self, method):
        self.d = Directories(exclude_list=ExcludeList(union_regex=False))

    def get_files_and_expect_num_result(self, num_result):
        """Calls get_files(), get the filenames only, print for debugging.
        num_result is how many files are expected as a result."""
-        print(f"EXCLUDED REGEX: paths {self.d._exclude_list.compiled_paths} \
-files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled}")
+        print(
+            f"EXCLUDED REGEX: paths {self.d._exclude_list.compiled_paths} \
+files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled}"
+        )
        files = list(self.d.get_files())
        files = [file.name for file in files]
        print(f"FINAL FILES {files}")
@ -365,42 +366,42 @@ files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled
        self.d._exclude_list.add(regex)
        self.d._exclude_list.mark(regex)
        p1 = Path(str(tmpdir))
-        p1["$Recycle.Bin"].mkdir()
-        p1["$Recycle.Bin"]["subdir"].mkdir()
+        p1.joinpath("$Recycle.Bin").mkdir()
+        p1.joinpath("$Recycle.Bin", "subdir").mkdir()
        self.d.add_path(p1)
-        eq_(self.d.get_state(p1["$Recycle.Bin"]), DirectoryState.Excluded)
-        # By default, subdirs should be excluded too, but this can be overriden separately
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Excluded)
-        self.d.set_state(p1["$Recycle.Bin"]["subdir"], DirectoryState.Normal)
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin")), DirectoryState.EXCLUDED)
+        # By default, subdirs should be excluded too, but this can be overridden separately
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdir")), DirectoryState.EXCLUDED)
+        self.d.set_state(p1.joinpath("$Recycle.Bin", "subdir"), DirectoryState.NORMAL)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdir")), DirectoryState.NORMAL)

    def test_exclude_refined(self, tmpdir):
        regex1 = r"^\$Recycle\.Bin$"
        self.d._exclude_list.add(regex1)
        self.d._exclude_list.mark(regex1)
        p1 = Path(str(tmpdir))
-        p1["$Recycle.Bin"].mkdir()
-        p1["$Recycle.Bin"]["somefile.png"].open("w").close()
-        p1["$Recycle.Bin"]["some_unwanted_file.jpg"].open("w").close()
-        p1["$Recycle.Bin"]["subdir"].mkdir()
-        p1["$Recycle.Bin"]["subdir"]["somesubdirfile.png"].open("w").close()
-        p1["$Recycle.Bin"]["subdir"]["unwanted_subdirfile.gif"].open("w").close()
-        p1["$Recycle.Bin"]["subdar"].mkdir()
-        p1["$Recycle.Bin"]["subdar"]["somesubdarfile.jpeg"].open("w").close()
-        p1["$Recycle.Bin"]["subdar"]["unwanted_subdarfile.png"].open("w").close()
-        self.d.add_path(p1["$Recycle.Bin"])
+        p1.joinpath("$Recycle.Bin").mkdir()
+        p1.joinpath("$Recycle.Bin", "somefile.png").touch()
+        p1.joinpath("$Recycle.Bin", "some_unwanted_file.jpg").touch()
+        p1.joinpath("$Recycle.Bin", "subdir").mkdir()
+        p1.joinpath("$Recycle.Bin", "subdir", "somesubdirfile.png").touch()
+        p1.joinpath("$Recycle.Bin", "subdir", "unwanted_subdirfile.gif").touch()
+        p1.joinpath("$Recycle.Bin", "subdar").mkdir()
+        p1.joinpath("$Recycle.Bin", "subdar", "somesubdarfile.jpeg").touch()
+        p1.joinpath("$Recycle.Bin", "subdar", "unwanted_subdarfile.png").touch()
+        self.d.add_path(p1.joinpath("$Recycle.Bin"))

        # Filter should set the default state to Excluded
-        eq_(self.d.get_state(p1["$Recycle.Bin"]), DirectoryState.Excluded)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin")), DirectoryState.EXCLUDED)
        # The subdir should inherit its parent state
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Excluded)
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdar"]), DirectoryState.Excluded)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdir")), DirectoryState.EXCLUDED)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdar")), DirectoryState.EXCLUDED)
        # Override a child path's state
-        self.d.set_state(p1["$Recycle.Bin"]["subdir"], DirectoryState.Normal)
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal)
+        self.d.set_state(p1.joinpath("$Recycle.Bin", "subdir"), DirectoryState.NORMAL)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdir")), DirectoryState.NORMAL)
        # Parent should keep its default state, and the other child too
-        eq_(self.d.get_state(p1["$Recycle.Bin"]), DirectoryState.Excluded)
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdar"]), DirectoryState.Excluded)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin")), DirectoryState.EXCLUDED)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdar")), DirectoryState.EXCLUDED)
        # print(f"get_folders(): {[x for x in self.d.get_folders()]}")

        # only the 2 files directly under the Normal directory
@ -412,8 +413,8 @@ files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled
        assert "somesubdirfile.png" in files
        assert "unwanted_subdirfile.gif" in files
        # Overriding the parent should enable all children
-        self.d.set_state(p1["$Recycle.Bin"], DirectoryState.Normal)
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdar"]), DirectoryState.Normal)
+        self.d.set_state(p1.joinpath("$Recycle.Bin"), DirectoryState.NORMAL)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdar")), DirectoryState.NORMAL)
        # all files there
        files = self.get_files_and_expect_num_result(6)
        assert "somefile.png" in files
@ -437,7 +438,7 @@ files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled
        assert self.d._exclude_list.error(regex3) is None
        # print(f"get_folders(): {[x for x in self.d.get_folders()]}")
        # Directory shouldn't change its state here, unless explicitely done by user
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdir")), DirectoryState.NORMAL)
        files = self.get_files_and_expect_num_result(5)
        assert "unwanted_subdirfile.gif" not in files
        assert "unwanted_subdarfile.png" in files
@ -446,15 +447,15 @@ files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled
        regex4 = r".*subdir$"
        self.d._exclude_list.rename(regex3, regex4)
        assert self.d._exclude_list.error(regex4) is None
-        p1["$Recycle.Bin"]["subdar"]["file_ending_with_subdir"].open("w").close()
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Excluded)
+        p1.joinpath("$Recycle.Bin", "subdar", "file_ending_with_subdir").touch()
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdir")), DirectoryState.EXCLUDED)
        files = self.get_files_and_expect_num_result(4)
        assert "file_ending_with_subdir" not in files
        assert "somesubdarfile.jpeg" in files
        assert "somesubdirfile.png" not in files
        assert "unwanted_subdirfile.gif" not in files
-        self.d.set_state(p1["$Recycle.Bin"]["subdir"], DirectoryState.Normal)
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal)
+        self.d.set_state(p1.joinpath("$Recycle.Bin", "subdir"), DirectoryState.NORMAL)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdir")), DirectoryState.NORMAL)
        # print(f"get_folders(): {[x for x in self.d.get_folders()]}")
        files = self.get_files_and_expect_num_result(6)
        assert "file_ending_with_subdir" not in files
@ -464,31 +465,54 @@ files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled
        regex5 = r".*subdir.*"
        self.d._exclude_list.rename(regex4, regex5)
        # Files containing substring should be filtered
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdir")), DirectoryState.NORMAL)
        # The path should not match, only the filename, the "subdir" in the directory name shouldn't matter
-        p1["$Recycle.Bin"]["subdir"]["file_which_shouldnt_match"].open("w").close()
+        p1.joinpath("$Recycle.Bin", "subdir", "file_which_shouldnt_match").touch()
        files = self.get_files_and_expect_num_result(5)
        assert "somesubdirfile.png" not in files
        assert "unwanted_subdirfile.gif" not in files
        assert "file_ending_with_subdir" not in files
        assert "file_which_shouldnt_match" in files

+        # This should match the directory only
+        regex6 = r".*/.*subdir.*/.*"
+        if ISWINDOWS:
+            regex6 = r".*\\.*subdir.*\\.*"
+        assert os.sep in regex6
+        self.d._exclude_list.rename(regex5, regex6)
+        self.d._exclude_list.remove(regex1)
+        eq_(len(self.d._exclude_list.compiled), 1)
+        assert regex1 not in self.d._exclude_list
+        assert regex5 not in self.d._exclude_list
+        assert self.d._exclude_list.error(regex6) is None
+        assert regex6 in self.d._exclude_list
+        # This still should not be affected
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "subdir")), DirectoryState.NORMAL)
+        files = self.get_files_and_expect_num_result(5)
+        # These files are under the "/subdir" directory
+        assert "somesubdirfile.png" not in files
+        assert "unwanted_subdirfile.gif" not in files
+        # This file under the "subdar" directory should not be filtered out
+        assert "file_ending_with_subdir" in files
+        # This file is in a directory that should be filtered out
+        assert "file_which_shouldnt_match" not in files
+
    def test_japanese_unicode(self, tmpdir):
        p1 = Path(str(tmpdir))
-        p1["$Recycle.Bin"].mkdir()
-        p1["$Recycle.Bin"]["somerecycledfile.png"].open("w").close()
-        p1["$Recycle.Bin"]["some_unwanted_file.jpg"].open("w").close()
-        p1["$Recycle.Bin"]["subdir"].mkdir()
-        p1["$Recycle.Bin"]["subdir"]["過去白濁物語～]_カラー.jpg"].open("w").close()
-        p1["$Recycle.Bin"]["思叫物語"].mkdir()
-        p1["$Recycle.Bin"]["思叫物語"]["なししろ会う前"].open("w").close()
-        p1["$Recycle.Bin"]["思叫物語"]["堂～ロ"].open("w").close()
-        self.d.add_path(p1["$Recycle.Bin"])
+        p1.joinpath("$Recycle.Bin").mkdir()
+        p1.joinpath("$Recycle.Bin", "somerecycledfile.png").touch()
+        p1.joinpath("$Recycle.Bin", "some_unwanted_file.jpg").touch()
+        p1.joinpath("$Recycle.Bin", "subdir").mkdir()
+        p1.joinpath("$Recycle.Bin", "subdir", "過去白濁物語～]_カラー.jpg").touch()
+        p1.joinpath("$Recycle.Bin", "思叫物語").mkdir()
+        p1.joinpath("$Recycle.Bin", "思叫物語", "なししろ会う前").touch()
+        p1.joinpath("$Recycle.Bin", "思叫物語", "堂～ロ").touch()
+        self.d.add_path(p1.joinpath("$Recycle.Bin"))
        regex3 = r".*物語.*"
        self.d._exclude_list.add(regex3)
        self.d._exclude_list.mark(regex3)
        # print(f"get_folders(): {[x for x in self.d.get_folders()]}")
-        eq_(self.d.get_state(p1["$Recycle.Bin"]["思叫物語"]), DirectoryState.Excluded)
+        eq_(self.d.get_state(p1.joinpath("$Recycle.Bin", "思叫物語")), DirectoryState.EXCLUDED)
        files = self.get_files_and_expect_num_result(2)
        assert "過去白濁物語～]_カラー.jpg" not in files
        assert "なししろ会う前" not in files
@ -497,7 +521,7 @@ files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled
        regex4 = r".*物語$"
        self.d._exclude_list.rename(regex3, regex4)
        assert self.d._exclude_list.error(regex4) is None
-        self.d.set_state(p1["$Recycle.Bin"]["思叫物語"], DirectoryState.Normal)
+        self.d.set_state(p1.joinpath("$Recycle.Bin", "思叫物語"), DirectoryState.NORMAL)
        files = self.get_files_and_expect_num_result(5)
        assert "過去白濁物語～]_カラー.jpg" in files
        assert "なししろ会う前" in files
@ -509,15 +533,15 @@ files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled
        self.d._exclude_list.add(regex)
        self.d._exclude_list.mark(regex)
        p1 = Path(str(tmpdir))
-        p1["foobar"].mkdir()
-        p1["foobar"][".hidden_file.txt"].open("w").close()
-        p1["foobar"][".hidden_dir"].mkdir()
-        p1["foobar"][".hidden_dir"]["foobar.jpg"].open("w").close()
-        p1["foobar"][".hidden_dir"][".hidden_subfile.png"].open("w").close()
-        self.d.add_path(p1["foobar"])
+        p1.joinpath("foobar").mkdir()
+        p1.joinpath("foobar", ".hidden_file.txt").touch()
+        p1.joinpath("foobar", ".hidden_dir").mkdir()
+        p1.joinpath("foobar", ".hidden_dir", "foobar.jpg").touch()
+        p1.joinpath("foobar", ".hidden_dir", ".hidden_subfile.png").touch()
+        self.d.add_path(p1.joinpath("foobar"))
        # It should not inherit its parent's state originally
-        eq_(self.d.get_state(p1["foobar"][".hidden_dir"]), DirectoryState.Excluded)
-        self.d.set_state(p1["foobar"][".hidden_dir"], DirectoryState.Normal)
+        eq_(self.d.get_state(p1.joinpath("foobar", ".hidden_dir")), DirectoryState.EXCLUDED)
+        self.d.set_state(p1.joinpath("foobar", ".hidden_dir"), DirectoryState.NORMAL)
        # The files should still be filtered
        files = self.get_files_and_expect_num_result(1)
        eq_(len(self.d._exclude_list.compiled_paths), 0)
--- a/core/tests/engine_test.py
+++ b/core/tests/engine_test.py
@ -10,9 +10,9 @@ from hscommon.jobprogress import job
 from hscommon.util import first
 from hscommon.testutil import eq_, log_calls

-from .base import NamedObject
-from .. import engine
-from ..engine import (
+from core.tests.base import NamedObject
+from core import engine
+from core.engine import (
    get_match,
    getwords,
    Group,
@ -69,6 +69,13 @@ class TestCasegetwords:
        eq_(["a", "b", "c", "d"], getwords("a b c d"))
        eq_(["a", "b", "c", "d"], getwords(" a  b  c d "))

+    def test_unicode(self):
+        eq_(["e", "c", "0", "a", "o", "u", "e", "u"], getwords("é ç 0 à ö û è ¤ ù"))
+        eq_(
+            ["02", "君のこころは輝いてるかい？", "国木田花丸", "solo", "ver"],
+            getwords("02 君のこころは輝いてるかい？ 国木田花丸 Solo Ver"),
+        )
+
    def test_splitter_chars(self):
        eq_(
            [chr(i) for i in range(ord("a"), ord("z") + 1)],
@ -85,7 +92,7 @@ class TestCasegetwords:
        eq_(["foo", "bar"], getwords("FOO BAR"))

    def test_decompose_unicode(self):
-        eq_(getwords("foo\xe9bar"), ["fooebar"])
+        eq_(["fooebar"], getwords("foo\xe9bar"))


 class TestCasegetfields:
@ -99,10 +106,9 @@ class TestCasegetfields:
        expected = [["a", "bc", "def"]]
        actual = getfields(" - a bc def")
        eq_(expected, actual)
-        expected = [["bc", "def"]]


-class TestCaseunpack_fields:
+class TestCaseUnpackFields:
    def test_with_fields(self):
        expected = ["a", "b", "c", "d", "e", "f"]
        actual = unpack_fields([["a"], ["b", "c"], ["d", "e", "f"]])
@ -173,9 +179,7 @@ class TestCaseWordCompareWithFields:
    def test_simple(self):
        eq_(
            67,
-            compare_fields(
-                [["a", "b"], ["c", "d", "e"]], [["a", "b"], ["c", "d", "f"]]
-            ),
+            compare_fields([["a", "b"], ["c", "d", "e"]], [["a", "b"], ["c", "d", "f"]]),
        )

    def test_empty(self):
@ -216,24 +220,24 @@ class TestCaseWordCompareWithFields:
        eq_([["c", "d", "f"], ["a", "b"]], second)


-class TestCasebuild_word_dict:
+class TestCaseBuildWordDict:
    def test_with_standard_words(self):
-        itemList = [NamedObject("foo bar", True)]
-        itemList.append(NamedObject("bar baz", True))
-        itemList.append(NamedObject("baz bleh foo", True))
-        d = build_word_dict(itemList)
+        item_list = [NamedObject("foo bar", True)]
+        item_list.append(NamedObject("bar baz", True))
+        item_list.append(NamedObject("baz bleh foo", True))
+        d = build_word_dict(item_list)
        eq_(4, len(d))
        eq_(2, len(d["foo"]))
-        assert itemList[0] in d["foo"]
-        assert itemList[2] in d["foo"]
+        assert item_list[0] in d["foo"]
+        assert item_list[2] in d["foo"]
        eq_(2, len(d["bar"]))
-        assert itemList[0] in d["bar"]
-        assert itemList[1] in d["bar"]
+        assert item_list[0] in d["bar"]
+        assert item_list[1] in d["bar"]
        eq_(2, len(d["baz"]))
-        assert itemList[1] in d["baz"]
-        assert itemList[2] in d["baz"]
+        assert item_list[1] in d["baz"]
+        assert item_list[2] in d["baz"]
        eq_(1, len(d["bleh"]))
-        assert itemList[2] in d["bleh"]
+        assert item_list[2] in d["bleh"]

    def test_unpack_fields(self):
        o = NamedObject("")
@ -261,31 +265,29 @@ class TestCasebuild_word_dict:
        j = job.Job(1, do_progress)
        self.log = []
        s = "foo bar"
-        build_word_dict(
-            [NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j
-        )
+        build_word_dict([NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j)
        # We don't have intermediate log because iter_with_progress is called with every > 1
        eq_(0, self.log[0])
        eq_(100, self.log[1])


-class TestCasemerge_similar_words:
+class TestCaseMergeSimilarWords:
    def test_some_similar_words(self):
        d = {
-            "foobar": set([1]),
-            "foobar1": set([2]),
-            "foobar2": set([3]),
+            "foobar": {1},
+            "foobar1": {2},
+            "foobar2": {3},
        }
        merge_similar_words(d)
        eq_(1, len(d))
        eq_(3, len(d["foobar"]))


-class TestCasereduce_common_words:
+class TestCaseReduceCommonWords:
    def test_typical(self):
        d = {
-            "foo": set([NamedObject("foo bar", True) for i in range(50)]),
-            "bar": set([NamedObject("foo bar", True) for i in range(49)]),
+            "foo": {NamedObject("foo bar", True) for _ in range(50)},
+            "bar": {NamedObject("foo bar", True) for _ in range(49)},
        }
        reduce_common_words(d, 50)
        assert "foo" not in d
@ -293,11 +295,8 @@ class TestCasereduce_common_words:

    def test_dont_remove_objects_with_only_common_words(self):
        d = {
-            "common": set(
-                [NamedObject("common uncommon", True) for i in range(50)]
-                + [NamedObject("common", True)]
-            ),
-            "uncommon": set([NamedObject("common uncommon", True)]),
+            "common": set([NamedObject("common uncommon", True) for _ in range(50)] + [NamedObject("common", True)]),
+            "uncommon": {NamedObject("common uncommon", True)},
        }
        reduce_common_words(d, 50)
        eq_(1, len(d["common"]))
@ -305,23 +304,20 @@ class TestCasereduce_common_words:

    def test_values_still_are_set_instances(self):
        d = {
-            "common": set(
-                [NamedObject("common uncommon", True) for i in range(50)]
-                + [NamedObject("common", True)]
-            ),
-            "uncommon": set([NamedObject("common uncommon", True)]),
+            "common": set([NamedObject("common uncommon", True) for _ in range(50)] + [NamedObject("common", True)]),
+            "uncommon": {NamedObject("common uncommon", True)},
        }
        reduce_common_words(d, 50)
        assert isinstance(d["common"], set)
        assert isinstance(d["uncommon"], set)

-    def test_dont_raise_KeyError_when_a_word_has_been_removed(self):
+    def test_dont_raise_keyerror_when_a_word_has_been_removed(self):
        # If a word has been removed by the reduce, an object in a subsequent common word that
        # contains the word that has been removed would cause a KeyError.
        d = {
-            "foo": set([NamedObject("foo bar baz", True) for i in range(50)]),
-            "bar": set([NamedObject("foo bar baz", True) for i in range(50)]),
-            "baz": set([NamedObject("foo bar baz", True) for i in range(49)]),
+            "foo": {NamedObject("foo bar baz", True) for _ in range(50)},
+            "bar": {NamedObject("foo bar baz", True) for _ in range(50)},
+            "baz": {NamedObject("foo bar baz", True) for _ in range(49)},
        }
        try:
            reduce_common_words(d, 50)
@ -335,7 +331,7 @@ class TestCasereduce_common_words:
            o.words = [["foo", "bar"], ["baz"]]
            return o

-        d = {"foo": set([create_it() for i in range(50)])}
+        d = {"foo": {create_it() for _ in range(50)}}
        try:
            reduce_common_words(d, 50)
        except TypeError:
@ -348,13 +344,9 @@ class TestCasereduce_common_words:
        # would not stay in 'bar' because 'foo' is not a common word anymore.
        only_common = NamedObject("foo bar", True)
        d = {
-            "foo": set(
-                [NamedObject("foo bar baz", True) for i in range(49)] + [only_common]
-            ),
-            "bar": set(
-                [NamedObject("foo bar baz", True) for i in range(49)] + [only_common]
-            ),
-            "baz": set([NamedObject("foo bar baz", True) for i in range(49)]),
+            "foo": set([NamedObject("foo bar baz", True) for _ in range(49)] + [only_common]),
+            "bar": set([NamedObject("foo bar baz", True) for _ in range(49)] + [only_common]),
+            "baz": {NamedObject("foo bar baz", True) for _ in range(49)},
        }
        reduce_common_words(d, 50)
        eq_(1, len(d["foo"]))
@ -362,7 +354,7 @@ class TestCasereduce_common_words:
        eq_(49, len(d["baz"]))


-class TestCaseget_match:
+class TestCaseGetMatch:
    def test_simple(self):
        o1 = NamedObject("foo bar", True)
        o2 = NamedObject("bar bleh", True)
@ -382,9 +374,7 @@ class TestCaseget_match:
        assert object() not in m

    def test_word_weight(self):
-        m = get_match(
-            NamedObject("foo bar", True), NamedObject("bar bleh", True), (WEIGHT_WORDS,)
-        )
+        m = get_match(NamedObject("foo bar", True), NamedObject("bar bleh", True), (WEIGHT_WORDS,))
        eq_(m.percentage, int((6.0 / 13.0) * 100))


@ -393,12 +383,12 @@ class TestCaseGetMatches:
        eq_(getmatches([]), [])

    def test_simple(self):
-        itemList = [
+        item_list = [
            NamedObject("foo bar"),
            NamedObject("bar bleh"),
            NamedObject("a b c foo"),
        ]
-        r = getmatches(itemList)
+        r = getmatches(item_list)
        eq_(2, len(r))
        m = first(m for m in r if m.percentage == 50)  # "foo bar" and "bar bleh"
        assert_match(m, "foo bar", "bar bleh")
@ -406,40 +396,40 @@ class TestCaseGetMatches:
        assert_match(m, "foo bar", "a b c foo")

    def test_null_and_unrelated_objects(self):
-        itemList = [
+        item_list = [
            NamedObject("foo bar"),
            NamedObject("bar bleh"),
            NamedObject(""),
            NamedObject("unrelated object"),
        ]
-        r = getmatches(itemList)
+        r = getmatches(item_list)
        eq_(len(r), 1)
        m = r[0]
        eq_(m.percentage, 50)
        assert_match(m, "foo bar", "bar bleh")

    def test_twice_the_same_word(self):
-        itemList = [NamedObject("foo foo bar"), NamedObject("bar bleh")]
-        r = getmatches(itemList)
+        item_list = [NamedObject("foo foo bar"), NamedObject("bar bleh")]
+        r = getmatches(item_list)
        eq_(1, len(r))

    def test_twice_the_same_word_when_preworded(self):
-        itemList = [NamedObject("foo foo bar", True), NamedObject("bar bleh", True)]
-        r = getmatches(itemList)
+        item_list = [NamedObject("foo foo bar", True), NamedObject("bar bleh", True)]
+        r = getmatches(item_list)
        eq_(1, len(r))

    def test_two_words_match(self):
-        itemList = [NamedObject("foo bar"), NamedObject("foo bar bleh")]
-        r = getmatches(itemList)
+        item_list = [NamedObject("foo bar"), NamedObject("foo bar bleh")]
+        r = getmatches(item_list)
        eq_(1, len(r))

    def test_match_files_with_only_common_words(self):
        # If a word occurs more than 50 times, it is excluded from the matching process
        # The problem with the common_word_threshold is that the files containing only common
        # words will never be matched together. We *should* match them.
-        # This test assumes that the common word threashold const is 50
-        itemList = [NamedObject("foo") for i in range(50)]
-        r = getmatches(itemList)
+        # This test assumes that the common word threshold const is 50
+        item_list = [NamedObject("foo") for _ in range(50)]
+        r = getmatches(item_list)
        eq_(1225, len(r))

    def test_use_words_already_there_if_there(self):
@ -462,28 +452,28 @@ class TestCaseGetMatches:
        eq_(100, self.log[-1])

    def test_weight_words(self):
-        itemList = [NamedObject("foo bar"), NamedObject("bar bleh")]
-        m = getmatches(itemList, weight_words=True)[0]
+        item_list = [NamedObject("foo bar"), NamedObject("bar bleh")]
+        m = getmatches(item_list, weight_words=True)[0]
        eq_(int((6.0 / 13.0) * 100), m.percentage)

    def test_similar_word(self):
-        itemList = [NamedObject("foobar"), NamedObject("foobars")]
-        eq_(len(getmatches(itemList, match_similar_words=True)), 1)
-        eq_(getmatches(itemList, match_similar_words=True)[0].percentage, 100)
-        itemList = [NamedObject("foobar"), NamedObject("foo")]
-        eq_(len(getmatches(itemList, match_similar_words=True)), 0)  # too far
-        itemList = [NamedObject("bizkit"), NamedObject("bizket")]
-        eq_(len(getmatches(itemList, match_similar_words=True)), 1)
-        itemList = [NamedObject("foobar"), NamedObject("foosbar")]
-        eq_(len(getmatches(itemList, match_similar_words=True)), 1)
+        item_list = [NamedObject("foobar"), NamedObject("foobars")]
+        eq_(len(getmatches(item_list, match_similar_words=True)), 1)
+        eq_(getmatches(item_list, match_similar_words=True)[0].percentage, 100)
+        item_list = [NamedObject("foobar"), NamedObject("foo")]
+        eq_(len(getmatches(item_list, match_similar_words=True)), 0)  # too far
+        item_list = [NamedObject("bizkit"), NamedObject("bizket")]
+        eq_(len(getmatches(item_list, match_similar_words=True)), 1)
+        item_list = [NamedObject("foobar"), NamedObject("foosbar")]
+        eq_(len(getmatches(item_list, match_similar_words=True)), 1)

    def test_single_object_with_similar_words(self):
-        itemList = [NamedObject("foo foos")]
-        eq_(len(getmatches(itemList, match_similar_words=True)), 0)
+        item_list = [NamedObject("foo foos")]
+        eq_(len(getmatches(item_list, match_similar_words=True)), 0)

    def test_double_words_get_counted_only_once(self):
-        itemList = [NamedObject("foo bar foo bleh"), NamedObject("foo bar bleh bar")]
-        m = getmatches(itemList)[0]
+        item_list = [NamedObject("foo bar foo bleh"), NamedObject("foo bar bleh bar")]
+        m = getmatches(item_list)[0]
        eq_(75, m.percentage)

    def test_with_fields(self):
@ -503,13 +493,13 @@ class TestCaseGetMatches:
        eq_(m.percentage, 50)

    def test_only_match_similar_when_the_option_is_set(self):
-        itemList = [NamedObject("foobar"), NamedObject("foobars")]
-        eq_(len(getmatches(itemList, match_similar_words=False)), 0)
+        item_list = [NamedObject("foobar"), NamedObject("foobars")]
+        eq_(len(getmatches(item_list, match_similar_words=False)), 0)

    def test_dont_recurse_do_match(self):
        # with nosetests, the stack is increased. The number has to be high enough not to be failing falsely
        sys.setrecursionlimit(200)
-        files = [NamedObject("foo bar") for i in range(201)]
+        files = [NamedObject("foo bar") for _ in range(201)]
        try:
            getmatches(files)
        except RuntimeError:
@ -518,38 +508,60 @@ class TestCaseGetMatches:
            sys.setrecursionlimit(1000)

    def test_min_match_percentage(self):
-        itemList = [
+        item_list = [
            NamedObject("foo bar"),
            NamedObject("bar bleh"),
            NamedObject("a b c foo"),
        ]
-        r = getmatches(itemList, min_match_percentage=50)
+        r = getmatches(item_list, min_match_percentage=50)
        eq_(1, len(r))  # Only "foo bar" / "bar bleh" should match

-    def test_MemoryError(self, monkeypatch):
+    def test_memory_error(self, monkeypatch):
        @log_calls
        def mocked_match(first, second, flags):
            if len(mocked_match.calls) > 42:
                raise MemoryError()
            return Match(first, second, 0)

-        objects = [NamedObject() for i in range(10)]  # results in 45 matches
+        objects = [NamedObject() for _ in range(10)]  # results in 45 matches
        monkeypatch.setattr(engine, "get_match", mocked_match)
        try:
            r = getmatches(objects)
        except MemoryError:
-            self.fail("MemorryError must be handled")
+            self.fail("MemoryError must be handled")
        eq_(42, len(r))


 class TestCaseGetMatchesByContents:
-    def test_dont_compare_empty_files(self):
-        o1, o2 = no(size=0), no(size=0)
-        assert not getmatches_by_contents([o1, o2])
+    def test_big_file_partial_hashing(self):
+        smallsize = 1
+        bigsize = 100 * 1024 * 1024  # 100MB
+        f = [
+            no("bigfoo", size=bigsize),
+            no("bigbar", size=bigsize),
+            no("smallfoo", size=smallsize),
+            no("smallbar", size=smallsize),
+        ]
+        f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
+        f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
+        f[2].digest = f[2].digest_partial = "bleh"
+        f[3].digest = f[3].digest_partial = "bleh"
+        r = getmatches_by_contents(f, bigsize=bigsize)
+        eq_(len(r), 2)
+        # With bigsize=0 the big-file optimization is disabled, so digests are computed as usual
+        r = getmatches_by_contents(f, bigsize=0)
+        eq_(len(r), 2)
+        # The second big file is now slightly different; its digest_partial is still the same
+        f[1].digest = f[1].digest_samples = "foobardiff"
+        r = getmatches_by_contents(f, bigsize=bigsize)
+        # The now-differing big file should be successfully filtered out
+        eq_(len(r), 1)
+        r = getmatches_by_contents(f, bigsize=0)
+        eq_(len(r), 1)


 class TestCaseGroup:
-    def test_empy(self):
+    def test_empty(self):
        g = Group()
        eq_(None, g.ref)
        eq_([], g.dupes)
@ -723,8 +735,7 @@ class TestCaseGroup:
        # if the ref has the same key as one or more of the dupe, run the tie_breaker func among them
        g = get_test_group()
        o1, o2, o3 = g.ordered
-        tie_breaker = lambda ref, dupe: dupe is o3
-        g.prioritize(lambda x: 0, tie_breaker)
+        g.prioritize(lambda x: 0, lambda ref, dupe: dupe is o3)
        assert g.ref is o3

    def test_prioritize_with_tie_breaker_runs_on_all_dupes(self):
@ -735,8 +746,7 @@ class TestCaseGroup:
        o1.foo = 1
        o2.foo = 2
        o3.foo = 3
-        tie_breaker = lambda ref, dupe: dupe.foo > ref.foo
-        g.prioritize(lambda x: 0, tie_breaker)
+        g.prioritize(lambda x: 0, lambda ref, dupe: dupe.foo > ref.foo)
        assert g.ref is o3

    def test_prioritize_with_tie_breaker_runs_only_on_tie_dupes(self):
@ -749,9 +759,7 @@ class TestCaseGroup:
        o1.bar = 1
        o2.bar = 2
        o3.bar = 3
-        key_func = lambda x: -x.foo
-        tie_breaker = lambda ref, dupe: dupe.bar > ref.bar
-        g.prioritize(key_func, tie_breaker)
+        g.prioritize(lambda x: -x.foo, lambda ref, dupe: dupe.bar > ref.bar)
        assert g.ref is o2

    def test_prioritize_with_ref_dupe(self):
@ -792,14 +800,14 @@ class TestCaseGroup:
        eq_(0, len(g.candidates))


-class TestCaseget_groups:
+class TestCaseGetGroups:
    def test_empty(self):
        r = get_groups([])
        eq_([], r)

    def test_simple(self):
-        itemList = [NamedObject("foo bar"), NamedObject("bar bleh")]
-        matches = getmatches(itemList)
+        item_list = [NamedObject("foo bar"), NamedObject("bar bleh")]
+        matches = getmatches(item_list)
        m = matches[0]
        r = get_groups(matches)
        eq_(1, len(r))
@ -809,15 +817,15 @@ class TestCaseget_groups:

    def test_group_with_multiple_matches(self):
        # This results in 3 matches
-        itemList = [NamedObject("foo"), NamedObject("foo"), NamedObject("foo")]
-        matches = getmatches(itemList)
+        item_list = [NamedObject("foo"), NamedObject("foo"), NamedObject("foo")]
+        matches = getmatches(item_list)
        r = get_groups(matches)
        eq_(1, len(r))
        g = r[0]
        eq_(3, len(g))

    def test_must_choose_a_group(self):
-        itemList = [
+        item_list = [
            NamedObject("a b"),
            NamedObject("a b"),
            NamedObject("b c"),
@ -826,13 +834,13 @@ class TestCaseget_groups:
        ]
        # There will be 2 groups here: group "a b" and group "c d"
        # "b c" can go either of them, but not both.
-        matches = getmatches(itemList)
+        matches = getmatches(item_list)
        r = get_groups(matches)
        eq_(2, len(r))
        eq_(5, len(r[0]) + len(r[1]))

    def test_should_all_go_in_the_same_group(self):
-        itemList = [
+        item_list = [
            NamedObject("a b"),
            NamedObject("a b"),
            NamedObject("a b"),
@ -840,7 +848,7 @@ class TestCaseget_groups:
        ]
        # There will be 2 groups here: group "a b" and group "c d"
        # "b c" can fit in both, but it must be in only one of them
-        matches = getmatches(itemList)
+        matches = getmatches(item_list)
        r = get_groups(matches)
        eq_(1, len(r))

@ -859,8 +867,8 @@ class TestCaseget_groups:
        assert o3 in g

    def test_four_sized_group(self):
-        itemList = [NamedObject("foobar") for i in range(4)]
-        m = getmatches(itemList)
+        item_list = [NamedObject("foobar") for _ in range(4)]
+        m = getmatches(item_list)
        r = get_groups(m)
        eq_(1, len(r))
        eq_(4, len(r[0]))
@ -879,13 +887,11 @@ class TestCaseget_groups:
        # If, with a (A, B, C, D) set, all match with A, but C and D don't match with B and that the
        # (A, B) match is the highest (thus resulting in an (A, B) group), still match C and D
        # in a separate group instead of discarding them.
-        A, B, C, D = [NamedObject() for _ in range(4)]
+        A, B, C, D = (NamedObject() for _ in range(4))
        m1 = Match(A, B, 90)  # This is the strongest "A" match
        m2 = Match(A, C, 80)  # Because C doesn't match with B, it won't be in the group
        m3 = Match(A, D, 80)  # Same thing for D
-        m4 = Match(
-            C, D, 70
-        )  # However, because C and D match, they should have their own group.
+        m4 = Match(C, D, 70)  # However, because C and D match, they should have their own group.
        groups = get_groups([m1, m2, m3, m4])
        eq_(len(groups), 2)
        g1, g2 = groups
--- a/core/tests/exclude_test.py
+++ b/core/tests/exclude_test.py
@ -5,16 +5,13 @@
 # http://www.gnu.org/licenses/gpl-3.0.html

 import io
-# import os.path as op
-
 from xml.etree import ElementTree as ET

-# from pytest import raises
 from hscommon.testutil import eq_
 from hscommon.plat import ISWINDOWS

-from .base import DupeGuru
-from ..exclude import ExcludeList, ExcludeDict, default_regexes, AlreadyThereException
+from core.tests.base import DupeGuru
+from core.exclude import ExcludeList, ExcludeDict, default_regexes, AlreadyThereException

 from re import error

@ -104,7 +101,7 @@ class TestCaseListEmpty:
        regex1 = r"one"
        regex2 = r"two"
        self.exclude_list.add(regex1)
-        assert(regex1 in self.exclude_list)
+        assert regex1 in self.exclude_list
        self.exclude_list.add(regex2)
        self.exclude_list.mark(regex1)
        self.exclude_list.mark(regex2)
@ -113,17 +110,17 @@ class TestCaseListEmpty:
        compiled_files = [x for x in self.exclude_list.compiled_files]
        eq_(len(compiled_files), 2)
        self.exclude_list.remove(regex2)
-        assert(regex2 not in self.exclude_list)
+        assert regex2 not in self.exclude_list
        eq_(len(self.exclude_list), 1)

    def test_add_duplicate(self):
        self.exclude_list.add(r"one")
-        eq_(1 , len(self.exclude_list))
+        eq_(1, len(self.exclude_list))
        try:
            self.exclude_list.add(r"one")
        except Exception:
            pass
-        eq_(1 , len(self.exclude_list))
+        eq_(1, len(self.exclude_list))

    def test_add_not_compilable(self):
        # Trying to add a non-valid regex should not work and raise exception
@ -143,11 +140,7 @@ class TestCaseListEmpty:
    def test_force_add_not_compilable(self):
        """Used when loading from XML for example"""
        regex = r"one))"
-        try:
-            self.exclude_list.add(regex, forced=True)
-        except Exception as e:
-            # Should not get an exception here unless it's a duplicate regex
-            raise e
+        self.exclude_list.add(regex, forced=True)
        marked = self.exclude_list.mark(regex)
        eq_(marked, False)  # can't be marked since not compilable
        eq_(len(self.exclude_list), 1)
@ -188,6 +181,28 @@ class TestCaseListEmpty:
        self.exclude_list.rename(regex_renamed_compilable, regex_compilable)
        eq_(self.exclude_list.is_marked(regex_compilable), True)

+    def test_rename_regex_file_to_path(self):
+        """Check which compiled lists hold a marked regex before and after it is renamed.
+
+        The regex containing a path separator is expected in compiled_paths and not in
+        compiled_files; after renaming it to the separator-less regex the expectation flips.
+        """
+        separator_regex = r".*\\one.*" if ISWINDOWS else r".*/one.*"
+        plain_regex = r".*one.*"
+
+        def sources(compiled_items):
+            # Reduce compiled regex objects to their source strings so they compare against str.
+            return [item.pattern for item in compiled_items]
+
+        excludes = self.exclude_list
+        excludes.add(separator_regex)
+        excludes.mark(separator_regex)
+        # Snapshot the three views first, then assert on the snapshots.
+        all_sources = sources(excludes._excluded_compiled)
+        file_sources = sources(excludes.compiled_files)
+        path_sources = sources(excludes.compiled_paths)
+        assert separator_regex in all_sources
+        assert separator_regex not in file_sources
+        assert separator_regex in path_sources
+        excludes.rename(separator_regex, plain_regex)
+        all_sources = sources(excludes._excluded_compiled)
+        file_sources = sources(excludes.compiled_files)
+        path_sources = sources(excludes.compiled_paths)
+        assert separator_regex not in all_sources
+        assert plain_regex in all_sources
+        assert plain_regex in file_sources
+        assert plain_regex not in path_sources
+
    def test_restore_default(self):
        """Only unmark previously added regexes and mark the pre-defined ones"""
        regex = r"one"
@ -208,26 +223,163 @@ class TestCaseListEmpty:
                if compiled_re.pattern == re:
                    found = True
            if not found:
-                raise(Exception(f"Default RE {re} not found in compiled list."))
-            continue
+                raise (Exception(f"Default RE {re} not found in compiled list."))
        eq_(len(default_regexes), len(self.exclude_list.compiled))


+class TestCaseListEmptyUnion(TestCaseListEmpty):
+    """Re-run the TestCaseListEmpty tests against an ExcludeList created with union_regex=True."""
+
+    def setup_method(self, method):
+        self.app = DupeGuru()
+        self.app.exclude_list = ExcludeList(union_regex=True)
+        self.exclude_list = self.app.exclude_list
+
+    def test_add_mark_and_remove_regex(self):
+        """Two marked regexes compile into a single "|"-joined pattern; removing one leaves one entry."""
+        regex1 = r"one"
+        regex2 = r"two"
+        self.exclude_list.add(regex1)
+        assert regex1 in self.exclude_list
+        self.exclude_list.add(regex2)
+        self.exclude_list.mark(regex1)
+        self.exclude_list.mark(regex2)
+        eq_(len(self.exclude_list), 2)
+        eq_(len(self.exclude_list.compiled), 1)
+        compiled_files = [x for x in self.exclude_list.compiled_files]
+        eq_(len(compiled_files), 1)  # Two patterns joined together into one
+        assert "|" in compiled_files[0].pattern
+        self.exclude_list.remove(regex2)
+        assert regex2 not in self.exclude_list
+        eq_(len(self.exclude_list), 1)
+
+    def test_rename_regex_file_to_path(self):
+        """The separator regex is expected in compiled_paths; once renamed, in compiled_files instead."""
+        regex = r".*/one.*"
+        if ISWINDOWS:
+            regex = r".*\\one.*"
+        regex2 = r".*one.*"
+        self.exclude_list.add(regex)
+        self.exclude_list.mark(regex)
+        eq_(len([x for x in self.exclude_list]), 1)
+        compiled_re = [x.pattern for x in self.exclude_list.compiled]
+        files_re = [x.pattern for x in self.exclude_list.compiled_files]
+        paths_re = [x.pattern for x in self.exclude_list.compiled_paths]
+        assert regex in compiled_re
+        assert regex not in files_re
+        assert regex in paths_re
+        self.exclude_list.rename(regex, regex2)
+        eq_(len([x for x in self.exclude_list]), 1)
+        compiled_re = [x.pattern for x in self.exclude_list.compiled]
+        files_re = [x.pattern for x in self.exclude_list.compiled_files]
+        paths_re = [x.pattern for x in self.exclude_list.compiled_paths]
+        assert regex not in compiled_re
+        assert regex2 in compiled_re
+        assert regex2 in files_re
+        assert regex2 not in paths_re
+
+    def test_restore_default(self):
+        """Only unmark previously added regexes and mark the pre-defined ones"""
+        regex = r"one"
+        self.exclude_list.add(regex)
+        self.exclude_list.mark(regex)
+        self.exclude_list.restore_defaults()
+        eq_(len(default_regexes), self.exclude_list.marked_count)
+        # added regex shouldn't be marked
+        eq_(self.exclude_list.is_marked(regex), False)
+        # added regex shouldn't be in compiled list either
+        compiled = list(self.exclude_list.compiled)
+        # `compiled` holds compiled pattern objects, so a str can never be "in" it; compare the
+        # regex against the source strings joined into the union pattern instead.
+        union_parts = compiled[0].pattern.split("|")
+        assert regex not in union_parts
+        # Need to escape both to get the same strings after compilation
+        compiled_escaped = {x.encode("unicode-escape").decode() for x in union_parts}
+        default_escaped = {x.encode("unicode-escape").decode() for x in default_regexes}
+        assert compiled_escaped == default_escaped
+        eq_(len(default_regexes), len(union_parts))
+
 class TestCaseDictEmpty(TestCaseListEmpty):
    """Same, but with dictionary implementation"""
+
    def setup_method(self, method):
        self.app = DupeGuru()
        self.app.exclude_list = ExcludeDict(union_regex=False)
        self.exclude_list = self.app.exclude_list


+class TestCaseDictEmptyUnion(TestCaseDictEmpty):
+    """Re-run the TestCaseDictEmpty tests against an ExcludeDict created with union_regex=True."""
+
+    def setup_method(self, method):
+        self.app = DupeGuru()
+        self.app.exclude_list = ExcludeDict(union_regex=True)
+        self.exclude_list = self.app.exclude_list
+
+    def test_add_mark_and_remove_regex(self):
+        """Two marked regexes compile into a single pattern; removing one leaves one entry."""
+        regex1 = r"one"
+        regex2 = r"two"
+        self.exclude_list.add(regex1)
+        assert regex1 in self.exclude_list
+        self.exclude_list.add(regex2)
+        self.exclude_list.mark(regex1)
+        self.exclude_list.mark(regex2)
+        eq_(len(self.exclude_list), 2)
+        eq_(len(self.exclude_list.compiled), 1)
+        compiled_files = [x for x in self.exclude_list.compiled_files]
+        # two patterns joined into one
+        eq_(len(compiled_files), 1)
+        self.exclude_list.remove(regex2)
+        assert regex2 not in self.exclude_list
+        eq_(len(self.exclude_list), 1)
+
+    def test_rename_regex_file_to_path(self):
+        """The separator regex is expected in compiled_paths; once renamed, in compiled_files instead."""
+        regex = r".*/one.*"
+        if ISWINDOWS:
+            regex = r".*\\one.*"
+        regex2 = r".*one.*"
+        self.exclude_list.add(regex)
+        self.exclude_list.mark(regex)
+        # Iterating the list yields (marked, regex) pairs here; keep only the marked ones.
+        marked_re = [x for marked, x in self.exclude_list if marked]
+        eq_(len(marked_re), 1)
+        compiled_re = [x.pattern for x in self.exclude_list.compiled]
+        files_re = [x.pattern for x in self.exclude_list.compiled_files]
+        paths_re = [x.pattern for x in self.exclude_list.compiled_paths]
+        assert regex in compiled_re
+        assert regex not in files_re
+        assert regex in paths_re
+        self.exclude_list.rename(regex, regex2)
+        compiled_re = [x.pattern for x in self.exclude_list.compiled]
+        files_re = [x.pattern for x in self.exclude_list.compiled_files]
+        paths_re = [x.pattern for x in self.exclude_list.compiled_paths]
+        assert regex not in compiled_re
+        assert regex2 in compiled_re
+        assert regex2 in files_re
+        assert regex2 not in paths_re
+
+    def test_restore_default(self):
+        """Only unmark previously added regexes and mark the pre-defined ones"""
+        regex = r"one"
+        self.exclude_list.add(regex)
+        self.exclude_list.mark(regex)
+        self.exclude_list.restore_defaults()
+        eq_(len(default_regexes), self.exclude_list.marked_count)
+        # added regex shouldn't be marked
+        eq_(self.exclude_list.is_marked(regex), False)
+        # added regex shouldn't be in compiled list either
+        compiled = list(self.exclude_list.compiled)
+        # `compiled` holds compiled pattern objects, so a str can never be "in" it; compare the
+        # regex against the source strings joined into the union pattern instead.
+        union_parts = compiled[0].pattern.split("|")
+        assert regex not in union_parts
+        # Need to escape both to get the same strings after compilation
+        compiled_escaped = {x.encode("unicode-escape").decode() for x in union_parts}
+        default_escaped = {x.encode("unicode-escape").decode() for x in default_regexes}
+        assert compiled_escaped == default_escaped
+        eq_(len(default_regexes), len(union_parts))
+
 def split_union(pattern_object):
    """Returns list of strings for each union pattern"""
    return [x for x in pattern_object.pattern.split("|")]


-class TestCaseCompiledList():
+class TestCaseCompiledList:
    """Test consistency between union or and separate versions."""
+
    def setup_method(self, method):
        self.e_separate = ExcludeList(union_regex=False)
        self.e_separate.restore_defaults()
@ -275,6 +427,7 @@ class TestCaseCompiledList():

 class TestCaseCompiledDict(TestCaseCompiledList):
    """Test the dictionary version"""
+
    def setup_method(self, method):
        self.e_separate = ExcludeDict(union_regex=False)
        self.e_separate.restore_defaults()
--- a/core/tests/fs_test.py
+++ b/core/tests/fs_test.py
@ -6,13 +6,48 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-import hashlib
+import typing
+from os import urandom

-from hscommon.path import Path
+from pathlib import Path
 from hscommon.testutil import eq_
 from core.tests.directories_test import create_fake_fs

-from .. import fs
+from core import fs
+
+hasher: typing.Callable
+try:
+    import xxhash
+
+    hasher = xxhash.xxh128
+except ImportError:
+    import hashlib
+
+    hasher = hashlib.md5
+
+
+def create_fake_fs_with_random_data(rootpath):
+    """Build an "fs" directory under rootpath filled with random files and return its path.
+
+    file1.test (200KiB), file2.test (1MiB) and file3.test (10MiB) are written directly into
+    "fs"; the same bytes are written again to dir1/file1.test, dir2/file2.test and
+    dir3/file3.test respectively.
+    """
+    fs_root = rootpath.joinpath("fs")
+    fs_root.mkdir()
+    for dirname in ("dir1", "dir2", "dir3"):
+        fs_root.joinpath(dirname).mkdir()
+    layout = (
+        ("dir1", "file1.test", urandom(200 * 1024)),  # 200KiB
+        ("dir2", "file2.test", urandom(1024 * 1024)),  # 1MiB
+        ("dir3", "file3.test", urandom(10 * 1024 * 1024)),  # 10MiB
+    )
+    # Top-level files are written first, then the copies inside each sub-directory.
+    for _, filename, payload in layout:
+        with fs_root.joinpath(filename).open("wb") as fp:
+            fp.write(payload)
+    for dirname, filename, payload in layout:
+        with fs_root.joinpath(dirname, filename).open("wb") as fp:
+            fp.write(payload)
+    return fs_root


 def test_size_aggregates_subfiles(tmpdir):
@ -21,24 +56,54 @@ def test_size_aggregates_subfiles(tmpdir):
    eq_(b.size, 12)


-def test_md5_aggregate_subfiles_sorted(tmpdir):
-    # dir.allfiles can return child in any order. Thus, bundle.md5 must aggregate
-    # all files' md5 it contains, but it must make sure that it does so in the
+def test_digest_aggregate_subfiles_sorted(tmpdir):
+    # dir.allfiles can return child in any order. Thus, bundle.digest must aggregate
+    # all files' digests it contains, but it must make sure that it does so in the
    # same order everytime.
-    p = create_fake_fs(Path(str(tmpdir)))
+    p = create_fake_fs_with_random_data(Path(str(tmpdir)))
    b = fs.Folder(p)
-    md51 = fs.File(p["dir1"]["file1.test"]).md5
-    md52 = fs.File(p["dir2"]["file2.test"]).md5
-    md53 = fs.File(p["dir3"]["file3.test"]).md5
-    md54 = fs.File(p["file1.test"]).md5
-    md55 = fs.File(p["file2.test"]).md5
-    md56 = fs.File(p["file3.test"]).md5
-    # The expected md5 is the md5 of md5s for folders and the direct md5 for files
-    folder_md51 = hashlib.md5(md51).digest()
-    folder_md52 = hashlib.md5(md52).digest()
-    folder_md53 = hashlib.md5(md53).digest()
-    md5 = hashlib.md5(folder_md51 + folder_md52 + folder_md53 + md54 + md55 + md56)
-    eq_(b.md5, md5.digest())
+    digest1 = fs.File(p.joinpath("dir1", "file1.test")).digest
+    digest2 = fs.File(p.joinpath("dir2", "file2.test")).digest
+    digest3 = fs.File(p.joinpath("dir3", "file3.test")).digest
+    digest4 = fs.File(p.joinpath("file1.test")).digest
+    digest5 = fs.File(p.joinpath("file2.test")).digest
+    digest6 = fs.File(p.joinpath("file3.test")).digest
+    # The expected digest is the hash of digests for folders and the direct digest for files
+    folder_digest1 = hasher(digest1).digest()
+    folder_digest2 = hasher(digest2).digest()
+    folder_digest3 = hasher(digest3).digest()
+    digest = hasher(folder_digest1 + folder_digest2 + folder_digest3 + digest4 + digest5 + digest6).digest()
+    eq_(b.digest, digest)
+
+
+def test_partial_digest_aggregate_subfile_sorted(tmpdir):
+    """Check a Folder's digest_partial and digest_samples against values rebuilt from its files.
+
+    For each attribute the expected value is the hash of: the re-hashed values of the files in
+    dir1, dir2 and dir3, followed by the values of file1, file2 and file3 taken as they are.
+    """
+    p = create_fake_fs_with_random_data(Path(str(tmpdir)))
+    b = fs.Folder(p)
+    nested_files = (("dir1", "file1.test"), ("dir2", "file2.test"), ("dir3", "file3.test"))
+    direct_files = ("file1.test", "file2.test", "file3.test")
+    for attr in ("digest_partial", "digest_samples"):
+        nested_values = [getattr(fs.File(p.joinpath(*parts)), attr) for parts in nested_files]
+        direct_values = [getattr(fs.File(p.joinpath(name)), attr) for name in direct_files]
+        # The expected digest is the hash of digests for folders and the direct digest for files
+        folder_values = [hasher(value).digest() for value in nested_values]
+        expected = hasher(b"".join(folder_values + direct_values)).digest()
+        eq_(getattr(b, attr), expected)


 def test_has_file_attrs(tmpdir):
--- a/core/tests/ignore_test.py
+++ b/core/tests/ignore_test.py
@ -10,60 +10,60 @@ from xml.etree import ElementTree as ET
 from pytest import raises
 from hscommon.testutil import eq_

-from ..ignore import IgnoreList
+from core.ignore import IgnoreList


 def test_empty():
    il = IgnoreList()
    eq_(0, len(il))
-    assert not il.AreIgnored("foo", "bar")
+    assert not il.are_ignored("foo", "bar")


 def test_simple():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    assert il.AreIgnored("foo", "bar")
-    assert il.AreIgnored("bar", "foo")
-    assert not il.AreIgnored("foo", "bleh")
-    assert not il.AreIgnored("bleh", "bar")
+    il.ignore("foo", "bar")
+    assert il.are_ignored("foo", "bar")
+    assert il.are_ignored("bar", "foo")
+    assert not il.are_ignored("foo", "bleh")
+    assert not il.are_ignored("bleh", "bar")
    eq_(1, len(il))


 def test_multiple():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    il.Ignore("foo", "bleh")
-    il.Ignore("bleh", "bar")
-    il.Ignore("aybabtu", "bleh")
-    assert il.AreIgnored("foo", "bar")
-    assert il.AreIgnored("bar", "foo")
-    assert il.AreIgnored("foo", "bleh")
-    assert il.AreIgnored("bleh", "bar")
-    assert not il.AreIgnored("aybabtu", "bar")
+    il.ignore("foo", "bar")
+    il.ignore("foo", "bleh")
+    il.ignore("bleh", "bar")
+    il.ignore("aybabtu", "bleh")
+    assert il.are_ignored("foo", "bar")
+    assert il.are_ignored("bar", "foo")
+    assert il.are_ignored("foo", "bleh")
+    assert il.are_ignored("bleh", "bar")
+    assert not il.are_ignored("aybabtu", "bar")
    eq_(4, len(il))


 def test_clear():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    il.Clear()
-    assert not il.AreIgnored("foo", "bar")
-    assert not il.AreIgnored("bar", "foo")
+    il.ignore("foo", "bar")
+    il.clear()
+    assert not il.are_ignored("foo", "bar")
+    assert not il.are_ignored("bar", "foo")
    eq_(0, len(il))


 def test_add_same_twice():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    il.Ignore("bar", "foo")
+    il.ignore("foo", "bar")
+    il.ignore("bar", "foo")
    eq_(1, len(il))


 def test_save_to_xml():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    il.Ignore("foo", "bleh")
-    il.Ignore("bleh", "bar")
+    il.ignore("foo", "bar")
+    il.ignore("foo", "bleh")
+    il.ignore("bleh", "bar")
    f = io.BytesIO()
    il.save_to_xml(f)
    f.seek(0)
@ -73,50 +73,46 @@ def test_save_to_xml():
    eq_(len(root), 2)
    eq_(len([c for c in root if c.tag == "file"]), 2)
    f1, f2 = root[:]
-    subchildren = [c for c in f1 if c.tag == "file"] + [
-        c for c in f2 if c.tag == "file"
-    ]
+    subchildren = [c for c in f1 if c.tag == "file"] + [c for c in f2 if c.tag == "file"]
    eq_(len(subchildren), 3)


-def test_SaveThenLoad():
+def test_save_then_load():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    il.Ignore("foo", "bleh")
-    il.Ignore("bleh", "bar")
-    il.Ignore("\u00e9", "bar")
+    il.ignore("foo", "bar")
+    il.ignore("foo", "bleh")
+    il.ignore("bleh", "bar")
+    il.ignore("\u00e9", "bar")
    f = io.BytesIO()
    il.save_to_xml(f)
    f.seek(0)
    il = IgnoreList()
    il.load_from_xml(f)
    eq_(4, len(il))
-    assert il.AreIgnored("\u00e9", "bar")
+    assert il.are_ignored("\u00e9", "bar")


-def test_LoadXML_with_empty_file_tags():
+def test_load_xml_with_empty_file_tags():
    f = io.BytesIO()
-    f.write(
-        b'<?xml version="1.0" encoding="utf-8"?><ignore_list><file><file/></file></ignore_list>'
-    )
+    f.write(b'<?xml version="1.0" encoding="utf-8"?><ignore_list><file><file/></file></ignore_list>')
    f.seek(0)
    il = IgnoreList()
    il.load_from_xml(f)
    eq_(0, len(il))


-def test_AreIgnore_works_when_a_child_is_a_key_somewhere_else():
+def test_are_ignore_works_when_a_child_is_a_key_somewhere_else():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    il.Ignore("bar", "baz")
-    assert il.AreIgnored("bar", "foo")
+    il.ignore("foo", "bar")
+    il.ignore("bar", "baz")
+    assert il.are_ignored("bar", "foo")


 def test_no_dupes_when_a_child_is_a_key_somewhere_else():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    il.Ignore("bar", "baz")
-    il.Ignore("bar", "foo")
+    il.ignore("foo", "bar")
+    il.ignore("bar", "baz")
+    il.ignore("bar", "foo")
    eq_(2, len(il))


@ -125,7 +121,7 @@ def test_iterate():
    il = IgnoreList()
    expected = [("foo", "bar"), ("bar", "baz"), ("foo", "baz")]
    for i in expected:
-        il.Ignore(i[0], i[1])
+        il.ignore(i[0], i[1])
    for i in il:
        expected.remove(i)  # No exception should be raised
    assert not expected  # expected should be empty
@ -133,18 +129,18 @@ def test_iterate():

 def test_filter():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    il.Ignore("bar", "baz")
-    il.Ignore("foo", "baz")
-    il.Filter(lambda f, s: f == "bar")
+    il.ignore("foo", "bar")
+    il.ignore("bar", "baz")
+    il.ignore("foo", "baz")
+    il.filter(lambda f, s: f == "bar")
    eq_(1, len(il))
-    assert not il.AreIgnored("foo", "bar")
-    assert il.AreIgnored("bar", "baz")
+    assert not il.are_ignored("foo", "bar")
+    assert il.are_ignored("bar", "baz")


 def test_save_with_non_ascii_items():
    il = IgnoreList()
-    il.Ignore("\xac", "\xbf")
+    il.ignore("\xac", "\xbf")
    f = io.BytesIO()
    try:
        il.save_to_xml(f)
@ -155,29 +151,29 @@ def test_save_with_non_ascii_items():
 def test_len():
    il = IgnoreList()
    eq_(0, len(il))
-    il.Ignore("foo", "bar")
+    il.ignore("foo", "bar")
    eq_(1, len(il))


 def test_nonzero():
    il = IgnoreList()
    assert not il
-    il.Ignore("foo", "bar")
+    il.ignore("foo", "bar")
    assert il


 def test_remove():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    il.Ignore("foo", "baz")
+    il.ignore("foo", "bar")
+    il.ignore("foo", "baz")
    il.remove("bar", "foo")
    eq_(len(il), 1)
-    assert not il.AreIgnored("foo", "bar")
+    assert not il.are_ignored("foo", "bar")


 def test_remove_non_existant():
    il = IgnoreList()
-    il.Ignore("foo", "bar")
-    il.Ignore("foo", "baz")
+    il.ignore("foo", "bar")
+    il.ignore("foo", "baz")
    with raises(ValueError):
        il.remove("foo", "bleh")
--- a/core/tests/markable_test.py
+++ b/core/tests/markable_test.py
@ -6,7 +6,7 @@

 from hscommon.testutil import eq_

-from ..markable import MarkableList, Markable
+from core.markable import MarkableList, Markable


 def gen():
--- a/core/tests/prioritize_test.py
+++ b/core/tests/prioritize_test.py
@ -9,8 +9,8 @@
 import os.path as op
 from itertools import combinations

-from .base import TestApp, NamedObject, with_app, eq_
-from ..engine import Group, Match
+from core.tests.base import TestApp, NamedObject, with_app, eq_
+from core.engine import Group, Match

 no = NamedObject

--- a/core/tests/result_table_test.py
+++ b/core/tests/result_table_test.py
@ -6,7 +6,7 @@
 # which should be included with this package. The terms are also available at
 # http://www.gnu.org/licenses/gpl-3.0.html

-from .base import TestApp, GetTestGroups
+from core.tests.base import TestApp, GetTestGroups


 def app_with_results():
--- a/core/tests/results_test.py
+++ b/core/tests/results_test.py
@ -12,10 +12,9 @@ from xml.etree import ElementTree as ET
 from pytest import raises
 from hscommon.testutil import eq_
 from hscommon.util import first
-
-from .. import engine
-from .base import NamedObject, GetTestGroups, DupeGuru
-from ..results import Results
+from core import engine
+from core.tests.base import NamedObject, GetTestGroups, DupeGuru
+from core.results import Results


 class TestCaseResultsEmpty:
@ -117,9 +116,7 @@ class TestCaseResultsWithSomeGroups:
        assert d is g.ref

    def test_sort_groups(self):
-        self.results.make_ref(
-            self.objects[1]
-        )  # We want to make the 1024 sized object to go ref.
+        self.results.make_ref(self.objects[1])  # We want to make the 1024 sized object to go ref.
        g1, g2 = self.groups
        self.results.sort_groups("size")
        assert self.results.groups[0] is g2
@ -129,9 +126,7 @@ class TestCaseResultsWithSomeGroups:
        assert self.results.groups[1] is g2

    def test_set_groups_when_sorted(self):
-        self.results.make_ref(
-            self.objects[1]
-        )  # We want to make the 1024 sized object to go ref.
+        self.results.make_ref(self.objects[1])  # We want to make the 1024 sized object to go ref.
        self.results.sort_groups("size")
        objects, matches, groups = GetTestGroups()
        g1, g2 = groups
@ -341,7 +336,7 @@ class TestCaseResultsMarkings:
        def log_object(o):
            log.append(o)
            if o is self.objects[1]:
-                raise EnvironmentError("foobar")
+                raise OSError("foobar")

        log = []
        self.results.mark_all()
@ -406,7 +401,7 @@ class TestCaseResultsMarkings:
        self.results.make_ref(d)
        eq_("0 / 3 (0.00 B / 3.00 B) duplicates marked.", self.results.stat_line)

-    def test_SaveXML(self):
+    def test_save_xml(self):
        self.results.mark(self.objects[1])
        self.results.mark_invert()
        f = io.BytesIO()
@ -423,7 +418,7 @@ class TestCaseResultsMarkings:
        eq_("n", d1.get("marked"))
        eq_("y", d2.get("marked"))

-    def test_LoadXML(self):
+    def test_load_xml(self):
        def get_file(path):
            return [f for f in self.objects if str(f.path) == path][0]

@ -451,7 +446,7 @@ class TestCaseResultsXML:
        self.results.groups = self.groups

    def get_file(self, path):  # use this as a callback for load_from_xml
-        return [o for o in self.objects if o.path == path][0]
+        return [o for o in self.objects if str(o.path) == path][0]

    def test_save_to_xml(self):
        self.objects[0].is_ref = True
@ -468,7 +463,7 @@ class TestCaseResultsXML:
        eq_(6, len(g1))
        eq_(3, len([c for c in g1 if c.tag == "file"]))
        eq_(3, len([c for c in g1 if c.tag == "match"]))
-        d1, d2, d3 = [c for c in g1 if c.tag == "file"]
+        d1, d2, d3 = (c for c in g1 if c.tag == "file")
        eq_(op.join("basepath", "foo bar"), d1.get("path"))
        eq_(op.join("basepath", "bar bleh"), d2.get("path"))
        eq_(op.join("basepath", "foo bleh"), d3.get("path"))
@ -481,7 +476,7 @@ class TestCaseResultsXML:
        eq_(3, len(g2))
        eq_(2, len([c for c in g2 if c.tag == "file"]))
        eq_(1, len([c for c in g2 if c.tag == "match"]))
-        d1, d2 = [c for c in g2 if c.tag == "file"]
+        d1, d2 = (c for c in g2 if c.tag == "file")
        eq_(op.join("basepath", "ibabtu"), d1.get("path"))
        eq_(op.join("basepath", "ibabtu"), d2.get("path"))
        eq_("n", d1.get("is_ref"))
@ -489,7 +484,7 @@ class TestCaseResultsXML:
        eq_("ibabtu", d1.get("words"))
        eq_("ibabtu", d2.get("words"))

-    def test_LoadXML(self):
+    def test_load_xml(self):
        def get_file(path):
            return [f for f in self.objects if str(f.path) == path][0]

@ -521,7 +516,7 @@ class TestCaseResultsXML:
        eq_(["ibabtu"], g2[0].words)
        eq_(["ibabtu"], g2[1].words)

-    def test_LoadXML_with_filename(self, tmpdir):
+    def test_load_xml_with_filename(self, tmpdir):
        def get_file(path):
            return [f for f in self.objects if str(f.path) == path][0]

@ -533,7 +528,7 @@ class TestCaseResultsXML:
        r.load_from_xml(filename, get_file)
        eq_(2, len(r.groups))

-    def test_LoadXML_with_some_files_that_dont_exist_anymore(self):
+    def test_load_xml_with_some_files_that_dont_exist_anymore(self):
        def get_file(path):
            if path.endswith("ibabtu 2"):
                return None
@ -549,7 +544,7 @@ class TestCaseResultsXML:
        eq_(1, len(r.groups))
        eq_(3, len(r.groups[0]))

-    def test_LoadXML_missing_attributes_and_bogus_elements(self):
+    def test_load_xml_missing_attributes_and_bogus_elements(self):
        def get_file(path):
            return [f for f in self.objects if str(f.path) == path][0]

@ -601,9 +596,7 @@ class TestCaseResultsXML:
        matches = engine.getmatches(objects)  # we should have 5 matches
        groups = engine.get_groups(matches)  # We should have 2 groups
        for g in groups:
-            g.prioritize(
-                lambda x: objects.index(x)
-            )  # We want the dupes to be in the same order as the list is
+            g.prioritize(lambda x: objects.index(x))  # We want the dupes to be in the same order as the list is
        app = DupeGuru()
        results = Results(app)
        results.groups = groups
@ -807,9 +800,7 @@ class TestCaseResultsFilter:
        # Now the stats should display *2* markable dupes (instead of 1)
        expected = "0 / 2 (0.00 B / 2.00 B) duplicates marked. filter: foo"
        eq_(expected, self.results.stat_line)
-        self.results.apply_filter(
-            None
-        )  # Now let's make sure our unfiltered results aren't fucked up
+        self.results.apply_filter(None)  # Now let's make sure our unfiltered results aren't fucked up
        expected = "0 / 3 (0.00 B / 3.00 B) duplicates marked."
        eq_(expected, self.results.stat_line)

--- a/core/tests/scanner_test.py
+++ b/core/tests/scanner_test.py
@ -7,29 +7,33 @@
 import pytest

 from hscommon.jobprogress import job
-from hscommon.path import Path
+from pathlib import Path
 from hscommon.testutil import eq_

-from .. import fs
-from ..engine import getwords, Match
-from ..ignore import IgnoreList
-from ..scanner import Scanner, ScanType
-from ..me.scanner import ScannerME
+from core import fs
+from core.engine import getwords, Match
+from core.ignore import IgnoreList
+from core.scanner import Scanner, ScanType
+from core.me.scanner import ScannerME


+# TODO update this to be able to inherit from fs.File
 class NamedObject:
    def __init__(self, name="foobar", size=1, path=None):
        if path is None:
            path = Path(name)
        else:
-            path = Path(path)[name]
+            path = Path(path, name)
        self.name = name
        self.size = size
        self.path = path
        self.words = getwords(name)

    def __repr__(self):
-        return "<NamedObject %r %r>" % (self.name, self.path)
+        return "<NamedObject {!r} {!r}>".format(self.name, self.path)
+
+    def exists(self):
+        return self.path.exists()


 no = NamedObject
@ -52,10 +56,13 @@ def test_empty(fake_fileexists):
 def test_default_settings(fake_fileexists):
    s = Scanner()
    eq_(s.min_match_percentage, 80)
-    eq_(s.scan_type, ScanType.Filename)
+    eq_(s.scan_type, ScanType.FILENAME)
    eq_(s.mix_file_kind, True)
    eq_(s.word_weighting, False)
    eq_(s.match_similar_words, False)
+    eq_(s.size_threshold, 0)
+    eq_(s.large_size_threshold, 0)
+    eq_(s.big_file_size_threshold, 0)


 def test_simple_with_default_settings(fake_fileexists):
@ -97,7 +104,7 @@ def test_trim_all_ref_groups(fake_fileexists):
    eq_(s.discarded_file_count, 0)


-def test_priorize(fake_fileexists):
+def test_prioritize(fake_fileexists):
    s = Scanner()
    f = [
        no("foo", path="p1"),
@ -118,36 +125,109 @@ def test_priorize(fake_fileexists):

 def test_content_scan(fake_fileexists):
+    # Files 0 and 1 share every digest and must form the only group; file 2 differs in all of them.
    s = Scanner()
-    s.scan_type = ScanType.Contents
+    s.scan_type = ScanType.CONTENTS
    f = [no("foo"), no("bar"), no("bleh")]
-    f[0].md5 = f[0].md5partial = "foobar"
-    f[1].md5 = f[1].md5partial = "foobar"
-    f[2].md5 = f[2].md5partial = "bleh"
+    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
+    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
+    f[2].digest = f[2].digest_partial = f[2].digest_samples = "bleh"
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
    eq_(len(r[0]), 2)
-    eq_(s.discarded_file_count, 0)  # don't count the different md5 as discarded!
+    eq_(s.discarded_file_count, 0)  # don't count the different digest as discarded!


 def test_content_scan_compare_sizes_first(fake_fileexists):
    class MyFile(no):
        @property
-        def md5(file):
+        def digest(self):
            raise AssertionError()

    s = Scanner()
-    s.scan_type = ScanType.Contents
+    s.scan_type = ScanType.CONTENTS
    f = [MyFile("foo", 1), MyFile("bar", 2)]
    eq_(len(s.get_dupe_groups(f)), 0)


+def test_ignore_file_size(fake_fileexists):
+    # Four pairs of identical files (by digest) sit just inside and just outside the size limits.
+    # A threshold of 0 disables that filter; each enabled filter must drop exactly one pair.
+    s = Scanner()
+    s.scan_type = ScanType.CONTENTS
+    small_size = 10  # 10 bytes (same unit as large_size below)
+    s.size_threshold = 0
+    large_size = 100 * 1024 * 1024  # 100MB
+    s.large_size_threshold = 0
+    f = [
+        no("smallignore1", small_size - 1),
+        no("smallignore2", small_size - 1),
+        no("small1", small_size),
+        no("small2", small_size),
+        no("large1", large_size),
+        no("large2", large_size),
+        no("largeignore1", large_size + 1),
+        no("largeignore2", large_size + 1),
+    ]
+    f[0].digest = f[0].digest_partial = f[0].digest_samples = "smallignore"
+    f[1].digest = f[1].digest_partial = f[1].digest_samples = "smallignore"
+    f[2].digest = f[2].digest_partial = f[2].digest_samples = "small"
+    f[3].digest = f[3].digest_partial = f[3].digest_samples = "small"
+    f[4].digest = f[4].digest_partial = f[4].digest_samples = "large"
+    f[5].digest = f[5].digest_partial = f[5].digest_samples = "large"
+    f[6].digest = f[6].digest_partial = f[6].digest_samples = "largeignore"
+    f[7].digest = f[7].digest_partial = f[7].digest_samples = "largeignore"
+
+    r = s.get_dupe_groups(f)
+    # No ignores
+    eq_(len(r), 4)
+    # Ignore smaller
+    s.size_threshold = small_size
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 3)
+    # Ignore larger
+    s.size_threshold = 0
+    s.large_size_threshold = large_size
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 3)
+    # Ignore both
+    s.size_threshold = small_size
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 2)
+
+
+def test_big_file_partial_hashes(fake_fileexists):
+    # Checks which digest decides a match for files at the big-file threshold: the sampled
+    # digest when the threshold is set, the full digest when it is 0 (optimization disabled).
+    s = Scanner()
+    s.scan_type = ScanType.CONTENTS
+
+    smallsize = 1
+    bigsize = 100 * 1024 * 1024  # 100MB
+    s.big_file_size_threshold = bigsize
+
+    f = [no("bigfoo", bigsize), no("bigbar", bigsize), no("smallfoo", smallsize), no("smallbar", smallsize)]
+    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
+    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
+    f[2].digest = f[2].digest_partial = "bleh"
+    f[3].digest = f[3].digest_partial = "bleh"
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 2)
+
+    # digest_partial is still the same, but the file is actually different
+    f[1].digest = f[1].digest_samples = "difffoobar"
+    # here we compare the full digests, as the user disabled the optimization
+    s.big_file_size_threshold = 0
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 1)
+
+    # here we should compare the digest_samples, and see they are different
+    s.big_file_size_threshold = bigsize
+    r = s.get_dupe_groups(f)
+    eq_(len(r), 1)
+
+
 def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.Contents
+    s.scan_type = ScanType.CONTENTS
    f = [no("foo"), no("bar"), no("bleh")]
-    f[0].md5 = f[0].md5partial = "foobar"
-    f[1].md5 = f[1].md5partial = "foobar"
-    f[2].md5 = f[2].md5partial = "bleh"
+    f[0].digest = f[0].digest_partial = f[0].digest_samples = "foobar"
+    f[1].digest = f[1].digest_partial = f[1].digest_samples = "foobar"
+    f[2].digest = f[2].digest_partial = f[2].digest_samples = "bleh"
    s.min_match_percentage = 101
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
@ -158,17 +238,18 @@ def test_min_match_perc_doesnt_matter_for_content_scan(fake_fileexists):
    eq_(len(r[0]), 2)


-def test_content_scan_doesnt_put_md5_in_words_at_the_end(fake_fileexists):
+def test_content_scan_doesnt_put_digest_in_words_at_the_end(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.Contents
+    s.scan_type = ScanType.CONTENTS
    f = [no("foo"), no("bar")]
-    f[0].md5 = f[
-        0
-    ].md5partial = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
-    f[1].md5 = f[
-        1
-    ].md5partial = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    f[0].digest = f[0].digest_partial = f[0].digest_samples = (
+        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    )
+    f[1].digest = f[1].digest_partial = f[1].digest_samples = (
+        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+    )
    r = s.get_dupe_groups(f)
+    # FIXME looks like we are missing something here?
    r[0]


@ -229,7 +310,7 @@ def test_similar_words(fake_fileexists):

 def test_fields(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.Fields
+    s.scan_type = ScanType.FIELDS
    f = [no("The White Stripes - Little Ghost"), no("The White Stripes - Little Acorn")]
    r = s.get_dupe_groups(f)
    eq_(len(r), 0)
@ -237,7 +318,7 @@ def test_fields(fake_fileexists):

 def test_fields_no_order(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.FieldsNoOrder
+    s.scan_type = ScanType.FIELDSNOORDER
    f = [no("The White Stripes - Little Ghost"), no("Little Ghost - The White Stripes")]
    r = s.get_dupe_groups(f)
    eq_(len(r), 1)
@ -245,7 +326,7 @@ def test_fields_no_order(fake_fileexists):

 def test_tag_scan(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.Tag
+    s.scan_type = ScanType.TAG
    o1 = no("foo")
    o2 = no("bar")
    o1.artist = "The White Stripes"
@ -258,8 +339,8 @@ def test_tag_scan(fake_fileexists):

 def test_tag_with_album_scan(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.Tag
-    s.scanned_tags = set(["artist", "album", "title"])
+    s.scan_type = ScanType.TAG
+    s.scanned_tags = {"artist", "album", "title"}
    o1 = no("foo")
    o2 = no("bar")
    o3 = no("bleh")
@ -278,8 +359,8 @@ def test_tag_with_album_scan(fake_fileexists):

 def test_that_dash_in_tags_dont_create_new_fields(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.Tag
-    s.scanned_tags = set(["artist", "album", "title"])
+    s.scan_type = ScanType.TAG
+    s.scanned_tags = {"artist", "album", "title"}
    s.min_match_percentage = 50
    o1 = no("foo")
    o2 = no("bar")
@ -295,8 +376,8 @@ def test_that_dash_in_tags_dont_create_new_fields(fake_fileexists):

 def test_tag_scan_with_different_scanned(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.Tag
-    s.scanned_tags = set(["track", "year"])
+    s.scan_type = ScanType.TAG
+    s.scanned_tags = {"track", "year"}
    o1 = no("foo")
    o2 = no("bar")
    o1.artist = "The White Stripes"
@ -313,8 +394,8 @@ def test_tag_scan_with_different_scanned(fake_fileexists):

 def test_tag_scan_only_scans_existing_tags(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.Tag
-    s.scanned_tags = set(["artist", "foo"])
+    s.scan_type = ScanType.TAG
+    s.scanned_tags = {"artist", "foo"}
    o1 = no("foo")
    o2 = no("bar")
    o1.artist = "The White Stripes"
@ -327,8 +408,8 @@ def test_tag_scan_only_scans_existing_tags(fake_fileexists):

 def test_tag_scan_converts_to_str(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.Tag
-    s.scanned_tags = set(["track"])
+    s.scan_type = ScanType.TAG
+    s.scanned_tags = {"track"}
    o1 = no("foo")
    o2 = no("bar")
    o1.track = 42
@ -342,8 +423,8 @@ def test_tag_scan_converts_to_str(fake_fileexists):

 def test_tag_scan_non_ascii(fake_fileexists):
    s = Scanner()
-    s.scan_type = ScanType.Tag
-    s.scanned_tags = set(["title"])
+    s.scan_type = ScanType.TAG
+    s.scanned_tags = {"title"}
    o1 = no("foo")
    o2 = no("bar")
    o1.title = "foobar\u00e9"
@ -364,8 +445,8 @@ def test_ignore_list(fake_fileexists):
    f2.path = Path("dir2/foobar")
    f3.path = Path("dir3/foobar")
    ignore_list = IgnoreList()
-    ignore_list.Ignore(str(f1.path), str(f2.path))
-    ignore_list.Ignore(str(f1.path), str(f3.path))
+    ignore_list.ignore(str(f1.path), str(f2.path))
+    ignore_list.ignore(str(f1.path), str(f3.path))
    r = s.get_dupe_groups([f1, f2, f3], ignore_list=ignore_list)
    eq_(len(r), 1)
    g = r[0]
@ -388,8 +469,8 @@ def test_ignore_list_checks_for_unicode(fake_fileexists):
    f2.path = Path("foo2\u00e9")
    f3.path = Path("foo3\u00e9")
    ignore_list = IgnoreList()
-    ignore_list.Ignore(str(f1.path), str(f2.path))
-    ignore_list.Ignore(str(f1.path), str(f3.path))
+    ignore_list.ignore(str(f1.path), str(f2.path))
+    ignore_list.ignore(str(f1.path), str(f3.path))
    r = s.get_dupe_groups([f1, f2, f3], ignore_list=ignore_list)
    eq_(len(r), 1)
    g = r[0]
@ -493,14 +574,16 @@ def test_dont_group_files_that_dont_exist(tmpdir):
    # In this test, we have to delete one of the files between the get_matches() part and the
    # get_groups() part.
    s = Scanner()
-    s.scan_type = ScanType.Contents
+    s.scan_type = ScanType.CONTENTS
    p = Path(str(tmpdir))
-    p["file1"].open("w").write("foo")
-    p["file2"].open("w").write("foo")
+    with p.joinpath("file1").open("w") as fp:
+        fp.write("foo")
+    with p.joinpath("file2").open("w") as fp:
+        fp.write("foo")
    file1, file2 = fs.get_files(p)

    def getmatches(*args, **kw):
-        file2.path.remove()
+        file2.path.unlink()
        return [Match(file1, file2, 100)]

    s._getmatches = getmatches
@ -512,23 +595,23 @@ def test_folder_scan_exclude_subfolder_matches(fake_fileexists):
    # when doing a Folders scan type, don't include matches for folders whose parent folder already
    # match.
    s = Scanner()
-    s.scan_type = ScanType.Folders
+    s.scan_type = ScanType.FOLDERS
    topf1 = no("top folder 1", size=42)
-    topf1.md5 = topf1.md5partial = b"some_md5_1"
+    topf1.digest = topf1.digest_partial = topf1.digest_samples = b"some_digest__1"
    topf1.path = Path("/topf1")
    topf2 = no("top folder 2", size=42)
-    topf2.md5 = topf2.md5partial = b"some_md5_1"
+    topf2.digest = topf2.digest_partial = topf2.digest_samples = b"some_digest__1"
    topf2.path = Path("/topf2")
    subf1 = no("sub folder 1", size=41)
-    subf1.md5 = subf1.md5partial = b"some_md5_2"
+    subf1.digest = subf1.digest_partial = subf1.digest_samples = b"some_digest__2"
    subf1.path = Path("/topf1/sub")
    subf2 = no("sub folder 2", size=41)
-    subf2.md5 = subf2.md5partial = b"some_md5_2"
+    subf2.digest = subf2.digest_partial = subf2.digest_samples = b"some_digest__2"
    subf2.path = Path("/topf2/sub")
    eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2])), 1)  # only top folders
    # however, if another folder matches a subfolder, keep it in the matches
    otherf = no("other folder", size=41)
-    otherf.md5 = otherf.md5partial = b"some_md5_2"
+    otherf.digest = otherf.digest_partial = otherf.digest_samples = b"some_digest__2"
    otherf.path = Path("/otherfolder")
    eq_(len(s.get_dupe_groups([topf1, topf2, subf1, subf2, otherf])), 2)

@ -547,21 +630,21 @@ def test_dont_count_ref_files_as_discarded(fake_fileexists):
    # However, this causes problems in "discarded" counting and we make sure here that we don't
    # report discarded matches in exact duplicate scans.
    s = Scanner()
-    s.scan_type = ScanType.Contents
+    s.scan_type = ScanType.CONTENTS
    o1 = no("foo", path="p1")
    o2 = no("foo", path="p2")
    o3 = no("foo", path="p3")
-    o1.md5 = o1.md5partial = "foobar"
-    o2.md5 = o2.md5partial = "foobar"
-    o3.md5 = o3.md5partial = "foobar"
+    o1.digest = o1.digest_partial = o1.digest_samples = "foobar"
+    o2.digest = o2.digest_partial = o2.digest_samples = "foobar"
+    o3.digest = o3.digest_partial = o3.digest_samples = "foobar"
    o1.is_ref = True
    o2.is_ref = True
    eq_(len(s.get_dupe_groups([o1, o2, o3])), 1)
    eq_(s.discarded_file_count, 0)


-def test_priorize_me(fake_fileexists):
-    # in ScannerME, bitrate goes first (right after is_ref) in priorization
+def test_prioritize_me(fake_fileexists):
+    # in ScannerME, bitrate goes first (right after is_ref) in prioritization
    s = ScannerME()
    o1, o2 = no("foo", path="p1"), no("foo", path="p2")
    o1.bitrate = 1
--- a/core/util.py
+++ b/core/util.py
@ -5,6 +5,14 @@
 # http://www.gnu.org/licenses/gpl-3.0.html

 import time
+import sys
+import os
+import urllib.request
+import urllib.error
+import json
+import semantic_version
+import logging
+from typing import Union

 from hscommon.util import format_time_decimal

@ -58,3 +66,38 @@ def fix_surrogate_encoding(s, encoding="utf-8"):
        return s.encode(encoding, "replace").decode(encoding)
    else:
        return s
+
+
+def executable_folder():
+    """Return the absolute path of the directory containing the program named by ``sys.argv[0]``."""
+    return os.path.dirname(os.path.abspath(sys.argv[0]))
+
+
+def check_for_update(current_version: str, include_prerelease: bool = False) -> Union[None, dict]:
+    """Look up the GitHub releases of dupeGuru for a version newer than ``current_version``.
+
+    :param current_version: semantic version string of the running application.
+    :param include_prerelease: when True, pre-release versions are also considered.
+    :returns: ``{"version": <Version>, "url": <release page url>}`` for the highest release that
+        is newer than ``current_version``, or ``None`` when there is none or the lookup failed
+        (network, HTTP status and JSON errors are logged, not raised).
+    """
+    request = urllib.request.Request(
+        "https://api.github.com/repos/arsenetar/dupeguru/releases",
+        headers={"Accept": "application/vnd.github.v3+json"},
+    )
+    try:
+        with urllib.request.urlopen(request) as response:
+            if response.status != 200:
+                logging.warning(f"Error retrieving updates. Status: {response.status}")
+                return None
+            try:
+                response_json = json.loads(response.read())
+            except json.JSONDecodeError as ex:
+                logging.warning(f"Error parsing updates. {ex.msg}")
+                return None
+    except urllib.error.URLError as ex:
+        logging.warning(f"Error retrieving updates. {ex.reason}")
+        return None
+    new_version = semantic_version.Version(current_version)
+    new_url = None
+    for release in response_json:
+        try:
+            release_version = semantic_version.Version(release["name"])
+        except (KeyError, TypeError, ValueError):
+            # A release title that is missing or not a valid semantic version must not abort
+            # the whole check; skip that release and keep looking.
+            continue
+        if new_version < release_version and (include_prerelease or not release_version.prerelease):
+            new_version = release_version
+            new_url = release["html_url"]
+    if new_url is not None:
+        return {"version": new_version, "url": new_url}
+    else:
+        return None
--- a/help/changelog
+++ b/help/changelog
@ -1,6 +1,63 @@
+=== 4.3.1 (2022-07-08)
+* Fix issue where cache db exceptions could prevent files being hashed (#1015)
+* Add extra guard for non-zero length files without digests to prevent false duplicates
+* Update Italian translations
+
+=== 4.3.0 (2022-07-01)
+* Redirect stdout from custom command to the log files (#1008)
+* Update translations
+* Fix typo in debian control file (#989)
+* Add option to profile scans
+* Update fs.py to optimize stat() calls
+* Fix Error when delete after scan (#988)
+* Update directory scanning to use os.scandir() and DirEntry objects
+* Improve performance of Directories.get_state()
+* Migrate from hscommon.path to pathlib
+* Switch file hashing to xxhash with fallback to md5
+* Add update check feature to about box
+
+=== 4.2.1 (2022-03-25)
+* Default to English on unsupported system language (#976)
+* Fix image viewer zoom datatype issue (#978)
+* Fix errors from window change event (#937, #980)
+* Fix deprecation warning from SQLite
+* Enforce minimum Windows version in installer (#983)
+* Fix help path for local files
+* Drop python 3.6 support
+* VS Code project settings added, yaml validation for GitHub actions
+
+=== 4.2.0 (2022-01-24)
+
+* Add Malay and Turkish
+* Add dark style for windows (#900)
+* Add caching md5 file hashes (#942)
+* Add feature to partially hash large files, with user adjustable preference (#908)
+* Add portable mode (store settings next to executable)
+* Add file association for .dupeguru files on windows
+* Add ability to pass .dupeguru file to load on startup (#902)
+* Add ability to reveal in explorer/finder (#895)
+* Switch audio tag processing from hsaudiotag to mutagen (#440)
+* Add ability to use Qt dialogs instead of native OS dialogs for some file selection operations
+* Add OS and Python details to error dialog to assist in troubleshooting
+* Add preference to ignore large files with threshold (#430)
+* Fix error on close from DetailsPanel (#857, #873)
+* Change reference background color (#894, #898)
+* Remove stripping of unicode characters when matching names (#879)
+* Fix exception when deleting in delta view (#863, #905)
+* Fix dupes only view not updating after re-prioritize results (#757, #910, #911)
+* Fix ability to drag'n'drop file/folder with certain characters in name (#897)
+* Fix window position opening partially offscreen (#653)
+* Fix TypeError in photo mode (#551)
+* Change message for when files are deleted directly (#904)
+* Add more feedback during scan (#700)
+* Add Python version check to build.py (#589)
+* General code cleanups
+* Improvements to using standardized build tooling
+* Moved CI/CD to github actions, added codeql, SonarCloud
+
 === 4.1.1 (2021-03-21)

-* Add Japanese 
+* Add Japanese
 * Update internationalization and translations to be up to date with current UI.
 * Minor translation and UI language updates
 * Fix language selection issues on Windows (#760)
@ -362,7 +419,7 @@

 === 2.6.1 (2009-03-27)
 * **Fixed** an occasional crash caused by permission issues.
-* **Fixed** a bug where the "X discarded" notice would show a too large number of discarded 
+* **Fixed** a bug where the "X discarded" notice would show a too large number of discarded
      duplicates.

 === 2.6.0 (2008-09-10)
@ -396,14 +453,14 @@
 * **Added** the "Remove empty folders" option.
 * **Fixed** results load/save issues.
 * **Fixed** occasional status bar inaccuracies when the results are filtered.
-           
+

 === 2.5.0 (2007-09-15)

 * **Added** post scan filtering.
 * **Fixed** issues with the rename feature under Windows
 * **Fixed** some user interface annoyances under Windows
-                
+

 === 2.4.8 (2007-04-14)

@ -419,7 +476,7 @@

 * **Added** Re-orderable columns. In fact, I re-added the feature which was lost in the C# conversion in 2.4.0 (Windows).
 * **Changed** the behavior of the scanning engine when setting the hardness to 100. It will now only match files that have their words in the same order.
-* **Fixed** a bug with all the Delete/Move/Copy actions with certain kinds of files.            
+* **Fixed** a bug with all the Delete/Move/Copy actions with certain kinds of files.

 === 2.4.5 (2007-01-11)

@ -457,7 +514,7 @@

 === 2.3.4 (2006-11-07)

-* **Improved** speed and memory usage of the scanning engine, again. Does it mean there was a lot of improvements to be made? Nah...            
+* **Improved** speed and memory usage of the scanning engine, again. Does it mean there was a lot of improvements to be made? Nah...

 === 2.3.3 (2006-11-02)

@ -515,7 +572,7 @@
 === 2.2.3 (2006-06-15)

 * **Improved** duplicate scanning speed.
-* **Added** a warning that a file couldn't be renamed if a file with the same name already exists.            
+* **Added** a warning that a file couldn't be renamed if a file with the same name already exists.

 === 2.2.2 (2006-06-07)

@ -559,9 +616,9 @@

 === 2.0.0 (2006-03-17)

-* Complete rewrite.         
+* Complete rewrite.
 * Now runs on Mac OS X.

 === 1.0.0 (2004-09-24)

-* Initial release.     
+* Initial release.
--- a/help/de/faq.rst
+++ b/help/de/faq.rst
@ -1,7 +1,7 @@
 Häufig gestellte Fragen
 ==========================

-.. topic:: What is |appname|?
+.. topic:: What is dupeGuru?

    .. only:: edition_se

@ -25,7 +25,7 @@ Häufig gestellte Fragen

 .. topic:: Was sind die Demo-Einschränkungen von dupeGuru?

-    Keine, |appname| ist `Fairware <http://open.hardcoded.net/about/>`_.
+    Keine, dupeGuru ist `Fairware <http://open.hardcoded.net/about/>`_.

 .. topic:: Die Markierungsbox einer Datei, die ich löschen möchte, ist deaktiviert. Was muss ich tun?

@ -71,7 +71,7 @@ Häufig gestellte Fragen
        * Klicken Sie **Markieren --> Alle Markieren**.

 .. only:: edition_me
-    
+
    .. topic:: Ich möchte alle Stücke markieren, die mehr als 3 Sekunden von ihrer Referenz verschieden sind. Was kann ich tun?

        * Aktivieren Sie den :doc:`Nur Duplikate <results>` Modus.
@ -83,7 +83,7 @@ Häufig gestellte Fragen
        * Klicken Sie auf **Entferne Ausgewählte von den Ergebnissen**.

    .. topic:: Ich möchte meine Stücke mit der höchsten Bitrate zur Referenz machen. Was kann ich tun?
-    
+
        * Aktivieren Sie den :doc:`Nur Duplikate <results>` Modus.
        * Aktivieren Sie den **Deltawerte** Modus.
        * Klicken Sie auf die "Bitrate" Spalte, um nach Bitrate zu sortieren.
@ -92,9 +92,9 @@ Häufig gestellte Fragen
        * Klicken Sie auf **Mache Ausgewählte zur Referenz**.

    .. topic:: Ich möchte nicht das [live] und [remix] Versionen meiner Stücke als Duplikate erkannt werden. Was kann ich tun?
-    
+
        Ist Ihre Vergleichsschwelle niedrig genug, werden möglicherweise die live und remix Versionen in der Ergebnisliste landen. Das kann nicht verhindert werden, aber es gibt die Möglichkeit die Ergebnisse nach dem Scan zu entfernen, mittels dem Filter. Möchten Sie jedes Stück mit irgendetwas in eckigen Klammern [] im Dateinamen entfernen, so:
-    
+
        * **Windows**: Klicken Sie auf **Aktionen --> Filter anwenden**, geben "[*]" ein und klicken OK.
        * **Mac OS X**: Geben Sie "[*]" in das "Filter" Feld der Werkzeugleiste ein.
        * Klicken Sie auf **Markieren --> Alle Markieren**.
--- a/help/de/folders.rst
+++ b/help/de/folders.rst
@ -16,7 +16,7 @@ Jeder Ordner kann in einem von 3 Zuständen sein:
 * **Referenz:** Duplikate in diesem Ordner können **nicht** gelöscht werden. Dateien dieses Ordners können sich nur in der **Referenz** Position einer Duplikatgruppe befinden. Ist mehr als eine Datei des Referenzordners in derselben Duplikatgruppe, so wird nur Eine behalten. Die Anderen werden aus der Gruppe entfernt.
 * **Ausgeschlossen:** Dateien in diesem Verzeichnis sind nicht im Scan eingeschlossen.

-Der Standardzustand eines Ordners ist natürlich **Normal**. Sie können den **Referenz** Zustand für Ordner nutzen, in denen auf keinen Fall eine Datei gelöscht werden soll. 
+Der Standardzustand eines Ordners ist natürlich **Normal**. Sie können den **Referenz** Zustand für Ordner nutzen, in denen auf keinen Fall eine Datei gelöscht werden soll.

 Wenn sie einen Zustand für ein Verzeichnis setzen, erben alle Unterordner automatisch diesen Zustand, es sei denn Sie ändern den Zustand der Unterordner explizit.

--- a/help/de/index.rst
+++ b/help/de/index.rst
@ -1,21 +1,13 @@
-|appname| Hilfe
+dupeGuru Hilfe
 ===============

 .. only:: edition_se

-    Dieses Dokument ist auch auf `Englisch <http://www.hardcoded.net/dupeguru/help/en/>`__ und `Französisch <http://www.hardcoded.net/dupeguru/help/fr/>`__ verfügbar.
-
-.. only:: edition_me
-
-    Dieses Dokument ist auch auf `Englisch <http://www.hardcoded.net/dupeguru/help/en/>`__ und `Französisch <http://www.hardcoded.net/dupeguru_me/help/fr/>`__ verfügbar.
-
-.. only:: edition_pe
-
-    Dieses Dokument ist auch auf `Englisch <http://www.hardcoded.net/dupeguru/help/en/>`__ und `Französisch <http://www.hardcoded.net/dupeguru_pe/help/fr/>`__ verfügbar.
+    Dieses Dokument ist auch auf `Englisch <http://dupeguru.voltaicideas.net/help/en/>`__ und `Französisch <http://dupeguru.voltaicideas.net/help/fr/>`__ verfügbar.

 .. only:: edition_se or edition_me

-    |appname| ist ein Tool zum Auffinden von Duplikaten auf Ihrem Computer. Es kann entweder Dateinamen oder Inhalte scannen. Der Dateiname-Scan stellt einen lockeren Suchalgorithmus zur Verfügung, der sogar Duplikate findet, die nicht den exakten selben Namen haben.
+    dupeGuru ist ein Tool zum Auffinden von Duplikaten auf Ihrem Computer. Es kann entweder Dateinamen oder Inhalte scannen. Der Dateiname-Scan stellt einen lockeren Suchalgorithmus zur Verfügung, der sogar Duplikate findet, die nicht den exakten selben Namen haben.

 .. only:: edition_pe

@ -23,13 +15,13 @@

 Obwohl dupeGuru auch leicht ohne Dokumentation genutzt werden kann, ist es sinnvoll die Hilfe zu lesen. Wenn Sie nach einer Führung für den ersten Duplikatscan suchen, werfen Sie einen Blick auf die :doc:`Schnellstart <quick_start>` Sektion

-Es ist eine gute Idee |appname| aktuell zu halten. Sie können die neueste Version auf der `homepage`_ finden.
+Es ist eine gute Idee, dupeGuru aktuell zu halten. Sie können die neueste Version auf der Homepage http://dupeguru.voltaicideas.net finden.

 Inhalte:

 .. toctree::
    :maxdepth: 2
-    
+
    quick_start
    folders
    preferences
--- a/help/de/preferences.rst
+++ b/help/de/preferences.rst
@ -4,9 +4,9 @@ Einstellungen
 .. only:: edition_se

    **Scan Typ:** Diese Option bestimmt nach welcher Eigenschaft die Dateien in einem Duplikate Scan verglichen werden. Wenn Sie **Dateiname** auswählen, wird dupeGuru jeden Dateinamen Wort für Wort vergleichen und, abhängig von den unteren Einstellungen, feststellen ob genügend Wörter übereinstimmen, um 2 Dateien als Duplikate zu betrachten. Wenn Sie **Inhalt** wählen, werden nur Dateien mit dem exakt gleichen Inhalt zusammenpassen.
-    
+
    Der **Ordner** Scan Typ ist etwas speziell. Wird er ausgewählt, scannt dupeGuru nach doppelten Ordnern anstelle von Dateien. Um festzustellen ob 2 Ordner identisch sind, werden alle Datein im Ordner gescannt und wenn die Inhalte aller Dateien der Ordner übereinstimmen, werden die Ordner als Duplikate erkannt.
-    
+
    **Filterempfindlichkeit:** Wenn Sie den **Dateiname** Scan Typ wählen, bestimmt diese Option wie ähnlich 2 Dateinamen für dupeGuru sein müssen, um Duplikate zu sein. Ist die Empfindlichkeit zum Beispiel 80, müssen 80% der Worte der 2 Dateinamen übereinstimmen. Um den Übereinstimmungsanteil herauszufinden, zählt dupeGuru zuerst die Gesamtzahl der Wörter **beider** Dateinamen, dann werden die gleichen Wörter gezählt (jedes Wort zählt als 2) und durch die Gesamtzahl der Wörter dividiert. Ist das Resultat größer oder gleich der Filterempfindlichkeit, haben wir ein Duplikat. Zum Beispiel, "a b c d" und "c d e" haben einen Übereinstimmungsanteil von 57 (4 gleiche Wörter, insgesamt 7 Wörter).

 .. only:: edition_me
@ -33,7 +33,7 @@ Einstellungen
 .. only:: edition_pe

    **Scan Typ:** Diese option bestimmt, welcher Scan Typ bei Ihren Bildern angewendet wird. Der **Inhalte** Scan Typ vergleicht den Inhalt der Bilder auf eine ungenaue Art und Weise (so werden nicht nur exakte Duplikate gefunden, sondern auch Ähnliche). Der **EXIF Zeitstempel** Scan Typ schaut auf die EXIF Metadaten der Bilder (wenn vorhanden) und erkennt Bilder die den Selben haben. Er ist viel schneller als der Inhalte Scan. **Warnung:** Veränderte Bilder behalten oft den selben EXIF Zeitstempel, also achten Sie auf Falschpositive bei der Nutzung dieses Scans.
-    
+
    **Filterempfindlichkeit:** *Nur Inhalte Scan.* Je höher diese Einstellung, desto strenger ist der Filter (Mit anderen Worten, desto weniger Ergebnisse erhalten Sie). Die meisten Bilder der selben Qualität stimmen zu 100% überein, selbst wenn das Format anders ist (PNG und JPG zum Beispiel). Wie auch immer, wenn ein PNG mit einem JPG niederiger Qualität übereinstimmen soll, muss die Filterempfindlichkeit kleiner als 100 sein. Die Voreinstellung, 95, ist eine gute Wahl.

    **Bilder unterschiedlicher Abmessung gleich:** Wird diese Box gewählt, dürfen Bilder unterschiedlicher Abmessung in einer Duplikategruppe sein..
@ -57,7 +57,7 @@ Auf jeden Fall behandelt dupeGuru Namenskonflikte indem es dem Ziel-Dateinamen e
 **Eigener Befehl:** Diese Einstellung bestimmt den Befehl der durch "Führe eigenen Befehl aus" ausgeführt wird. Sie können jede externe Anwendung durch diese Aktion aufrufen. Dies ist zum Beispiel hilfreich, wenn Sie eine gute diff-Anwendung installiert haben.

 Das Format des Befehls ist das Selbe wie in einer Befehlszeile, außer das 2 Platzhalter vorhanden sind: **%d** und **%r**. Diese Platzhalter werden durch den Pfad des markierten Duplikates (%d) und dem Pfad der Duplikatereferenz ersetzt (%r).
-  
+
 Wenn der Pfad Ihrer ausführbaren Datei Leerzeichen enthält, so schließen sie ihn bitte mit "" Zeichen ein. Sie sollten auch Platzhalter mit den Zitatzeichen einschließen, denn es ist möglich, das die Pfade der Duplikate und Referenzen ebenfalls Leerzeichen enthalten. Hier ist ein Beispiel eines eigenen Befehls::
-  
+
    "C:\Program Files\SuperDiffProg\SuperDiffProg.exe" "%d" "%r"
--- a/help/de/reprioritize.rst
+++ b/help/de/reprioritize.rst
@ -22,4 +22,4 @@ criterion is used and so on and so on. For example, if your arguments are "Size
 "Filename (Doesn't end with a number)", the reference file that will be picked in a group will be
 the biggest file, and if two or more files have the same size, the one that has a filename that
 doesn't end with a number will be used. When all criteria result in ties, the order in which dupes
-previously were in the group will be used.
+previously were in the group will be used.
--- a/help/de/results.rst
+++ b/help/de/results.rst
@ -98,4 +98,4 @@ Aktionen Menü
 * **Ausgewählte umbenennen:** Fragt nach einem neuen Namen und benennt die ausgewählte Datei um.

 .. todo:: Add Move and iPhoto/iTunes warning
-.. todo:: Add "Deletion Options" section.
+.. todo:: Add "Deletion Options" section.
--- a/help/en/contribute.rst
+++ b/help/en/contribute.rst
@ -12,7 +12,7 @@ a community around this project.

 So, whatever your skills, if you're interested in contributing to dupeGuru, please do so. Normally,
 this documentation should be enough to get you started, but if it isn't, then **please**,
-`let me know`_ because it's a problem that I'm committed to fix. If there's any situation where you'd
+open a discussion at https://github.com/arsenetar/dupeguru/discussions.  If there's any situation where you'd
 wish to contribute but some doubt you're having prevent you from going forward, please contact me.
 I'd much prefer to spend the time figuring out with you whether (and how) you can contribute than
 taking the chance of missing that opportunity.
@ -24,7 +24,7 @@ Development process
 * `Issue Tracker`_
 * `Issue labels meaning`_

-dupeGuru's source code is on Github and thus managed in a Git repository. At all times, you should
+dupeGuru's source code is on GitHub and thus managed in a Git repository. At all times, you should
 be able to build from source a fresh checkout of the ``master`` branch using instructions from the
 ``README.md`` file at the root of this project. If you can't, it's a bug. Please report it.

@ -61,7 +61,7 @@ It's the same thing with feature requests. Description of a feature request, whe
 already been given to how such a feature would fit in the current design, are precious to developers
 and help them figure out a clear roadmap for the project.

-So, even if you're not a developer, you can always open a Github account and create/comment issues.
+So, even if you're not a developer, you can always open a GitHub account and create/comment issues.
 Your contribution will be much appreciated.

 **Documentation**. This is a bit trickier because dupeGuru's documentation is written with a rather
@ -82,10 +82,9 @@ agree on what should be added to the documentation.
 dupeGuru. For more information about how to do that, you can refer to the `translator guide`_.

 .. _been open source: https://www.hardcoded.net/articles/free-as-in-speech-fair-as-in-trade
-.. _let me know: mailto:hsoft@hardcoded.net
-.. _Source code repository: https://github.com/hsoft/dupeguru
-.. _Issue Tracker: https://github.com/hsoft/dupeguru/issues
-.. _Issue labels meaning: https://github.com/hsoft/dupeguru/wiki/issue-labels
+.. _Source code repository: https://github.com/arsenetar/dupeguru
+.. _Issue Tracker: https://github.com/arsenetar/dupeguru/issues
+.. _Issue labels meaning: https://github.com/arsenetar/dupeguru/wiki/issue-labels
 .. _Sphinx: http://sphinx-doc.org/
 .. _reST: http://en.wikipedia.org/wiki/ReStructuredText
-.. _translator guide: https://github.com/hsoft/dupeguru/wiki/Translator-Guide
+.. _translator guide: https://github.com/arsenetar/dupeguru/wiki/Translator-Guide
--- a/help/en/developer/core/engine.rst
+++ b/help/en/developer/core/engine.rst
@ -2,12 +2,12 @@ core.engine
 ===========

 .. automodule:: core.engine
-    
+
    .. autoclass:: Match
-    
+
    .. autoclass:: Group
        :members:
-    
+
    .. autofunction:: build_word_dict
    .. autofunction:: compare
    .. autofunction:: compare_fields
@ -16,7 +16,7 @@ core.engine
    .. autofunction:: get_groups
    .. autofunction:: merge_similar_words
    .. autofunction:: reduce_common_words
-    
+
 .. _fields:

 Fields
--- a/help/en/developer/core/gui/index.rst
+++ b/help/en/developer/core/gui/index.rst
@ -6,5 +6,5 @@ core.gui

 .. toctree::
    :maxdepth: 2
-    
+
    deletion_options
--- a/help/en/developer/core/index.rst
+++ b/help/en/developer/core/index.rst
@ -3,7 +3,7 @@ core

 .. toctree::
    :maxdepth: 2
-    
+
    app
    fs
    engine
--- a/help/en/developer/hscommon/gui/base.rst
+++ b/help/en/developer/hscommon/gui/base.rst
@ -4,9 +4,9 @@ hscommon.gui.base
 .. automodule:: hscommon.gui.base

    .. autosummary::
-        
+
        GUIObject
-    
+
    .. autoclass:: GUIObject
        :members:
        :private-members:
--- a/help/en/developer/hscommon/gui/column.rst
+++ b/help/en/developer/hscommon/gui/column.rst
@ -4,22 +4,22 @@ hscommon.gui.column
 .. automodule:: hscommon.gui.column

    .. autosummary::
-        
+
        Columns
        Column
        ColumnsView
        PrefAccessInterface
-    
+
    .. autoclass:: Columns
        :members:
        :private-members:
-    
+
    .. autoclass:: Column
        :members:
        :private-members:
-    
+
    .. autoclass:: ColumnsView
        :members:
-    
+
    .. autoclass:: PrefAccessInterface
        :members:
--- a/help/en/developer/hscommon/gui/progress_window.rst
+++ b/help/en/developer/hscommon/gui/progress_window.rst
@ -4,15 +4,14 @@ hscommon.gui.progress_window
 .. automodule:: hscommon.gui.progress_window

    .. autosummary::
-        
+
        ProgressWindow
        ProgressWindowView
-    
+
    .. autoclass:: ProgressWindow
        :members:
        :private-members:
-    
+
    .. autoclass:: ProgressWindowView
        :members:
        :private-members:
-    
--- a/help/en/developer/hscommon/gui/selectable_list.rst
+++ b/help/en/developer/hscommon/gui/selectable_list.rst
@ -4,23 +4,23 @@ hscommon.gui.selectable_list
 .. automodule:: hscommon.gui.selectable_list

    .. autosummary::
-        
+
        Selectable
        SelectableList
        GUISelectableList
        GUISelectableListView
-    
+
    .. autoclass:: Selectable
        :members:
        :private-members:
-    
+
    .. autoclass:: SelectableList
        :members:
        :private-members:
-    
+
    .. autoclass:: GUISelectableList
        :members:
        :private-members:
-    
+
    .. autoclass:: GUISelectableListView
        :members:
--- a/help/en/developer/hscommon/gui/table.rst
+++ b/help/en/developer/hscommon/gui/table.rst
@ -2,18 +2,18 @@ hscommon.gui.table
 ==================

 .. automodule:: hscommon.gui.table
-    
+
    .. autosummary::
-        
+
        Table
        Row
        GUITable
        GUITableView
-    
+
    .. autoclass:: Table
        :members:
        :private-members:
-    
+
    .. autoclass:: Row
        :members:
        :private-members:
@ -21,6 +21,6 @@ hscommon.gui.table
    .. autoclass:: GUITable
        :members:
        :private-members:
-    
+
    .. autoclass:: GUITableView
        :members:
--- a/help/en/developer/hscommon/gui/text_field.rst
+++ b/help/en/developer/hscommon/gui/text_field.rst
@ -4,10 +4,10 @@ hscommon.gui.text_field
 .. automodule:: hscommon.gui.text_field

    .. autosummary::
-        
+
        TextField
        TextFieldView
-    
+
    .. autoclass:: TextField
        :members:
        :private-members:
--- a/help/en/developer/hscommon/gui/tree.rst
+++ b/help/en/developer/hscommon/gui/tree.rst
@ -2,17 +2,16 @@ hscommon.gui.tree
 =================

 .. automodule:: hscommon.gui.tree
-    
+
    .. autosummary::
-        
+
        Tree
        Node
-    
+
    .. autoclass:: Tree
        :members:
        :private-members:
-    
+
    .. autoclass:: Node
        :members:
        :private-members:
-
--- a/help/en/developer/hscommon/index.rst
+++ b/help/en/developer/hscommon/index.rst
@ -4,7 +4,7 @@ hscommon
 .. toctree::
    :maxdepth: 2
    :glob:
-    
+
    build
    conflict
    desktop
@ -13,4 +13,3 @@ hscommon
    util
    jobprogress/*
    gui/*
-
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`sonar.python.version=3.7, 3.8, 3.9, 3.10, 3.11`